From 1acdd5c0f58b119f6aed3712322474de959a9ea2 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 11 Jan 2023 14:59:45 +0100 Subject: [PATCH 01/77] Implement Trainer & TrainingArguments w. tests Note: This commit involves three deprecations: the SetFitTrainer class (and DistilledSetFitTrainer), additional arguments to Trainer.train and the keep_body_frozen argument to Trainer.unfreeze. The first and last of these are 'graceful', i.e. old code will still work, but the Trainer.train changes are breaking in some situations. For example, num_epochs can no longer be passed to Trainer.train. The new 'deprecated' test files are identical to the old test files. The goal here is to test whether old behaviour is still possible. For the most part it is, with the exception of using Trainer.train with extra arguments. As a result, I skipped two tests in test_deprecated_trainer.py. Also note that docstrings have yet to be updated! --- src/setfit/__init__.py | 4 +- src/setfit/modeling.py | 80 ++-- src/setfit/trainer.py | 311 +++++++-------- src/setfit/trainer_distillation.py | 276 ++++++-------- src/setfit/training_args.py | 68 ++++ tests/test_deprecated_trainer.py | 359 ++++++++++++++++++ tests/test_deprecated_trainer_distillation.py | 102 +++++ tests/test_trainer.py | 96 ++--- tests/test_trainer_distillation.py | 25 +- tests/test_training_args.py | 15 + 10 files changed, 901 insertions(+), 435 deletions(-) create mode 100644 src/setfit/training_args.py create mode 100644 tests/test_deprecated_trainer.py create mode 100644 tests/test_deprecated_trainer_distillation.py create mode 100644 tests/test_training_args.py diff --git a/src/setfit/__init__.py b/src/setfit/__init__.py index 9c9665fc..37db149b 100644 --- a/src/setfit/__init__.py +++ b/src/setfit/__init__.py @@ -2,5 +2,5 @@ from .data import add_templated_examples, sample_dataset from .modeling import SetFitHead, SetFitModel -from .trainer import SetFitTrainer -from .trainer_distillation import DistillationSetFitTrainer 
+from .trainer import SetFitTrainer, Trainer +from .trainer_distillation import DistillationSetFitTrainer, DistillationTrainer diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index 5abcf91d..0fe5abcf 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -1,7 +1,9 @@ import os from dataclasses import dataclass from pathlib import Path +import time from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +import warnings # Google Colab runs on Python 3.7, so we need this to be compatible @@ -14,14 +16,14 @@ import numpy as np import requests import torch -import torch.nn as nn +from torch import nn from huggingface_hub import PyTorchModelHubMixin, hf_hub_download from sentence_transformers import InputExample, SentenceTransformer, models from sklearn.linear_model import LogisticRegression from sklearn.multiclass import OneVsRestClassifier from sklearn.multioutput import ClassifierChain, MultiOutputClassifier from torch.utils.data import DataLoader -from tqdm.auto import tqdm +from tqdm.auto import trange, tqdm from . 
import logging from .data import SetFitDataset @@ -208,7 +210,7 @@ def predict(self, x_test: torch.Tensor) -> torch.Tensor: return out - def get_loss_fn(self): + def get_loss_fn(self) -> nn.Module: return torch.nn.CrossEntropyLoss() @property @@ -232,9 +234,9 @@ def get_config_dict(self) -> Dict[str, Optional[Union[int, float, bool]]]: @staticmethod def _init_weight(module): if isinstance(module, nn.Linear): - torch.nn.init.xavier_uniform_(module.weight) + nn.init.xavier_uniform_(module.weight) if module.bias is not None: - torch.nn.init.constant_(module.bias, 1e-2) + nn.init.constant_(module.bias, 1e-2) def __repr__(self): return "SetFitHead({})".format(self.get_config_dict()) @@ -270,25 +272,29 @@ def fit( self, x_train: List[str], y_train: List[int], - num_epochs: int, - batch_size: Optional[int] = None, - learning_rate: Optional[float] = None, - body_learning_rate: Optional[float] = None, + classifier_num_epochs: int, + classifier_batch_size: Optional[int] = None, + classifier_learning_rate: Optional[Tuple[float, float]] = (None, None), l2_weight: Optional[float] = None, max_length: Optional[int] = None, - show_progress_bar: Optional[bool] = None, + show_progress_bar: bool = True, + end_to_end: bool = False, + **kwargs ) -> None: if self.has_differentiable_head: # train with pyTorch device = self.model_body.device self.model_body.train() self.model_head.train() + if not end_to_end: + self.freeze("body") - dataloader = self._prepare_dataloader(x_train, y_train, batch_size, max_length) + dataloader = self._prepare_dataloader(x_train, y_train, classifier_batch_size, max_length) criterion = self.model_head.get_loss_fn() - optimizer = self._prepare_optimizer(learning_rate, body_learning_rate, l2_weight) + embedding_learning_rate, classifier_learning_rate = classifier_learning_rate + optimizer = self._prepare_optimizer(classifier_learning_rate, embedding_learning_rate, l2_weight) scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5) - for 
epoch_idx in tqdm(range(num_epochs), desc="Epoch", disable=not show_progress_bar): - for batch in dataloader: + for epoch_idx in trange(classifier_num_epochs, desc="Epoch", disable=not show_progress_bar): + for batch in tqdm(dataloader, desc="Iteration", disable=not show_progress_bar, leave=False): features, labels = batch optimizer.zero_grad() @@ -298,15 +304,18 @@ def fit( outputs = self.model_body(features) if self.normalize_embeddings: - outputs = torch.nn.functional.normalize(outputs, p=2, dim=1) + outputs = nn.functional.normalize(outputs, p=2, dim=1) outputs = self.model_head(outputs) logits = outputs["logits"] - loss = criterion(logits, labels) + loss: torch.Tensor = criterion(logits, labels) loss.backward() optimizer.step() scheduler.step() + + if not end_to_end: + self.unfreeze("body") else: # train with sklearn embeddings = self.model_body.encode(x_train, normalize_embeddings=self.normalize_embeddings) self.model_head.fit(embeddings, y_train) @@ -349,16 +358,16 @@ def _prepare_dataloader( def _prepare_optimizer( self, - learning_rate: float, - body_learning_rate: Optional[float], + classifier_learning_rate: float, + embedding_learning_rate: Optional[float], l2_weight: float, ) -> torch.optim.Optimizer: - body_learning_rate = body_learning_rate or learning_rate + embedding_learning_rate = embedding_learning_rate or classifier_learning_rate l2_weight = l2_weight or self.l2_weight optimizer = torch.optim.AdamW( [ - {"params": self.model_body.parameters(), "lr": body_learning_rate, "weight_decay": l2_weight}, - {"params": self.model_head.parameters(), "lr": learning_rate, "weight_decay": l2_weight}, + {"params": self.model_body.parameters(), "lr": embedding_learning_rate, "weight_decay": l2_weight}, + {"params": self.model_head.parameters(), "lr": classifier_learning_rate, "weight_decay": l2_weight}, ], ) @@ -368,25 +377,30 @@ def freeze(self, component: Optional[Literal["body", "head"]] = None) -> None: if component is None or component == "body": 
self._freeze_or_not(self.model_body, to_freeze=True) - if component is None or component == "head": + if (component is None or component == "head") and self.has_differentiable_head: self._freeze_or_not(self.model_head, to_freeze=True) - def unfreeze(self, component: Optional[Literal["body", "head"]] = None) -> None: + def unfreeze(self, component: Optional[Literal["body", "head"]] = None, keep_body_frozen: Optional[bool] = None) -> None: + if keep_body_frozen is not None: + warnings.warn("`keep_body_frozen` is deprecated. Please either pass \"head\", \"body\" or no arguments to unfreeze both.") + if component is None or component == "body": self._freeze_or_not(self.model_body, to_freeze=False) - if component is None or component == "head": + if (component is None or component == "head") and self.has_differentiable_head: self._freeze_or_not(self.model_head, to_freeze=False) - def _freeze_or_not(self, model: torch.nn.Module, to_freeze: bool) -> None: + def _freeze_or_not(self, model: nn.Module, to_freeze: bool) -> None: for param in model.parameters(): param.requires_grad = not to_freeze - def predict(self, x_test: List[str], as_numpy: bool = False) -> Union[torch.Tensor, "ndarray"]: - embeddings = self.model_body.encode( - x_test, normalize_embeddings=self.normalize_embeddings, convert_to_tensor=self.has_differentiable_head + def encode(self, inputs: List[str]) -> Union[torch.Tensor, "ndarray"]: + return self.model_body.encode( + inputs, normalize_embeddings=self.normalize_embeddings, convert_to_tensor=self.has_differentiable_head ) + def predict(self, inputs: List[str], as_numpy: bool = False) -> Union[torch.Tensor, "ndarray"]: + embeddings = self.encode(inputs) outputs = self.model_head.predict(embeddings) if as_numpy and self.has_differentiable_head: @@ -396,11 +410,8 @@ def predict(self, x_test: List[str], as_numpy: bool = False) -> Union[torch.Tens return outputs - def predict_proba(self, x_test: List[str], as_numpy: bool = False) -> Union[torch.Tensor, 
"ndarray"]: - embeddings = self.model_body.encode( - x_test, normalize_embeddings=self.normalize_embeddings, convert_to_tensor=self.has_differentiable_head - ) - + def predict_proba(self, inputs: List[str], as_numpy: bool = False) -> Union[torch.Tensor, "ndarray"]: + embeddings = self.encode(inputs) outputs = self.model_head.predict_proba(embeddings) if as_numpy and self.has_differentiable_head: @@ -419,6 +430,9 @@ def to(self, device: Union[str, torch.device]) -> "SetFitModel": Returns: SetFitModel: Returns the original model, but now on the desired device. """ + # Note that we must also set _target_device, or any SentenceTransformer.fit() call will reset + # the body location + self.model_body._target_device = device if isinstance(device, torch.device) else torch.device(device) self.model_body = self.model_body.to(device) if self.has_differentiable_head: diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 776c0f1d..f05c3e97 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -1,5 +1,6 @@ import math -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +import warnings import evaluate import numpy as np @@ -9,6 +10,8 @@ from torch.utils.data import DataLoader from transformers.trainer_utils import HPSearchBackend, default_compute_objective, number_of_arguments, set_seed +from setfit.training_args import TrainingArguments + from . import logging from .integrations import default_hp_search_backend, is_optuna_available, run_hp_search_optuna from .modeling import SupConLoss, sentence_pairs_generation, sentence_pairs_generation_multilabel @@ -25,7 +28,7 @@ logger = logging.get_logger(__name__) -class SetFitTrainer: +class Trainer: """Trainer to train a SetFit model. 
Args: @@ -78,44 +81,21 @@ class SetFitTrainer: def __init__( self, model: Optional["SetFitModel"] = None, + args: Optional[TrainingArguments] = None, train_dataset: Optional["Dataset"] = None, eval_dataset: Optional["Dataset"] = None, model_init: Optional[Callable[[], "SetFitModel"]] = None, metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy", loss_class=losses.CosineSimilarityLoss, - num_iterations: int = 20, - num_epochs: int = 1, - learning_rate: float = 2e-5, - batch_size: int = 16, - seed: int = 42, column_mapping: Optional[Dict[str, str]] = None, - use_amp: bool = False, - warmup_proportion: float = 0.1, - distance_metric: Callable = BatchHardTripletLossDistanceFunction.cosine_distance, - margin: float = 0.25, - samples_per_label: int = 2, ): - if (warmup_proportion < 0.0) or (warmup_proportion > 1.0): - raise ValueError( - f"warmup_proportion must be greater than or equal to 0.0 and less than or equal to 1.0! But it was: {warmup_proportion}" - ) - + self.args = args self.train_dataset = train_dataset self.eval_dataset = eval_dataset self.model_init = model_init self.metric = metric self.loss_class = loss_class - self.num_iterations = num_iterations - self.num_epochs = num_epochs - self.learning_rate = learning_rate - self.batch_size = batch_size - self.seed = seed self.column_mapping = column_mapping - self.use_amp = use_amp - self.warmup_proportion = warmup_proportion - self.distance_metric = distance_metric - self.margin = margin - self.samples_per_label = samples_per_label if model is None: if model_init is not None: @@ -127,6 +107,10 @@ def __init__( raise RuntimeError("`SetFitTrainer` requires either a `model` or `model_init` argument, but not both") self.model = model + # Adopt Trainer.(un)freeze from SetFitModel.(un)freeze + self.freeze = self.model.freeze + self.unfreeze = self.model.unfreeze + self.hp_search_backend = None self._freeze = True # If True, will train the body only; otherwise, train the body and head @@ 
-226,81 +210,18 @@ def call_model_init(self, params: Optional[Dict[str, Any]] = None): return model - def freeze(self): - """ - Freeze SetFitModel's differentiable head. - Note: call this function only when using the differentiable head. - """ - if not self.model.has_differentiable_head: - raise ValueError("Please use the differentiable head in `SetFitModel` when calling this function.") - - self._freeze = True # Currently use self._freeze as a switch - self.model.freeze("head") - - def unfreeze(self, keep_body_frozen: bool = False): - """ - Unfreeze SetFitModel's differentiable head. - Note: call this function only when using the differentiable head. - - Args: - keep_body_frozen (`bool`, *optional*, defaults to `False`): - Whether to freeze the body when unfreeze the head. - """ - if not self.model.has_differentiable_head: - raise ValueError("Please use the differentiable head in `SetFitModel` when calling this function.") - - self._freeze = False # Currently use self._freeze as a switch - self.model.unfreeze("head") - if keep_body_frozen: - self.model.freeze("body") - else: # ensure to unfreeze the body - self.model.unfreeze("body") - def train( - self, - num_epochs: Optional[int] = None, - batch_size: Optional[int] = None, - learning_rate: Optional[float] = None, - body_learning_rate: Optional[float] = None, - l2_weight: Optional[float] = None, - max_length: Optional[int] = None, - trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, - show_progress_bar: bool = True, + self, args: Optional[TrainingArguments] = None, trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None ): - """ - Main training entry point. + args = args or self.args or TrainingArguments() - Args: - num_epochs (`int`, *optional*): - Temporary change the number of epochs to train the Sentence Transformer body/head for. - If ignore, will use the value given in initialization. 
- batch_size (`int`, *optional*): - Temporary change the batch size to use for contrastive training or logistic regression. - If ignore, will use the value given in initialization. - learning_rate (`float`, *optional*): - Temporary change the learning rate to use for contrastive training or SetFitModel's head in logistic regression. - If ignore, will use the value given in initialization. - body_learning_rate (`float`, *optional*): - Temporary change the learning rate to use for SetFitModel's body in logistic regression only. - If ignore, will be the same as `learning_rate`. - l2_weight (`float`, *optional*): - Temporary change the weight of L2 regularization for SetFitModel's differentiable head in logistic regression. - max_length (int, *optional*, defaults to `None`): - The maximum number of tokens for one data sample. Currently only for training the differentiable head. - If `None`, will use the maximum number of tokens the model body can accept. - If `max_length` is greater than the maximum number of acceptable tokens the model body can accept, it will be set to the maximum number of acceptable tokens. - trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): - The trial run or the hyperparameter dictionary for hyperparameter search. - show_progress_bar (`bool`, *optional*, defaults to `True`): - Whether to show a bar that indicates training progress. - """ - set_seed(self.seed) # Seed must be set before instantiating the model when using model_init. + set_seed(args.seed) # Seed must be set before instantiating the model when using model_init. 
if trial: # Trial and model initialization self._hp_search_setup(trial) # sets trainer parameters and initializes model if self.train_dataset is None: - raise ValueError("Training requires a `train_dataset` given to the `SetFitTrainer` initialization.") + raise ValueError(f"Training requires a `train_dataset` given to the `{self.__class__.__name__}` initialization.") self._validate_column_mapping(self.train_dataset) train_dataset = self.train_dataset @@ -308,93 +229,88 @@ def train( logger.info("Applying column mapping to training dataset") train_dataset = self._apply_column_mapping(self.train_dataset, self.column_mapping) - x_train = train_dataset["text"] - y_train = train_dataset["label"] + x_train: List[str] = train_dataset["text"] + y_train: List[int] = train_dataset["label"] if self.loss_class is None: logger.warning("No `loss_class` detected! Using `CosineSimilarityLoss` as the default.") self.loss_class = losses.CosineSimilarityLoss - num_epochs = num_epochs or self.num_epochs - batch_size = batch_size or self.batch_size - learning_rate = learning_rate or self.learning_rate - - if not self.model.has_differentiable_head or self._freeze: - # sentence-transformers adaptation - if self.loss_class in [ - losses.BatchAllTripletLoss, - losses.BatchHardTripletLoss, - losses.BatchSemiHardTripletLoss, - losses.BatchHardSoftMarginTripletLoss, - SupConLoss, - ]: - train_examples = [InputExample(texts=[text], label=label) for text, label in zip(x_train, y_train)] - train_data_sampler = SentenceLabelDataset(train_examples, samples_per_label=self.samples_per_label) - - batch_size = min(batch_size, len(train_data_sampler)) - train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=True) - - if self.loss_class is losses.BatchHardSoftMarginTripletLoss: - train_loss = self.loss_class( - model=self.model.model_body, - distance_metric=self.distance_metric, + self.train_embeddings(x_train, y_train, args) + self.train_classifier(x_train, y_train, args) + 
+ def train_embeddings(self, x_train: List[str], y_train: List[int], args: Optional[TrainingArguments] = None): + args = args or self.args or TrainingArguments() + + # sentence-transformers adaptation + if self.loss_class in [ + losses.BatchAllTripletLoss, + losses.BatchHardTripletLoss, + losses.BatchSemiHardTripletLoss, + losses.BatchHardSoftMarginTripletLoss, + SupConLoss, + ]: + train_examples = [InputExample(texts=[text], label=label) for text, label in zip(x_train, y_train)] + train_data_sampler = SentenceLabelDataset(train_examples, samples_per_label=args.samples_per_label) + + batch_size = min(args.embedding_batch_size, len(train_data_sampler)) + train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=True) + + if self.loss_class is losses.BatchHardSoftMarginTripletLoss: + train_loss = self.loss_class( + model=self.model.model_body, + distance_metric=args.distance_metric, + ) + elif self.loss_class is SupConLoss: + train_loss = self.loss_class(model=self.model.model_body) + else: + train_loss = self.loss_class( + model=self.model.model_body, + distance_metric=args.distance_metric, + margin=args.margin, + ) + + train_steps = len(train_dataloader) * args.embedding_num_epochs + else: + train_examples = [] + + for _ in range(args.num_iterations): + if self.model.multi_target_strategy is not None: + train_examples = sentence_pairs_generation_multilabel( + np.array(x_train), np.array(y_train), train_examples ) - elif self.loss_class is SupConLoss: - train_loss = self.loss_class(model=self.model.model_body) else: - train_loss = self.loss_class( - model=self.model.model_body, - distance_metric=self.distance_metric, - margin=self.margin, - ) + train_examples = sentence_pairs_generation(np.array(x_train), np.array(y_train), train_examples) + + batch_size = args.embedding_batch_size + train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size) + train_loss = self.loss_class(self.model.model_body) + train_steps = 
len(train_dataloader) * args.embedding_num_epochs + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_examples)}") + logger.info(f" Num epochs = {args.embedding_num_epochs}") + logger.info(f" Total optimization steps = {train_steps}") + logger.info(f" Total train batch size = {batch_size}") + + warmup_steps = math.ceil(train_steps * args.warmup_proportion) + self.model.model_body.fit( + train_objectives=[(train_dataloader, train_loss)], + epochs=args.embedding_num_epochs, + steps_per_epoch=train_steps, + optimizer_params={"lr": args.embedding_learning_rate}, + warmup_steps=warmup_steps, + show_progress_bar=args.show_progress_bar, + use_amp=args.use_amp, + ) - train_steps = len(train_dataloader) * self.num_epochs - else: - train_examples = [] - - for _ in range(self.num_iterations): - if self.model.multi_target_strategy is not None: - train_examples = sentence_pairs_generation_multilabel( - np.array(x_train), np.array(y_train), train_examples - ) - else: - train_examples = sentence_pairs_generation( - np.array(x_train), np.array(y_train), train_examples - ) - - train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size) - train_loss = self.loss_class(self.model.model_body) - train_steps = len(train_dataloader) * num_epochs - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_examples)}") - logger.info(f" Num epochs = {num_epochs}") - logger.info(f" Total optimization steps = {train_steps}") - logger.info(f" Total train batch size = {batch_size}") - - warmup_steps = math.ceil(train_steps * self.warmup_proportion) - self.model.model_body.fit( - train_objectives=[(train_dataloader, train_loss)], - epochs=num_epochs, - steps_per_epoch=train_steps, - optimizer_params={"lr": learning_rate}, - warmup_steps=warmup_steps, - show_progress_bar=show_progress_bar, - use_amp=self.use_amp, - ) + def train_classifier(self, x_train: List[str], y_train: List[int], args: 
Optional[TrainingArguments] = None): + args = args or self.args or TrainingArguments() - if not self.model.has_differentiable_head or not self._freeze: - # Train the final classifier - self.model.fit( - x_train, - y_train, - num_epochs=num_epochs, - batch_size=batch_size, - learning_rate=learning_rate, - body_learning_rate=body_learning_rate, - l2_weight=l2_weight, - max_length=max_length, - show_progress_bar=True, - ) + self.model.fit( + x_train, + y_train, + **args.to_dict(), + ) def evaluate(self): """ @@ -533,3 +449,52 @@ def push_to_hub( config, skip_lfs_files, ) + + +class SetFitTrainer(Trainer): + def __init__( + self, + model: Optional["SetFitModel"] = None, + train_dataset: Optional["Dataset"] = None, + eval_dataset: Optional["Dataset"] = None, + model_init: Optional[Callable[[], "SetFitModel"]] = None, + metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy", + loss_class=losses.CosineSimilarityLoss, + num_iterations: int = 20, + num_epochs: int = 1, + learning_rate: float = 2e-5, + batch_size: int = 16, + seed: int = 42, + column_mapping: Optional[Dict[str, str]] = None, + use_amp: bool = False, + warmup_proportion: float = 0.1, + distance_metric: Callable = BatchHardTripletLossDistanceFunction.cosine_distance, + margin: float = 0.25, + samples_per_label: int = 2, + ): + warnings.warn( + "`SetFitTrainer` has been deprecated. 
Please use `from setfit import Trainer` instead.", DeprecationWarning + ) + args = TrainingArguments( + num_iterations=num_iterations, + num_epochs=num_epochs, + classifier_learning_rate=learning_rate, + embedding_learning_rate=learning_rate, + batch_size=batch_size, + seed=seed, + use_amp=use_amp, + warmup_proportion=warmup_proportion, + distance_metric=distance_metric, + margin=margin, + samples_per_label=samples_per_label, + ) + super().__init__( + model=model, + args=args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + model_init=model_init, + metric=metric, + loss_class=loss_class, + column_mapping=column_mapping, + ) diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index 2546f7ea..9bf53888 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -1,5 +1,5 @@ import math -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union import numpy as np import torch @@ -9,8 +9,11 @@ from torch.utils.data import DataLoader from transformers.trainer_utils import set_seed -from . import SetFitTrainer, logging +from setfit.training_args import TrainingArguments + +from . import logging from .modeling import SupConLoss, sentence_pairs_generation_cos_sim +from .trainer import Trainer if TYPE_CHECKING: @@ -23,7 +26,7 @@ logger = logging.get_logger(__name__) -class DistillationSetFitTrainer(SetFitTrainer): +class DistillationTrainer(Trainer): """Trainer to compress a SetFit model with knowledge distillation. 
Args: @@ -67,177 +70,140 @@ def __init__( self, teacher_model: "SetFitModel", student_model: Optional["SetFitModel"] = None, + args: TrainingArguments = None, train_dataset: Optional["Dataset"] = None, eval_dataset: Optional["Dataset"] = None, model_init: Optional[Callable[[], "SetFitModel"]] = None, metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy", loss_class: torch.nn.Module = losses.CosineSimilarityLoss, - num_iterations: int = 20, - num_epochs: int = 1, - learning_rate: float = 2e-5, - batch_size: int = 16, - seed: int = 42, column_mapping: Optional[Dict[str, str]] = None, - use_amp: bool = False, - warmup_proportion: float = 0.1, ) -> None: - super(DistillationSetFitTrainer, self).__init__( + super().__init__( model=student_model, + args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, model_init=model_init, metric=metric, loss_class=loss_class, - num_iterations=num_iterations, - num_epochs=num_epochs, - learning_rate=learning_rate, - batch_size=batch_size, - seed=seed, column_mapping=column_mapping, - use_amp=use_amp, - warmup_proportion=warmup_proportion, ) self.teacher_model = teacher_model self.student_model = self.model - def train( + def train_embeddings( self, - num_epochs: Optional[int] = None, - batch_size: Optional[int] = None, - learning_rate: Optional[float] = None, - body_learning_rate: Optional[float] = None, - l2_weight: Optional[float] = None, - trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, - show_progress_bar: bool = True, + x_train: List[str], + y_train: List[int], + args: Optional[TrainingArguments] = None, ): - """ - Main training entry point. - - Args: - num_epochs (`int`, *optional*): - Temporary change the number of epochs to train the Sentence Transformer body/head for. - If ignore, will use the value given in initialization. - batch_size (`int`, *optional*): - Temporary change the batch size to use for contrastive training or logistic regression. 
- If ignore, will use the value given in initialization. - learning_rate (`float`, *optional*): - Temporary change the learning rate to use for contrastive training or SetFitModel's head in logistic regression. - If ignore, will use the value given in initialization. - body_learning_rate (`float`, *optional*): - Temporary change the learning rate to use for SetFitModel's body in logistic regression only. - If ignore, will be the same as `learning_rate`. - l2_weight (`float`, *optional*): - Temporary change the weight of L2 regularization for SetFitModel's differentiable head in logistic regression. - trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): - The trial run or the hyperparameter dictionary for hyperparameter search. - show_progress_bar (`bool`, *optional*, defaults to `True`): - Whether to show a bar that indicates training progress. - """ - set_seed(self.seed) # Seed must be set before instantiating the model when using model_init. - - if trial: # Trial and model initialization - self._hp_search_setup(trial) # sets trainer parameters and initializes model - - if self.train_dataset is None: - raise ValueError( - "Training requires a `train_dataset` given to the `DistillationSetFitTrainer` initialization." - ) - - self._validate_column_mapping(self.train_dataset) - train_dataset = self.train_dataset - if self.column_mapping is not None: - logger.info("Applying column mapping to training dataset") - train_dataset = self._apply_column_mapping(self.train_dataset, self.column_mapping) - - x_train = train_dataset["text"] - y_train = train_dataset["label"] - if self.loss_class is None: - logger.warning("No `loss_class` detected! 
Using `CosineSimilarityLoss` as the default.") - self.loss_class = losses.CosineSimilarityLoss - - num_epochs = num_epochs or self.num_epochs - batch_size = batch_size or self.batch_size - learning_rate = learning_rate or self.learning_rate - - if not self.student_model.has_differentiable_head or self._freeze: - # sentence-transformers adaptation - if self.loss_class in [ - losses.BatchAllTripletLoss, - losses.BatchHardTripletLoss, - losses.BatchSemiHardTripletLoss, - losses.BatchHardSoftMarginTripletLoss, - SupConLoss, - ]: - train_examples = [InputExample(texts=[text], label=label) for text, label in zip(x_train, y_train)] - train_data_sampler = SentenceLabelDataset(train_examples) - - batch_size = min(batch_size, len(train_data_sampler)) - train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=True) - - if self.loss_class is losses.BatchHardSoftMarginTripletLoss: - train_loss = self.loss_class( - model=self.student_model, - distance_metric=BatchHardTripletLossDistanceFunction.cosine_distance, - ) - elif self.loss_class is SupConLoss: - train_loss = self.loss_class(model=self.student_model) - else: - - train_loss = self.loss_class( - model=self.student_model, - distance_metric=BatchHardTripletLossDistanceFunction.cosine_distance, - margin=0.25, - ) - - train_steps = len(train_dataloader) * self.num_epochs + args = args or self.args or TrainingArguments() + + # sentence-transformers adaptation + if self.loss_class in [ + losses.BatchAllTripletLoss, + losses.BatchHardTripletLoss, + losses.BatchSemiHardTripletLoss, + losses.BatchHardSoftMarginTripletLoss, + SupConLoss, + ]: + train_examples = [InputExample(texts=[text], label=label) for text, label in zip(x_train, y_train)] + train_data_sampler = SentenceLabelDataset(train_examples) + + batch_size = min(args.embedding_batch_size, len(train_data_sampler)) + train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=True) + + if self.loss_class is 
losses.BatchHardSoftMarginTripletLoss: + train_loss = self.loss_class( + model=self.student_model.model_body, + distance_metric=args.distance_metric, + ) + elif self.loss_class is SupConLoss: + train_loss = self.loss_class(model=self.student_model) else: - train_examples = [] - - # **************** student training **************** - x_train_embd_student = self.teacher_model.model_body.encode(x_train) - y_train = self.teacher_model.model_head.predict(x_train_embd_student) - - cos_sim_matrix = util.cos_sim(x_train_embd_student, x_train_embd_student) - - train_examples = [] - for _ in range(self.num_iterations): - train_examples = sentence_pairs_generation_cos_sim( - np.array(x_train), train_examples, cos_sim_matrix - ) - - # **************** student training END **************** - - train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size) - train_loss = self.loss_class(self.student_model.model_body) - train_steps = len(train_dataloader) * num_epochs - - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_examples)}") - logger.info(f" Num epochs = {num_epochs}") - logger.info(f" Total optimization steps = {train_steps}") - logger.info(f" Total train batch size = {batch_size}") - - warmup_steps = math.ceil(train_steps * self.warmup_proportion) - self.student_model.model_body.fit( - train_objectives=[(train_dataloader, train_loss)], - epochs=num_epochs, - steps_per_epoch=train_steps, - optimizer_params={"lr": learning_rate}, - warmup_steps=warmup_steps, - show_progress_bar=show_progress_bar, - use_amp=self.use_amp, - ) - - if not self.student_model.has_differentiable_head or not self._freeze: - # Train the final classifier - self.student_model.fit( - x_train, - y_train, - num_epochs=num_epochs, - batch_size=batch_size, - learning_rate=learning_rate, - body_learning_rate=body_learning_rate, - l2_weight=l2_weight, - show_progress_bar=show_progress_bar, - ) + train_loss = self.loss_class( + 
model=self.student_model.model_body, + distance_metric=args.distance_metric, + margin=args.margin, + ) + + train_steps = len(train_dataloader) * args.embedding_num_epochs + else: + train_examples = [] + + # **************** student training ********************* + # Only this snippet differs from Trainer.train_embeddings + x_train_embd_student = self.teacher_model.model_body.encode(x_train) + y_train = self.teacher_model.model_head.predict(x_train_embd_student) + + cos_sim_matrix = util.cos_sim(x_train_embd_student, x_train_embd_student) + + train_examples = [] + for _ in range(args.num_iterations): + train_examples = sentence_pairs_generation_cos_sim(np.array(x_train), train_examples, cos_sim_matrix) + # **************** student training END ***************** + + batch_size = args.embedding_batch_size + train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size) + train_loss = self.loss_class(self.student_model.model_body) + train_steps = len(train_dataloader) * args.embedding_num_epochs + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_examples)}") + logger.info(f" Num epochs = {args.embedding_num_epochs}") + logger.info(f" Total optimization steps = {train_steps}") + logger.info(f" Total train batch size = {batch_size}") + + warmup_steps = math.ceil(train_steps * args.warmup_proportion) + self.student_model.model_body.fit( + train_objectives=[(train_dataloader, train_loss)], + epochs=args.embedding_num_epochs, + steps_per_epoch=train_steps, + optimizer_params={"lr": args.embedding_learning_rate}, + warmup_steps=warmup_steps, + show_progress_bar=args.show_progress_bar, + use_amp=args.use_amp, + ) + + +class DistillationSetFitTrainer(DistillationTrainer): + def __init__( + self, + teacher_model: "SetFitModel", + student_model: Optional["SetFitModel"] = None, + train_dataset: Optional["Dataset"] = None, + eval_dataset: Optional["Dataset"] = None, + model_init: Optional[Callable[[], "SetFitModel"]] = 
None, + metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy", + loss_class: torch.nn.Module = losses.CosineSimilarityLoss, + num_iterations: int = 20, + num_epochs: int = 1, + learning_rate: float = 2e-5, + batch_size: int = 16, + seed: int = 42, + column_mapping: Optional[Dict[str, str]] = None, + use_amp: bool = False, + warmup_proportion: float = 0.1, + ): + args = TrainingArguments( + num_iterations=num_iterations, + num_epochs=num_epochs, + embedding_learning_rate=learning_rate, + classifier_learning_rate=learning_rate, + batch_size=batch_size, + seed=seed, + use_amp=use_amp, + warmup_proportion=warmup_proportion, + ) + super().__init__( + teacher_model=teacher_model, + student_model=student_model, + args=args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + model_init=model_init, + metric=metric, + loss_class=loss_class, + column_mapping=column_mapping, + ) diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py new file mode 100644 index 00000000..5a796d7a --- /dev/null +++ b/src/setfit/training_args.py @@ -0,0 +1,68 @@ +from copy import copy +from dataclasses import dataclass, fields, field +from typing import Callable, Tuple, Union +from sentence_transformers.losses.BatchHardTripletLoss import BatchHardTripletLossDistanceFunction + + +@dataclass +class TrainingArguments: + + # batch_size is only used to conveniently set `embedding_batch_size` and `classifier_batch_size` + # which are used in practice + batch_size: Union[int, Tuple[int, int]] = field(default=(16, 2), repr=False) + embedding_batch_size: int = None + classifier_batch_size: int = None + + # num_epochs is only used to conveniently set `embedding_num_epochs` and `classifier_num_epochs` + # which are used in practice + num_epochs: Union[int, Tuple[int, int]] = field(default=(1, 16), repr=False) + embedding_num_epochs: int = None + classifier_num_epochs: int = None + + num_iterations: int = 20 + + embedding_learning_rate: float = 2e-5 + 
classifier_learning_rate: Union[float, Tuple[float, float]] = (1e-5, 1e-2) + + seed: int = 42 + use_amp: bool = False + warmup_proportion: float = 0.1 + distance_metric: Callable = BatchHardTripletLossDistanceFunction.cosine_distance + margin: float = 0.25 + samples_per_label: int = 2 + show_progress_bar: bool = True + + l2_weight: float = None + max_length: int = None + + end_to_end: bool = False + + def __post_init__(self): + if isinstance(self.batch_size, int): + self.batch_size = (self.batch_size, self.batch_size) + if self.embedding_batch_size is None: + self.embedding_batch_size = self.batch_size[0] + if self.classifier_batch_size is None: + self.classifier_batch_size = self.batch_size[1] + + if isinstance(self.num_epochs, int): + self.num_epochs = (self.num_epochs, self.num_epochs) + if self.embedding_num_epochs is None: + self.embedding_num_epochs = self.num_epochs[0] + if self.classifier_num_epochs is None: + self.classifier_num_epochs = self.num_epochs[1] + + if isinstance(self.classifier_learning_rate, float): + self.classifier_learning_rate = (self.embedding_learning_rate, self.classifier_learning_rate) + + if self.warmup_proportion < 0.0 or self.warmup_proportion > 1.0: + raise ValueError( + f"warmup_proportion must be greater than or equal to 0.0 and less than or equal to 1.0! 
But it was: {self.warmup_proportion}" + ) + + def to_dict(self): + # filter out fields that are defined as field(init=False) + return {field.name: getattr(self, field.name) for field in fields(self) if field.init} + + def copy(self): + return copy(self) \ No newline at end of file diff --git a/tests/test_deprecated_trainer.py b/tests/test_deprecated_trainer.py new file mode 100644 index 00000000..467b77e0 --- /dev/null +++ b/tests/test_deprecated_trainer.py @@ -0,0 +1,359 @@ +from unittest import TestCase + +import evaluate +import pytest +from datasets import Dataset +from sentence_transformers import losses +from transformers.testing_utils import require_optuna +from transformers.utils.hp_naming import TrialShortNamer + +from setfit import logging +from setfit.modeling import SetFitModel, SupConLoss +from setfit.trainer import SetFitTrainer +from setfit.utils import BestRun + + +logging.set_verbosity_warning() +logging.enable_propagation() + + +class SetFitTrainerTest(TestCase): + def setUp(self): + self.model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") + self.num_iterations = 1 + + def test_trainer_works_with_model_init(self): + def get_model(): + model_name = "sentence-transformers/paraphrase-albert-small-v2" + return SetFitModel.from_pretrained(model_name) + + dataset = Dataset.from_dict( + {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} + ) + trainer = SetFitTrainer( + model_init=get_model, + train_dataset=dataset, + eval_dataset=dataset, + num_iterations=self.num_iterations, + column_mapping={"text_new": "text", "label_new": "label"}, + ) + trainer.train() + metrics = trainer.evaluate() + self.assertEqual(metrics["accuracy"], 1.0) + + def test_trainer_works_with_column_mapping(self): + dataset = Dataset.from_dict( + {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} + ) + trainer = SetFitTrainer( + model=self.model, + train_dataset=dataset, + 
eval_dataset=dataset, + num_iterations=self.num_iterations, + column_mapping={"text_new": "text", "label_new": "label"}, + ) + trainer.train() + metrics = trainer.evaluate() + self.assertEqual(metrics["accuracy"], 1.0) + + def test_trainer_works_with_default_columns(self): + dataset = Dataset.from_dict({"text": ["a", "b", "c"], "label": [0, 1, 2], "extra_column": ["d", "e", "f"]}) + trainer = SetFitTrainer( + model=self.model, train_dataset=dataset, eval_dataset=dataset, num_iterations=self.num_iterations + ) + trainer.train() + metrics = trainer.evaluate() + self.assertEqual(metrics["accuracy"], 1.0) + + def test_trainer_raises_error_with_missing_label(self): + dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) + trainer = SetFitTrainer( + model=self.model, train_dataset=dataset, eval_dataset=dataset, num_iterations=self.num_iterations + ) + with pytest.raises(ValueError): + trainer.train() + + def test_trainer_raises_error_with_missing_text(self): + dataset = Dataset.from_dict({"label": [0, 1, 2], "extra_column": ["d", "e", "f"]}) + trainer = SetFitTrainer( + model=self.model, train_dataset=dataset, eval_dataset=dataset, num_iterations=self.num_iterations + ) + with pytest.raises(ValueError): + trainer.train() + + def test_column_mapping_with_missing_text(self): + dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) + trainer = SetFitTrainer( + model=self.model, + train_dataset=dataset, + eval_dataset=dataset, + num_iterations=self.num_iterations, + column_mapping={"label_new": "label"}, + ) + with pytest.raises(ValueError): + trainer._validate_column_mapping(trainer.train_dataset) + + def test_column_mapping_multilabel(self): + dataset = Dataset.from_dict({"text_new": ["a", "b", "c"], "label_new": [[0, 1], [1, 2], [2, 0]]}) + + trainer = SetFitTrainer( + model=self.model, + train_dataset=dataset, + eval_dataset=dataset, + num_iterations=self.num_iterations, + column_mapping={"text_new": 
"text", "label_new": "label"}, + ) + + trainer._validate_column_mapping(trainer.train_dataset) + formatted_dataset = trainer._apply_column_mapping(trainer.train_dataset, trainer.column_mapping) + + assert formatted_dataset.column_names == ["text", "label"] + + assert formatted_dataset[0]["text"] == "a" + assert formatted_dataset[0]["label"] == [0, 1] + + assert formatted_dataset[1]["text"] == "b" + + def test_trainer_support_callable_as_metric(self): + dataset = Dataset.from_dict( + {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} + ) + + f1_metric = evaluate.load("f1") + accuracy_metric = evaluate.load("accuracy") + + def compute_metrics(y_pred, y_test): + return { + "f1": f1_metric.compute(predictions=y_pred, references=y_test, average="micro")["f1"], + "accuracy": accuracy_metric.compute(predictions=y_pred, references=y_test)["accuracy"], + } + + trainer = SetFitTrainer( + model=self.model, + train_dataset=dataset, + eval_dataset=dataset, + metric=compute_metrics, + num_iterations=self.num_iterations, + column_mapping={"text_new": "text", "label_new": "label"}, + ) + + trainer.train() + metrics = trainer.evaluate() + + self.assertEqual( + { + "f1": 1.0, + "accuracy": 1.0, + }, + metrics, + ) + + def test_raise_when_metric_value_is_invalid(self): + dataset = Dataset.from_dict( + {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} + ) + + trainer = SetFitTrainer( + model=self.model, + train_dataset=dataset, + eval_dataset=dataset, + metric="this-metric-does-not-exist", # invalid metric value + num_iterations=self.num_iterations, + column_mapping={"text_new": "text", "label_new": "label"}, + ) + + trainer.train() + + with self.assertRaises(FileNotFoundError): + trainer.evaluate() + + def test_trainer_raises_error_with_wrong_warmup_proportion(self): + # warmup_proportion must not be > 1.0 + with pytest.raises(ValueError): + SetFitTrainer(warmup_proportion=1.1) + + # warmup_proportion must not 
be < 0.0 + with pytest.raises(ValueError): + SetFitTrainer(warmup_proportion=-0.1) + + +class SetFitTrainerDifferentiableHeadTest(TestCase): + def setUp(self): + self.dataset = Dataset.from_dict( + {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} + ) + self.model = SetFitModel.from_pretrained( + "sentence-transformers/paraphrase-albert-small-v2", + use_differentiable_head=True, + head_params={"out_features": 3}, + ) + self.num_iterations = 1 + + @pytest.mark.skip(reason="The `trainer.train` argument removals were a hard deprecation, so this test would throw an error.") + def test_trainer_max_length_exceeds_max_acceptable_length(self): + trainer = SetFitTrainer( + model=self.model, + train_dataset=self.dataset, + eval_dataset=self.dataset, + num_iterations=self.num_iterations, + column_mapping={"text_new": "text", "label_new": "label"}, + ) + trainer.unfreeze(keep_body_frozen=True) + with self.assertLogs(level=logging.WARNING) as cm: + max_length = 4096 + max_acceptable_length = self.model.model_body.get_max_seq_length() + trainer.train( + num_epochs=1, + batch_size=3, + learning_rate=1e-2, + l2_weight=0.0, + max_length=max_length, + ) + self.assertEqual( + cm.output, + [ + ( + f"WARNING:setfit.modeling:The specified `max_length`: {max_length} is greater than the maximum length " + f"of the current model body: {max_acceptable_length}. Using {max_acceptable_length} instead." 
+ ) + ], + ) + + @pytest.mark.skip(reason="The `trainer.train` argument removals were a hard deprecation, so this test would throw an error.") + def test_trainer_max_length_is_smaller_than_max_acceptable_length(self): + trainer = SetFitTrainer( + model=self.model, + train_dataset=self.dataset, + eval_dataset=self.dataset, + num_iterations=self.num_iterations, + column_mapping={"text_new": "text", "label_new": "label"}, + ) + trainer.unfreeze(keep_body_frozen=True) + + # An alternative way of `assertNoLogs`, which is new in Python 3.10 + try: + with self.assertLogs(level=logging.WARNING) as cm: + max_length = 32 + trainer.train( + num_epochs=1, + batch_size=3, + learning_rate=1e-2, + l2_weight=0.0, + max_length=max_length, + ) + self.assertEqual(cm.output, []) + except AssertionError as e: + if e.args[0] != "no logs of level WARNING or higher triggered on root": + raise AssertionError(e) + + +class SetFitTrainerMultilabelTest(TestCase): + def setUp(self): + self.model = SetFitModel.from_pretrained( + "sentence-transformers/paraphrase-albert-small-v2", multi_target_strategy="one-vs-rest" + ) + self.num_iterations = 1 + + def test_trainer_multilabel_support_callable_as_metric(self): + dataset = Dataset.from_dict({"text_new": ["a", "b", "c"], "label_new": [[1, 0, 0], [0, 1, 0], [0, 0, 1]]}) + + multilabel_f1_metric = evaluate.load("f1", "multilabel") + multilabel_accuracy_metric = evaluate.load("accuracy", "multilabel") + + def compute_metrics(y_pred, y_test): + return { + "f1": multilabel_f1_metric.compute(predictions=y_pred, references=y_test, average="micro")["f1"], + "accuracy": multilabel_accuracy_metric.compute(predictions=y_pred, references=y_test)["accuracy"], + } + + trainer = SetFitTrainer( + model=self.model, + train_dataset=dataset, + eval_dataset=dataset, + metric=compute_metrics, + num_iterations=self.num_iterations, + column_mapping={"text_new": "text", "label_new": "label"}, + ) + + trainer.train() + metrics = trainer.evaluate() + + self.assertEqual( + 
{ + "f1": 1.0, + "accuracy": 1.0, + }, + metrics, + ) + + +@require_optuna +class TrainerHyperParameterOptunaIntegrationTest(TestCase): + def setUp(self): + self.dataset = Dataset.from_dict( + {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} + ) + self.num_iterations = 1 + + def test_hyperparameter_search(self): + class MyTrialShortNamer(TrialShortNamer): + DEFAULTS = {"max_iter": 100, "solver": "liblinear"} + + def hp_space(trial): + return { + "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True), + "batch_size": trial.suggest_categorical("batch_size", [4, 8, 16, 32, 64]), + "max_iter": trial.suggest_int("max_iter", 50, 300), + "solver": trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear"]), + } + + def model_init(params): + params = params or {} + max_iter = params.get("max_iter", 100) + solver = params.get("solver", "liblinear") + params = { + "head_params": { + "max_iter": max_iter, + "solver": solver, + } + } + return SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", **params) + + def hp_name(trial): + return MyTrialShortNamer.shortname(trial.params) + + trainer = SetFitTrainer( + train_dataset=self.dataset, + eval_dataset=self.dataset, + num_iterations=self.num_iterations, + model_init=model_init, + column_mapping={"text_new": "text", "label_new": "label"}, + ) + result = trainer.hyperparameter_search(direction="minimize", hp_space=hp_space, hp_name=hp_name, n_trials=4) + assert isinstance(result, BestRun) + assert result.hyperparameters.keys() == {"learning_rate", "batch_size", "max_iter", "solver"} + + +# regression test for https://github.com/huggingface/setfit/issues/153 +@pytest.mark.parametrize( + "loss_class", + [ + losses.BatchAllTripletLoss, + losses.BatchHardTripletLoss, + losses.BatchSemiHardTripletLoss, + losses.BatchHardSoftMarginTripletLoss, + SupConLoss, + ], +) +def test_trainer_works_with_non_default_loss_class(loss_class): + dataset 
= Dataset.from_dict({"text": ["a 1", "b 1", "c 1", "a 2", "b 2", "c 2"], "label": [0, 1, 2, 0, 1, 2]}) + model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") + trainer = SetFitTrainer( + model=model, + train_dataset=dataset, + eval_dataset=dataset, + num_iterations=1, + loss_class=loss_class, + ) + trainer.train() + # no asserts here because this is a regression test - we only test if an exception is raised diff --git a/tests/test_deprecated_trainer_distillation.py b/tests/test_deprecated_trainer_distillation.py new file mode 100644 index 00000000..4257a42e --- /dev/null +++ b/tests/test_deprecated_trainer_distillation.py @@ -0,0 +1,102 @@ +from unittest import TestCase + +import pytest +from datasets import Dataset +from sentence_transformers.losses import CosineSimilarityLoss + +from setfit import DistillationSetFitTrainer, SetFitTrainer +from setfit.modeling import SetFitModel + + +class DistillationSetFitTrainerTest(TestCase): + def setUp(self): + self.teacher_model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") + self.student_model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-MiniLM-L3-v2") + self.num_iterations = 1 + + def test_trainer_works_with_default_columns(self): + dataset = Dataset.from_dict({"text": ["a", "b", "c"], "label": [0, 1, 2], "extra_column": ["d", "e", "f"]}) + # train a teacher model + teacher_trainer = SetFitTrainer( + model=self.teacher_model, + train_dataset=dataset, + eval_dataset=dataset, + loss_class=CosineSimilarityLoss, + metric="accuracy", + ) + # Teacher Train and evaluate + teacher_trainer.train() + metrics = teacher_trainer.evaluate() + teacher_model = teacher_trainer.model + + student_trainer = DistillationSetFitTrainer( + teacher_model=teacher_model, + train_dataset=dataset, + student_model=self.student_model, + eval_dataset=dataset, + loss_class=CosineSimilarityLoss, + metric="accuracy", + ) + + # Student Train and evaluate + 
student_trainer.train() + metrics = student_trainer.evaluate() + print("Student results: ", metrics) + self.assertEqual(metrics["accuracy"], 1.0) + + def test_trainer_raises_error_with_missing_label(self): + dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) + trainer = DistillationSetFitTrainer( + teacher_model=self.teacher_model, + train_dataset=dataset, + student_model=self.student_model, + eval_dataset=dataset, + num_iterations=self.num_iterations, + ) + with pytest.raises(ValueError): + trainer.train() + + def test_trainer_raises_error_with_missing_text(self): + dataset = Dataset.from_dict({"label": [0, 1, 2], "extra_column": ["d", "e", "f"]}) + trainer = DistillationSetFitTrainer( + teacher_model=self.teacher_model, + train_dataset=dataset, + student_model=self.student_model, + eval_dataset=dataset, + num_iterations=self.num_iterations, + ) + with pytest.raises(ValueError): + trainer.train() + + def test_column_mapping_with_missing_text(self): + dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) + trainer = DistillationSetFitTrainer( + teacher_model=self.teacher_model, + train_dataset=dataset, + student_model=self.student_model, + eval_dataset=dataset, + num_iterations=self.num_iterations, + column_mapping={"label_new": "label"}, + ) + with pytest.raises(ValueError): + trainer._validate_column_mapping(trainer.train_dataset) + + def test_column_mapping_multilabel(self): + dataset = Dataset.from_dict({"text_new": ["a", "b", "c"], "label_new": [[0, 1], [1, 2], [2, 0]]}) + + trainer = DistillationSetFitTrainer( + teacher_model=self.teacher_model, + train_dataset=dataset, + student_model=self.student_model, + eval_dataset=dataset, + num_iterations=self.num_iterations, + column_mapping={"text_new": "text", "label_new": "label"}, + ) + + trainer._validate_column_mapping(trainer.train_dataset) + formatted_dataset = trainer._apply_column_mapping(trainer.train_dataset, trainer.column_mapping) + + 
assert formatted_dataset.column_names == ["text", "label"] + assert formatted_dataset[0]["text"] == "a" + assert formatted_dataset[0]["label"] == [0, 1] + assert formatted_dataset[1]["text"] == "b" diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 439242cd..c5a53ba2 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -9,7 +9,8 @@ from setfit import logging from setfit.modeling import SetFitModel, SupConLoss -from setfit.trainer import SetFitTrainer +from setfit.trainer import Trainer +from setfit.training_args import TrainingArguments from setfit.utils import BestRun @@ -20,7 +21,7 @@ class SetFitTrainerTest(TestCase): def setUp(self): self.model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") - self.num_iterations = 1 + self.args = TrainingArguments(num_iterations=1) def test_trainer_works_with_model_init(self): def get_model(): @@ -30,11 +31,11 @@ def get_model(): dataset = Dataset.from_dict( {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} ) - trainer = SetFitTrainer( + trainer = Trainer( model_init=get_model, + args=self.args, train_dataset=dataset, eval_dataset=dataset, - num_iterations=self.num_iterations, column_mapping={"text_new": "text", "label_new": "label"}, ) trainer.train() @@ -45,11 +46,11 @@ def test_trainer_works_with_column_mapping(self): dataset = Dataset.from_dict( {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} ) - trainer = SetFitTrainer( + trainer = Trainer( model=self.model, + args=self.args, train_dataset=dataset, eval_dataset=dataset, - num_iterations=self.num_iterations, column_mapping={"text_new": "text", "label_new": "label"}, ) trainer.train() @@ -58,36 +59,30 @@ def test_trainer_works_with_column_mapping(self): def test_trainer_works_with_default_columns(self): dataset = Dataset.from_dict({"text": ["a", "b", "c"], "label": [0, 1, 2], "extra_column": ["d", "e", "f"]}) - trainer = SetFitTrainer( - 
model=self.model, train_dataset=dataset, eval_dataset=dataset, num_iterations=self.num_iterations - ) + trainer = Trainer(model=self.model, args=self.args, train_dataset=dataset, eval_dataset=dataset) trainer.train() metrics = trainer.evaluate() self.assertEqual(metrics["accuracy"], 1.0) def test_trainer_raises_error_with_missing_label(self): dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) - trainer = SetFitTrainer( - model=self.model, train_dataset=dataset, eval_dataset=dataset, num_iterations=self.num_iterations - ) + trainer = Trainer(model=self.model, args=self.args, train_dataset=dataset, eval_dataset=dataset) with pytest.raises(ValueError): trainer.train() def test_trainer_raises_error_with_missing_text(self): dataset = Dataset.from_dict({"label": [0, 1, 2], "extra_column": ["d", "e", "f"]}) - trainer = SetFitTrainer( - model=self.model, train_dataset=dataset, eval_dataset=dataset, num_iterations=self.num_iterations - ) + trainer = Trainer(model=self.model, args=self.args, train_dataset=dataset, eval_dataset=dataset) with pytest.raises(ValueError): trainer.train() def test_column_mapping_with_missing_text(self): dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) - trainer = SetFitTrainer( + trainer = Trainer( model=self.model, + args=self.args, train_dataset=dataset, eval_dataset=dataset, - num_iterations=self.num_iterations, column_mapping={"label_new": "label"}, ) with pytest.raises(ValueError): @@ -96,11 +91,11 @@ def test_column_mapping_with_missing_text(self): def test_column_mapping_multilabel(self): dataset = Dataset.from_dict({"text_new": ["a", "b", "c"], "label_new": [[0, 1], [1, 2], [2, 0]]}) - trainer = SetFitTrainer( + trainer = Trainer( model=self.model, + args=self.args, train_dataset=dataset, eval_dataset=dataset, - num_iterations=self.num_iterations, column_mapping={"text_new": "text", "label_new": "label"}, ) @@ -128,12 +123,12 @@ def compute_metrics(y_pred, y_test): 
"accuracy": accuracy_metric.compute(predictions=y_pred, references=y_test)["accuracy"], } - trainer = SetFitTrainer( + trainer = Trainer( model=self.model, + args=self.args, train_dataset=dataset, eval_dataset=dataset, metric=compute_metrics, - num_iterations=self.num_iterations, column_mapping={"text_new": "text", "label_new": "label"}, ) @@ -153,12 +148,12 @@ def test_raise_when_metric_value_is_invalid(self): {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} ) - trainer = SetFitTrainer( + trainer = Trainer( model=self.model, + args=self.args, train_dataset=dataset, eval_dataset=dataset, metric="this-metric-does-not-exist", # invalid metric value - num_iterations=self.num_iterations, column_mapping={"text_new": "text", "label_new": "label"}, ) @@ -167,15 +162,6 @@ def test_raise_when_metric_value_is_invalid(self): with self.assertRaises(FileNotFoundError): trainer.evaluate() - def test_trainer_raises_error_with_wrong_warmup_proportion(self): - # warmup_proportion must not be > 1.0 - with pytest.raises(ValueError): - SetFitTrainer(warmup_proportion=1.1) - - # warmup_proportion must not be < 0.0 - with pytest.raises(ValueError): - SetFitTrainer(warmup_proportion=-0.1) - class SetFitTrainerDifferentiableHeadTest(TestCase): def setUp(self): @@ -187,27 +173,22 @@ def setUp(self): use_differentiable_head=True, head_params={"out_features": 3}, ) - self.num_iterations = 1 + self.args = TrainingArguments(num_iterations=1) def test_trainer_max_length_exceeds_max_acceptable_length(self): - trainer = SetFitTrainer( + trainer = Trainer( model=self.model, + args=self.args, train_dataset=self.dataset, eval_dataset=self.dataset, - num_iterations=self.num_iterations, column_mapping={"text_new": "text", "label_new": "label"}, ) trainer.unfreeze(keep_body_frozen=True) with self.assertLogs(level=logging.WARNING) as cm: max_length = 4096 max_acceptable_length = self.model.model_body.get_max_seq_length() - trainer.train( - num_epochs=1, - 
batch_size=3, - learning_rate=1e-2, - l2_weight=0.0, - max_length=max_length, - ) + args = TrainingArguments(num_iterations=1, max_length=max_length) + trainer.train(args) self.assertEqual( cm.output, [ @@ -219,26 +200,20 @@ def test_trainer_max_length_exceeds_max_acceptable_length(self): ) def test_trainer_max_length_is_smaller_than_max_acceptable_length(self): - trainer = SetFitTrainer( + trainer = Trainer( model=self.model, + args=self.args, train_dataset=self.dataset, eval_dataset=self.dataset, - num_iterations=self.num_iterations, column_mapping={"text_new": "text", "label_new": "label"}, ) - trainer.unfreeze(keep_body_frozen=True) # An alternative way of `assertNoLogs`, which is new in Python 3.10 try: with self.assertLogs(level=logging.WARNING) as cm: max_length = 32 - trainer.train( - num_epochs=1, - batch_size=3, - learning_rate=1e-2, - l2_weight=0.0, - max_length=max_length, - ) + args = TrainingArguments(num_iterations=1, max_length=max_length) + trainer.train(args) self.assertEqual(cm.output, []) except AssertionError as e: if e.args[0] != "no logs of level WARNING or higher triggered on root": @@ -250,7 +225,7 @@ def setUp(self): self.model = SetFitModel.from_pretrained( "sentence-transformers/paraphrase-albert-small-v2", multi_target_strategy="one-vs-rest" ) - self.num_iterations = 1 + self.args = TrainingArguments(num_iterations=1) def test_trainer_multilabel_support_callable_as_metric(self): dataset = Dataset.from_dict({"text_new": ["a", "b", "c"], "label_new": [[1, 0, 0], [0, 1, 0], [0, 0, 1]]}) @@ -264,12 +239,12 @@ def compute_metrics(y_pred, y_test): "accuracy": multilabel_accuracy_metric.compute(predictions=y_pred, references=y_test)["accuracy"], } - trainer = SetFitTrainer( + trainer = Trainer( model=self.model, + args=self.args, train_dataset=dataset, eval_dataset=dataset, metric=compute_metrics, - num_iterations=self.num_iterations, column_mapping={"text_new": "text", "label_new": "label"}, ) @@ -291,7 +266,7 @@ def setUp(self): self.dataset 
= Dataset.from_dict( {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} ) - self.num_iterations = 1 + self.args = TrainingArguments(num_iterations=1) def test_hyperparameter_search(self): class MyTrialShortNamer(TrialShortNamer): @@ -320,10 +295,10 @@ def model_init(params): def hp_name(trial): return MyTrialShortNamer.shortname(trial.params) - trainer = SetFitTrainer( + trainer = Trainer( + args=self.args, train_dataset=self.dataset, eval_dataset=self.dataset, - num_iterations=self.num_iterations, model_init=model_init, column_mapping={"text_new": "text", "label_new": "label"}, ) @@ -346,11 +321,12 @@ def hp_name(trial): def test_trainer_works_with_non_default_loss_class(loss_class): dataset = Dataset.from_dict({"text": ["a 1", "b 1", "c 1", "a 2", "b 2", "c 2"], "label": [0, 1, 2, 0, 1, 2]}) model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") - trainer = SetFitTrainer( + args = TrainingArguments(num_iterations=1) + trainer = Trainer( model=model, + args=args, train_dataset=dataset, eval_dataset=dataset, - num_iterations=1, loss_class=loss_class, ) trainer.train() diff --git a/tests/test_trainer_distillation.py b/tests/test_trainer_distillation.py index 4257a42e..df2bad4c 100644 --- a/tests/test_trainer_distillation.py +++ b/tests/test_trainer_distillation.py @@ -4,20 +4,21 @@ from datasets import Dataset from sentence_transformers.losses import CosineSimilarityLoss -from setfit import DistillationSetFitTrainer, SetFitTrainer +from setfit import DistillationTrainer, Trainer from setfit.modeling import SetFitModel +from setfit.training_args import TrainingArguments class DistillationSetFitTrainerTest(TestCase): def setUp(self): self.teacher_model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") self.student_model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-MiniLM-L3-v2") - self.num_iterations = 1 + self.args = 
TrainingArguments(num_iterations=1) def test_trainer_works_with_default_columns(self): dataset = Dataset.from_dict({"text": ["a", "b", "c"], "label": [0, 1, 2], "extra_column": ["d", "e", "f"]}) # train a teacher model - teacher_trainer = SetFitTrainer( + teacher_trainer = Trainer( model=self.teacher_model, train_dataset=dataset, eval_dataset=dataset, @@ -29,7 +30,7 @@ def test_trainer_works_with_default_columns(self): metrics = teacher_trainer.evaluate() teacher_model = teacher_trainer.model - student_trainer = DistillationSetFitTrainer( + student_trainer = DistillationTrainer( teacher_model=teacher_model, train_dataset=dataset, student_model=self.student_model, @@ -46,36 +47,36 @@ def test_trainer_works_with_default_columns(self): def test_trainer_raises_error_with_missing_label(self): dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) - trainer = DistillationSetFitTrainer( + trainer = DistillationTrainer( teacher_model=self.teacher_model, train_dataset=dataset, student_model=self.student_model, eval_dataset=dataset, - num_iterations=self.num_iterations, + args=self.args, ) with pytest.raises(ValueError): trainer.train() def test_trainer_raises_error_with_missing_text(self): dataset = Dataset.from_dict({"label": [0, 1, 2], "extra_column": ["d", "e", "f"]}) - trainer = DistillationSetFitTrainer( + trainer = DistillationTrainer( teacher_model=self.teacher_model, train_dataset=dataset, student_model=self.student_model, eval_dataset=dataset, - num_iterations=self.num_iterations, + args=self.args, ) with pytest.raises(ValueError): trainer.train() def test_column_mapping_with_missing_text(self): dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) - trainer = DistillationSetFitTrainer( + trainer = DistillationTrainer( teacher_model=self.teacher_model, train_dataset=dataset, student_model=self.student_model, eval_dataset=dataset, - num_iterations=self.num_iterations, + args=self.args, 
column_mapping={"label_new": "label"}, ) with pytest.raises(ValueError): @@ -84,12 +85,12 @@ def test_column_mapping_with_missing_text(self): def test_column_mapping_multilabel(self): dataset = Dataset.from_dict({"text_new": ["a", "b", "c"], "label_new": [[0, 1], [1, 2], [2, 0]]}) - trainer = DistillationSetFitTrainer( + trainer = DistillationTrainer( teacher_model=self.teacher_model, train_dataset=dataset, student_model=self.student_model, eval_dataset=dataset, - num_iterations=self.num_iterations, + args=self.args, column_mapping={"text_new": "text", "label_new": "label"}, ) diff --git a/tests/test_training_args.py b/tests/test_training_args.py new file mode 100644 index 00000000..5575aee6 --- /dev/null +++ b/tests/test_training_args.py @@ -0,0 +1,15 @@ +from unittest import TestCase + +import pytest + +from setfit.training_args import TrainingArguments + +class TestTrainingArguments(TestCase): + def test_training_args_raises_error_with_wrong_warmup_proportion(self): + # warmup_proportion must not be > 1.0 + with pytest.raises(ValueError): + TrainingArguments(warmup_proportion=1.1) + + # warmup_proportion must not be < 0.0 + with pytest.raises(ValueError): + TrainingArguments(warmup_proportion=-0.1) \ No newline at end of file From 89f4435d565effb2b86975ee6a2e5b4dfd47c4c9 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 11 Jan 2023 16:26:22 +0100 Subject: [PATCH 02/77] Readded support for hyperparameter tuning --- src/setfit/trainer.py | 19 ++++++------------- src/setfit/training_args.py | 21 ++++++++++++++++++--- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index f05c3e97..aa124a67 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -162,18 +162,11 @@ def apply_hyperparameters(self, params: Dict[str, Any], final_model: bool = Fals params (`Dict[str, Any]`): The parameters, usually from `BestRun.hyperparameters` final_model (`bool`, *optional*, defaults to `False`): If `True`, 
replace the `model_init()` function with a fixed model based on the parameters. """ - for key, value in params.items(): - if hasattr(self, key): - old_attr = getattr(self, key, None) - # Casting value to the proper type - if old_attr is not None: - value = type(old_attr)(value) - setattr(self, key, value) - elif number_of_arguments(self.model_init) == 0: # we do not warn if model_init could be using it - logger.warning( - f"Trying to set {key!r} in the hyperparameter search but there is no corresponding field in " - "`SetFitTrainer`, and `model_init` does not take any arguments." - ) + + if self.args: + self.args = self.args.update(params, ignore_extra=True) + else: + self.args = TrainingArguments.from_dict(params, ignore_extra=True) self.model = self.model_init(params) if final_model: @@ -397,7 +390,7 @@ def hyperparameter_search( if backend is None: backend = default_hp_search_backend() if backend is None: - raise RuntimeError("optuna should be installed. " "To install optuna run `pip install optuna`. ") + raise RuntimeError("optuna should be installed. To install optuna run `pip install optuna`. ") backend = HPSearchBackend(backend) if backend == HPSearchBackend.OPTUNA and not is_optuna_available(): raise RuntimeError("You picked the optuna backend, but it is not installed. 
Use `pip install optuna`.") diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 5a796d7a..2a0f6bfd 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -1,6 +1,9 @@ +from __future__ import annotations + from copy import copy from dataclasses import dataclass, fields, field -from typing import Callable, Tuple, Union +import inspect +from typing import Any, Callable, Dict, Tuple, Union from sentence_transformers.losses.BatchHardTripletLoss import BatchHardTripletLossDistanceFunction @@ -64,5 +67,17 @@ def to_dict(self): # filter out fields that are defined as field(init=False) return {field.name: getattr(self, field.name) for field in fields(self) if field.init} - def copy(self): - return copy(self) \ No newline at end of file + @classmethod + def from_dict(cls, arguments: Dict[str, Any], ignore_extra: bool = False) -> TrainingArguments: + if ignore_extra: + return cls(**{ + key: value for key, value in arguments.items() + if key in inspect.signature(cls).parameters + }) + return cls(**arguments) + + def copy(self) -> TrainingArguments: + return copy(self) + + def update(self, arguments: Dict[str, Any], ignore_extra: bool = False) -> TrainingArguments: + return TrainingArguments.from_dict({**self.to_dict(), **arguments}, ignore_extra=ignore_extra) From 5f2a6b3c4170f38b59c8b32ef54f8208a693d279 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 11 Jan 2023 16:28:33 +0100 Subject: [PATCH 03/77] Remove unused imports and reformat --- src/setfit/modeling.py | 17 ++++++++++------- src/setfit/trainer.py | 6 ++++-- src/setfit/trainer_distillation.py | 5 +---- src/setfit/training_args.py | 10 ++++------ tests/test_deprecated_trainer.py | 8 ++++++-- tests/test_training_args.py | 3 ++- 6 files changed, 27 insertions(+), 22 deletions(-) diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index 0fe5abcf..c1b673db 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -1,9 +1,8 @@ import os +import warnings 
from dataclasses import dataclass from pathlib import Path -import time from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union -import warnings # Google Colab runs on Python 3.7, so we need this to be compatible @@ -16,14 +15,14 @@ import numpy as np import requests import torch -from torch import nn from huggingface_hub import PyTorchModelHubMixin, hf_hub_download from sentence_transformers import InputExample, SentenceTransformer, models from sklearn.linear_model import LogisticRegression from sklearn.multiclass import OneVsRestClassifier from sklearn.multioutput import ClassifierChain, MultiOutputClassifier +from torch import nn from torch.utils.data import DataLoader -from tqdm.auto import trange, tqdm +from tqdm.auto import tqdm, trange from . import logging from .data import SetFitDataset @@ -279,7 +278,7 @@ def fit( max_length: Optional[int] = None, show_progress_bar: bool = True, end_to_end: bool = False, - **kwargs + **kwargs, ) -> None: if self.has_differentiable_head: # train with pyTorch device = self.model_body.device @@ -380,9 +379,13 @@ def freeze(self, component: Optional[Literal["body", "head"]] = None) -> None: if (component is None or component == "head") and self.has_differentiable_head: self._freeze_or_not(self.model_head, to_freeze=True) - def unfreeze(self, component: Optional[Literal["body", "head"]] = None, keep_body_frozen: Optional[bool] = None) -> None: + def unfreeze( + self, component: Optional[Literal["body", "head"]] = None, keep_body_frozen: Optional[bool] = None + ) -> None: if keep_body_frozen is not None: - warnings.warn("`keep_body_frozen` is deprecated. Please either pass \"head\", \"body\" or no arguments to unfreeze both.") + warnings.warn( + '`keep_body_frozen` is deprecated. Please either pass "head", "body" or no arguments to unfreeze both.' 
+ ) if component is None or component == "body": self._freeze_or_not(self.model_body, to_freeze=False) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index aa124a67..aa45c3e7 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -1,6 +1,6 @@ import math -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union import warnings +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union import evaluate import numpy as np @@ -214,7 +214,9 @@ def train( self._hp_search_setup(trial) # sets trainer parameters and initializes model if self.train_dataset is None: - raise ValueError(f"Training requires a `train_dataset` given to the `{self.__class__.__name__}` initialization.") + raise ValueError( + f"Training requires a `train_dataset` given to the `{self.__class__.__name__}` initialization." + ) self._validate_column_mapping(self.train_dataset) train_dataset = self.train_dataset diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index 9bf53888..373d037d 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -1,13 +1,11 @@ import math -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union import numpy as np import torch from sentence_transformers import InputExample, losses, util from sentence_transformers.datasets import SentenceLabelDataset -from sentence_transformers.losses.BatchHardTripletLoss import BatchHardTripletLossDistanceFunction from torch.utils.data import DataLoader -from transformers.trainer_utils import set_seed from setfit.training_args import TrainingArguments @@ -17,7 +15,6 @@ if TYPE_CHECKING: - import optuna from datasets import Dataset from .modeling import SetFitModel diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 2a0f6bfd..7757072c 100644 --- a/src/setfit/training_args.py +++ 
b/src/setfit/training_args.py @@ -1,9 +1,10 @@ from __future__ import annotations -from copy import copy -from dataclasses import dataclass, fields, field import inspect +from copy import copy +from dataclasses import dataclass, field, fields from typing import Any, Callable, Dict, Tuple, Union + from sentence_transformers.losses.BatchHardTripletLoss import BatchHardTripletLossDistanceFunction @@ -70,10 +71,7 @@ def to_dict(self): @classmethod def from_dict(cls, arguments: Dict[str, Any], ignore_extra: bool = False) -> TrainingArguments: if ignore_extra: - return cls(**{ - key: value for key, value in arguments.items() - if key in inspect.signature(cls).parameters - }) + return cls(**{key: value for key, value in arguments.items() if key in inspect.signature(cls).parameters}) return cls(**arguments) def copy(self) -> TrainingArguments: diff --git a/tests/test_deprecated_trainer.py b/tests/test_deprecated_trainer.py index 467b77e0..e05fb775 100644 --- a/tests/test_deprecated_trainer.py +++ b/tests/test_deprecated_trainer.py @@ -189,7 +189,9 @@ def setUp(self): ) self.num_iterations = 1 - @pytest.mark.skip(reason="The `trainer.train` argument removals were a hard deprecation, so this test would throw an error.") + @pytest.mark.skip( + reason="The `trainer.train` argument removals were a hard deprecation, so this test would throw an error." + ) def test_trainer_max_length_exceeds_max_acceptable_length(self): trainer = SetFitTrainer( model=self.model, @@ -219,7 +221,9 @@ def test_trainer_max_length_exceeds_max_acceptable_length(self): ], ) - @pytest.mark.skip(reason="The `trainer.train` argument removals were a hard deprecation, so this test would throw an error.") + @pytest.mark.skip( + reason="The `trainer.train` argument removals were a hard deprecation, so this test would throw an error." 
+ ) def test_trainer_max_length_is_smaller_than_max_acceptable_length(self): trainer = SetFitTrainer( model=self.model, diff --git a/tests/test_training_args.py b/tests/test_training_args.py index 5575aee6..61261bb8 100644 --- a/tests/test_training_args.py +++ b/tests/test_training_args.py @@ -4,6 +4,7 @@ from setfit.training_args import TrainingArguments + class TestTrainingArguments(TestCase): def test_training_args_raises_error_with_wrong_warmup_proportion(self): # warmup_proportion must not be > 1.0 @@ -12,4 +13,4 @@ def test_training_args_raises_error_with_wrong_warmup_proportion(self): # warmup_proportion must not be < 0.0 with pytest.raises(ValueError): - TrainingArguments(warmup_proportion=-0.1) \ No newline at end of file + TrainingArguments(warmup_proportion=-0.1) From 622f33bbbda5b2919330ca38a2b1c04723b2f2f6 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 11 Jan 2023 16:48:34 +0100 Subject: [PATCH 04/77] Preserve desired behaviour despite deprecation of keep_body_frozen parameter --- src/setfit/modeling.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index c1b673db..d83072de 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -386,6 +386,10 @@ def unfreeze( warnings.warn( '`keep_body_frozen` is deprecated. Please either pass "head", "body" or no arguments to unfreeze both.' ) + # If the body must stay frozen, only unfreeze the head. Eventually, this entire if-branch + # can be removed. 
+ if keep_body_frozen and not component: + component = "head" if component is None or component == "body": self._freeze_or_not(self.model_body, to_freeze=False) From ff591543398d2799eb527cf2f2b91dbf2bcdbb9a Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 11 Jan 2023 17:51:01 +0100 Subject: [PATCH 05/77] Ensure that DeprecationWarnings are displayed --- src/setfit/__init__.py | 6 ++++++ src/setfit/modeling.py | 4 +++- src/setfit/trainer.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/setfit/__init__.py b/src/setfit/__init__.py index 37db149b..ce9f19fc 100644 --- a/src/setfit/__init__.py +++ b/src/setfit/__init__.py @@ -1,6 +1,12 @@ __version__ = "0.6.0.dev0" +import warnings + from .data import add_templated_examples, sample_dataset from .modeling import SetFitHead, SetFitModel from .trainer import SetFitTrainer, Trainer from .trainer_distillation import DistillationSetFitTrainer, DistillationTrainer + + +# Ensure that DeprecationWarnings are always shown +warnings.filterwarnings("default", category=DeprecationWarning) diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index d83072de..b672392a 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -384,7 +384,9 @@ def unfreeze( ) -> None: if keep_body_frozen is not None: warnings.warn( - '`keep_body_frozen` is deprecated. Please either pass "head", "body" or no arguments to unfreeze both.' + '`keep_body_frozen` is deprecated. Please either pass "head", "body" or no arguments to unfreeze both.', + DeprecationWarning, + stacklevel=2, ) # If the body must stay frozen, only unfreeze the head. Eventually, this entire if-branch # can be removed. diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index aa45c3e7..ce40873b 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -468,7 +468,7 @@ def __init__( samples_per_label: int = 2, ): warnings.warn( - "`SetFitTrainer` has been deprecated. 
Please use `from setfit import Trainer` instead.", DeprecationWarning + "`SetFitTrainer` has been deprecated. Please use `Trainer` instead.", DeprecationWarning, stacklevel=2 ) args = TrainingArguments( num_iterations=num_iterations, From 3b4ef5812cc5626ff0cc72e49fe809c24c2ace19 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 11 Jan 2023 17:57:15 +0100 Subject: [PATCH 06/77] Set Trainer.freeze and Trainer.unfreeze methods normally --- src/setfit/trainer.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index ce40873b..02572d91 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -2,6 +2,13 @@ import warnings from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union + +# Google Colab runs on Python 3.7, so we need this to be compatible +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal + import evaluate import numpy as np from sentence_transformers import InputExample, losses @@ -107,10 +114,6 @@ def __init__( raise RuntimeError("`SetFitTrainer` requires either a `model` or `model_init` argument, but not both") self.model = model - # Adopt Trainer.(un)freeze from SetFitModel.(un)freeze - self.freeze = self.model.freeze - self.unfreeze = self.model.unfreeze - self.hp_search_backend = None self._freeze = True # If True, will train the body only; otherwise, train the body and head @@ -203,6 +206,14 @@ def call_model_init(self, params: Optional[Dict[str, Any]] = None): return model + def freeze(self, component: Optional[Literal["body", "head"]] = None) -> None: + return self.model.freeze(component) + + def unfreeze( + self, component: Optional[Literal["body", "head"]] = None, keep_body_frozen: Optional[bool] = None + ) -> None: + return self.model.unfreeze(component, keep_body_frozen=keep_body_frozen) + def train( self, args: Optional[TrainingArguments] = None, trial: Optional[Union["optuna.Trial", 
Dict[str, Any]]] = None ): From fd68274ade7066a5317ba47dd8f3d1b892dfd117 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 11 Jan 2023 18:33:59 +0100 Subject: [PATCH 07/77] Add TrainingArgument tests for num_epochs, batch_sizes, lr --- tests/test_training_args.py | 75 +++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tests/test_training_args.py b/tests/test_training_args.py index 61261bb8..941703b5 100644 --- a/tests/test_training_args.py +++ b/tests/test_training_args.py @@ -14,3 +14,78 @@ def test_training_args_raises_error_with_wrong_warmup_proportion(self): # warmup_proportion must not be < 0.0 with pytest.raises(ValueError): TrainingArguments(warmup_proportion=-0.1) + + def test_training_args_batch_sizes(self): + batch_size_A = 12 + batch_size_B = 4 + batch_size_C = 6 + + args = TrainingArguments(batch_size=batch_size_A) + assert args.batch_size == (batch_size_A, batch_size_A) + assert args.embedding_batch_size == batch_size_A + assert args.classifier_batch_size == batch_size_A + + args = TrainingArguments(batch_size=(batch_size_A, batch_size_B)) + assert args.batch_size == (batch_size_A, batch_size_B) + assert args.embedding_batch_size == batch_size_A + assert args.classifier_batch_size == batch_size_B + + args = TrainingArguments(batch_size=(batch_size_A, batch_size_B), embedding_batch_size=batch_size_C) + assert args.batch_size == (batch_size_A, batch_size_B) + assert args.embedding_batch_size == batch_size_C + assert args.classifier_batch_size == batch_size_B + + args = TrainingArguments(batch_size=batch_size_A, embedding_batch_size=batch_size_C) + assert args.batch_size == (batch_size_A, batch_size_A) + assert args.embedding_batch_size == batch_size_C + assert args.classifier_batch_size == batch_size_A + + def test_training_args_num_epochs(self): + num_epochs_A = 12 + num_epochs_B = 4 + num_epochs_C = 6 + + args = TrainingArguments(num_epochs=num_epochs_A) + assert args.num_epochs == (num_epochs_A, num_epochs_A) + 
assert args.embedding_num_epochs == num_epochs_A + assert args.classifier_num_epochs == num_epochs_A + + args = TrainingArguments(num_epochs=(num_epochs_A, num_epochs_B)) + assert args.num_epochs == (num_epochs_A, num_epochs_B) + assert args.embedding_num_epochs == num_epochs_A + assert args.classifier_num_epochs == num_epochs_B + + args = TrainingArguments(num_epochs=(num_epochs_A, num_epochs_B), embedding_num_epochs=num_epochs_C) + assert args.num_epochs == (num_epochs_A, num_epochs_B) + assert args.embedding_num_epochs == num_epochs_C + assert args.classifier_num_epochs == num_epochs_B + + args = TrainingArguments(num_epochs=num_epochs_A, embedding_num_epochs=num_epochs_C) + assert args.num_epochs == (num_epochs_A, num_epochs_A) + assert args.embedding_num_epochs == num_epochs_C + assert args.classifier_num_epochs == num_epochs_A + + def test_training_args_learning_rates(self): + learning_rate_A = 1e-2 + learning_rate_B = 1e-3 + learning_rate_C = 1e-4 + + base = TrainingArguments() + + args = TrainingArguments(classifier_learning_rate=learning_rate_A) + assert args.classifier_learning_rate == (base.embedding_learning_rate, learning_rate_A) + assert args.embedding_learning_rate == base.embedding_learning_rate + + args = TrainingArguments(classifier_learning_rate=learning_rate_A, embedding_learning_rate=learning_rate_B) + assert args.classifier_learning_rate == (learning_rate_B, learning_rate_A) + assert args.embedding_learning_rate == learning_rate_B + + args = TrainingArguments( + classifier_learning_rate=(learning_rate_C, learning_rate_A), embedding_learning_rate=learning_rate_B + ) + assert args.classifier_learning_rate == (learning_rate_C, learning_rate_A) + assert args.embedding_learning_rate == learning_rate_B + + args = TrainingArguments(classifier_learning_rate=(learning_rate_C, learning_rate_A)) + assert args.classifier_learning_rate == (learning_rate_C, learning_rate_A) + assert args.embedding_learning_rate == base.embedding_learning_rate From 
14602ea2773f77b82243624ed1bca5e0772519e7 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 11 Jan 2023 19:03:50 +0100 Subject: [PATCH 08/77] Convert trainer.train arguments into a softer deprecation --- src/setfit/trainer.py | 14 +++++++++++++- tests/test_deprecated_trainer.py | 8 ++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 02572d91..e367b7b3 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -215,8 +215,20 @@ def unfreeze( return self.model.unfreeze(component, keep_body_frozen=keep_body_frozen) def train( - self, args: Optional[TrainingArguments] = None, trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None + self, + args: Optional[TrainingArguments] = None, + trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, + **kwargs, ): + if kwargs: + warnings.warn( + f"`{self.__class__.__name__}.train` does not accept keyword arguments anymore. " + f"Please provide training arguments via a `TrainingArguments` instance to the `{self.__class__.__name__}` " + f"initialisation or the `{self.__class__.__name__}.train` method.", + DeprecationWarning, + stacklevel=2, + ) + args = args or self.args or TrainingArguments() set_seed(args.seed) # Seed must be set before instantiating the model when using model_init. diff --git a/tests/test_deprecated_trainer.py b/tests/test_deprecated_trainer.py index e05fb775..b3b8de18 100644 --- a/tests/test_deprecated_trainer.py +++ b/tests/test_deprecated_trainer.py @@ -189,9 +189,7 @@ def setUp(self): ) self.num_iterations = 1 - @pytest.mark.skip( - reason="The `trainer.train` argument removals were a hard deprecation, so this test would throw an error." 
- ) + @pytest.mark.skip(reason="The `trainer.train` arguments are now ignored, causing this test to fail.") def test_trainer_max_length_exceeds_max_acceptable_length(self): trainer = SetFitTrainer( model=self.model, @@ -221,9 +219,7 @@ def test_trainer_max_length_exceeds_max_acceptable_length(self): ], ) - @pytest.mark.skip( - reason="The `trainer.train` argument removals were a hard deprecation, so this test would throw an error." - ) + @pytest.mark.skip(reason="The `trainer.train` arguments are now ignored, causing this test to fail.") def test_trainer_max_length_is_smaller_than_max_acceptable_length(self): trainer = SetFitTrainer( model=self.model, From 9fc55a699be8fcb8c6c7b4471337fe5072bc0df4 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 23 Jan 2023 12:45:49 +0100 Subject: [PATCH 09/77] Use body/head_learning_rate instead of classifier/embedding_learning_rate The reasoning is that with body_learning_rate, the tuple is for (training embedding phase, training classifier phase), which matches the tuples that you should give to num_epochs and batch_size. 
--- src/setfit/modeling.py | 20 +++++++------ src/setfit/trainer.py | 6 ++-- src/setfit/trainer_distillation.py | 6 ++-- src/setfit/training_args.py | 20 ++++++++++--- tests/test_training_args.py | 46 ++++++++++++++++++++++-------- 5 files changed, 68 insertions(+), 30 deletions(-) diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index b89f8aed..daf22d6b 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -283,7 +283,8 @@ def fit( y_train: Union[List[int], List[List[int]]], classifier_num_epochs: int, classifier_batch_size: Optional[int] = None, - classifier_learning_rate: Optional[Tuple[float, float]] = (None, None), + body_classifier_learning_rate: Optional[float] = None, + head_learning_rate: Optional[float] = None, l2_weight: Optional[float] = None, max_length: Optional[int] = None, show_progress_bar: bool = True, @@ -299,8 +300,7 @@ def fit( dataloader = self._prepare_dataloader(x_train, y_train, classifier_batch_size, max_length) criterion = self.model_head.get_loss_fn() - embedding_learning_rate, classifier_learning_rate = classifier_learning_rate - optimizer = self._prepare_optimizer(classifier_learning_rate, embedding_learning_rate, l2_weight) + optimizer = self._prepare_optimizer(head_learning_rate, body_classifier_learning_rate, l2_weight) scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5) for epoch_idx in trange(classifier_num_epochs, desc="Epoch", disable=not show_progress_bar): for batch in tqdm(dataloader, desc="Iteration", disable=not show_progress_bar, leave=False): @@ -367,16 +367,20 @@ def _prepare_dataloader( def _prepare_optimizer( self, - classifier_learning_rate: float, - embedding_learning_rate: Optional[float], + head_learning_rate: float, + body_classifier_learning_rate: Optional[float], l2_weight: float, ) -> torch.optim.Optimizer: - embedding_learning_rate = embedding_learning_rate or classifier_learning_rate + body_classifier_learning_rate = body_classifier_learning_rate or 
head_learning_rate l2_weight = l2_weight or self.l2_weight optimizer = torch.optim.AdamW( [ - {"params": self.model_body.parameters(), "lr": embedding_learning_rate, "weight_decay": l2_weight}, - {"params": self.model_head.parameters(), "lr": classifier_learning_rate, "weight_decay": l2_weight}, + { + "params": self.model_body.parameters(), + "lr": body_classifier_learning_rate, + "weight_decay": l2_weight, + }, + {"params": self.model_head.parameters(), "lr": head_learning_rate, "weight_decay": l2_weight}, ], ) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index c4fe9b1d..dc5303ac 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -312,7 +312,7 @@ def train_embeddings(self, x_train: List[str], y_train: List[int], args: Optiona self.model.model_body.fit( train_objectives=[(train_dataloader, train_loss)], epochs=args.embedding_num_epochs, - optimizer_params={"lr": args.embedding_learning_rate}, + optimizer_params={"lr": args.body_embedding_learning_rate}, warmup_steps=warmup_steps, show_progress_bar=args.show_progress_bar, use_amp=args.use_amp, @@ -493,8 +493,8 @@ def __init__( args = TrainingArguments( num_iterations=num_iterations, num_epochs=num_epochs, - classifier_learning_rate=learning_rate, - embedding_learning_rate=learning_rate, + body_learning_rate=learning_rate, + head_learning_rate=learning_rate, batch_size=batch_size, seed=seed, use_amp=use_amp, diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index 373d037d..6f16c296 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -157,7 +157,7 @@ def train_embeddings( train_objectives=[(train_dataloader, train_loss)], epochs=args.embedding_num_epochs, steps_per_epoch=train_steps, - optimizer_params={"lr": args.embedding_learning_rate}, + optimizer_params={"lr": args.body_embedding_learning_rate}, warmup_steps=warmup_steps, show_progress_bar=args.show_progress_bar, use_amp=args.use_amp, @@ -186,8 +186,8 @@ def 
__init__( args = TrainingArguments( num_iterations=num_iterations, num_epochs=num_epochs, - embedding_learning_rate=learning_rate, - classifier_learning_rate=learning_rate, + body_learning_rate=learning_rate, + head_learning_rate=learning_rate, batch_size=batch_size, seed=seed, use_amp=use_amp, diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 7757072c..1d4ebcad 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -25,8 +25,12 @@ class TrainingArguments: num_iterations: int = 20 - embedding_learning_rate: float = 2e-5 - classifier_learning_rate: Union[float, Tuple[float, float]] = (1e-5, 1e-2) + # As with batch_size and num_epochs, the first value in the tuple is the learning rate + # for the embeddings step, while the second value is the learning rate for the classifier step. + body_learning_rate: Union[float, Tuple[float, float]] = field(default=(2e-5, 1e-5), repr=False) + body_embedding_learning_rate: float = None + body_classifier_learning_rate: float = None + head_learning_rate: float = 1e-2 seed: int = 42 use_amp: bool = False @@ -42,6 +46,7 @@ class TrainingArguments: end_to_end: bool = False def __post_init__(self): + # Set `self.embedding_batch_size` and `self.classifier_batch_size` using values from `self.batch_size` if isinstance(self.batch_size, int): self.batch_size = (self.batch_size, self.batch_size) if self.embedding_batch_size is None: @@ -49,6 +54,7 @@ def __post_init__(self): if self.classifier_batch_size is None: self.classifier_batch_size = self.batch_size[1] + # Set `self.embedding_num_epochs` and `self.classifier_num_epochs` using values from `self.num_epochs` if isinstance(self.num_epochs, int): self.num_epochs = (self.num_epochs, self.num_epochs) if self.embedding_num_epochs is None: @@ -56,8 +62,14 @@ def __post_init__(self): if self.classifier_num_epochs is None: self.classifier_num_epochs = self.num_epochs[1] - if isinstance(self.classifier_learning_rate, float): - 
self.classifier_learning_rate = (self.embedding_learning_rate, self.classifier_learning_rate) + # Set `self.body_embedding_learning_rate` and `self.body_classifier_learning_rate` using + # values from `self.body_learning_rate` + if isinstance(self.body_learning_rate, float): + self.body_learning_rate = (self.body_learning_rate, self.body_learning_rate) + if self.body_embedding_learning_rate is None: + self.body_embedding_learning_rate = self.body_learning_rate[0] + if self.body_classifier_learning_rate is None: + self.body_classifier_learning_rate = self.body_learning_rate[1] if self.warmup_proportion < 0.0 or self.warmup_proportion > 1.0: raise ValueError( diff --git a/tests/test_training_args.py b/tests/test_training_args.py index 941703b5..5ad6a850 100644 --- a/tests/test_training_args.py +++ b/tests/test_training_args.py @@ -72,20 +72,42 @@ def test_training_args_learning_rates(self): base = TrainingArguments() - args = TrainingArguments(classifier_learning_rate=learning_rate_A) - assert args.classifier_learning_rate == (base.embedding_learning_rate, learning_rate_A) - assert args.embedding_learning_rate == base.embedding_learning_rate + args = TrainingArguments(body_learning_rate=learning_rate_A) + assert args.body_learning_rate == (learning_rate_A, learning_rate_A) + assert args.body_embedding_learning_rate == learning_rate_A + assert args.body_classifier_learning_rate == learning_rate_A + assert args.head_learning_rate == base.head_learning_rate + + args = TrainingArguments(body_learning_rate=(learning_rate_A, learning_rate_B)) + assert args.body_learning_rate == (learning_rate_A, learning_rate_B) + assert args.body_embedding_learning_rate == learning_rate_A + assert args.body_classifier_learning_rate == learning_rate_B + assert args.head_learning_rate == base.head_learning_rate - args = TrainingArguments(classifier_learning_rate=learning_rate_A, embedding_learning_rate=learning_rate_B) - assert args.classifier_learning_rate == (learning_rate_B, 
learning_rate_A) - assert args.embedding_learning_rate == learning_rate_B + args = TrainingArguments( + body_learning_rate=(learning_rate_A, learning_rate_B), head_learning_rate=learning_rate_C + ) + assert args.body_learning_rate == (learning_rate_A, learning_rate_B) + assert args.body_embedding_learning_rate == learning_rate_A + assert args.body_classifier_learning_rate == learning_rate_B + assert args.head_learning_rate == learning_rate_C args = TrainingArguments( - classifier_learning_rate=(learning_rate_C, learning_rate_A), embedding_learning_rate=learning_rate_B + body_learning_rate=learning_rate_A, + body_embedding_learning_rate=learning_rate_B, + head_learning_rate=learning_rate_C, ) - assert args.classifier_learning_rate == (learning_rate_C, learning_rate_A) - assert args.embedding_learning_rate == learning_rate_B + # Perhaps not ideal, but body_learning_rate is never used directly: + assert args.body_learning_rate == (learning_rate_A, learning_rate_A) + assert args.body_embedding_learning_rate == learning_rate_B + assert args.body_classifier_learning_rate == learning_rate_A + assert args.head_learning_rate == learning_rate_C - args = TrainingArguments(classifier_learning_rate=(learning_rate_C, learning_rate_A)) - assert args.classifier_learning_rate == (learning_rate_C, learning_rate_A) - assert args.embedding_learning_rate == base.embedding_learning_rate + args = TrainingArguments( + body_classifier_learning_rate=learning_rate_A, + body_embedding_learning_rate=learning_rate_B, + head_learning_rate=learning_rate_C, + ) + assert args.body_embedding_learning_rate == learning_rate_B + assert args.body_classifier_learning_rate == learning_rate_A + assert args.head_learning_rate == learning_rate_C From dee70b151e46932049739acf59dd9a80ec45a85e Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 6 Feb 2023 14:38:45 +0100 Subject: [PATCH 10/77] Reformat according to the newest black version --- scripts/setfit/distillation_baseline.py | 1 - 
scripts/setfit/run_fewshot_distillation.py | 1 - scripts/setfit/run_fewshot_multilingual.py | 1 - src/setfit/data.py | 1 - src/setfit/logging.py | 4 ---- src/setfit/modeling.py | 1 - src/setfit/trainer.py | 1 - src/setfit/training_args.py | 1 - 8 files changed, 11 deletions(-) diff --git a/scripts/setfit/distillation_baseline.py b/scripts/setfit/distillation_baseline.py index 98be6bef..75b84b91 100644 --- a/scripts/setfit/distillation_baseline.py +++ b/scripts/setfit/distillation_baseline.py @@ -56,7 +56,6 @@ def compute_metrics_for_regression(self, eval_pred): # ------------------------ Student training ----------------------# # ----------------------------------------------------------------# def standard_model_distillation(self, train_raw_student, x_test, y_test, num_classes): - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") value2hot = {} diff --git a/scripts/setfit/run_fewshot_distillation.py b/scripts/setfit/run_fewshot_distillation.py index 7d1dabae..ef213b50 100644 --- a/scripts/setfit/run_fewshot_distillation.py +++ b/scripts/setfit/run_fewshot_distillation.py @@ -240,7 +240,6 @@ def train(self): self.trained_teacher_model = teacher_trainer.model if self.mode == self.SETFIT_STUDENT: - # student train data = teacher train data + unlabeled data student_train_dataset = concatenate_datasets([self.teacher_train_dataset, fewshot_ds[name]]) diff --git a/scripts/setfit/run_fewshot_multilingual.py b/scripts/setfit/run_fewshot_multilingual.py index 4ffc5577..1b80fbb5 100644 --- a/scripts/setfit/run_fewshot_multilingual.py +++ b/scripts/setfit/run_fewshot_multilingual.py @@ -108,7 +108,6 @@ def eval_setfit(train_data, test_data, model, loss_class, num_epochs, metric): losses.BatchHardSoftMarginTripletLoss, SupConLoss, ]: - train_examples = [InputExample(texts=[text], label=label) for text, label in zip(x_train, y_train)] train_data_sampler = SentenceLabelDataset(train_examples) diff --git a/src/setfit/data.py b/src/setfit/data.py index 
8e99a4de..ee35a428 100644 --- a/src/setfit/data.py +++ b/src/setfit/data.py @@ -265,7 +265,6 @@ def __getitem__(self, idx: int) -> Tuple[TokenizerOutput, Union[int, List[int]]] return feature, label def collate_fn(self, batch): - features = {input_name: [] for input_name in self.tokenizer.model_input_names} labels = [] diff --git a/src/setfit/logging.py b/src/setfit/logging.py index 91aa793e..13368b07 100644 --- a/src/setfit/logging.py +++ b/src/setfit/logging.py @@ -68,17 +68,14 @@ def _get_default_logging_level(): def _get_library_name() -> str: - return __name__.split(".")[0] def _get_library_root_logger() -> logging.Logger: - return logging.getLogger(_get_library_name()) def _configure_library_root_logger() -> None: - global _default_handler with _lock: @@ -96,7 +93,6 @@ def _configure_library_root_logger() -> None: def _reset_library_root_logger() -> None: - global _default_handler with _lock: diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index c093b945..3bae0e51 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -717,7 +717,6 @@ def sentence_pairs_generation_multilabel(sentences, labels, pairs): if len(np.where(labels.dot(labels[first_idx, :].T) == 0)[0]) == 0: continue else: - for _label in sample_labels: second_idx = np.random.choice(np.where(labels[:, _label] == 1)[0]) positive_sentence = sentences[second_idx] diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index dc5303ac..a34f5615 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -450,7 +450,6 @@ def push_to_hub( config: Optional[dict] = None, skip_lfs_files: bool = False, ): - return self.model.push_to_hub( repo_path_or_name, repo_url, diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 1d4ebcad..333a66ea 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -10,7 +10,6 @@ @dataclass class TrainingArguments: - # batch_size is only used to conveniently set `embedding_batch_size` and 
`classifier_batch_size` # which are used in practice batch_size: Union[int, Tuple[int, int]] = field(default=(16, 2), repr=False) From abbbb03098d24756c1e541296d52fdd7f630d0f5 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 6 Feb 2023 15:48:49 +0100 Subject: [PATCH 11/77] Remove "classifier" from var names in SetFitHead --- src/setfit/modeling.py | 19 +++++++++---------- src/setfit/trainer.py | 9 ++++++++- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index 3bae0e51..f12eb6f9 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -281,15 +281,14 @@ def fit( self, x_train: List[str], y_train: Union[List[int], List[List[int]]], - classifier_num_epochs: int, - classifier_batch_size: Optional[int] = None, - body_classifier_learning_rate: Optional[float] = None, + num_epochs: int, + batch_size: Optional[int] = None, + body_learning_rate: Optional[float] = None, head_learning_rate: Optional[float] = None, l2_weight: Optional[float] = None, max_length: Optional[int] = None, show_progress_bar: bool = True, end_to_end: bool = False, - **kwargs, ) -> None: if self.has_differentiable_head: # train with pyTorch device = self.model_body.device @@ -298,11 +297,11 @@ def fit( if not end_to_end: self.freeze("body") - dataloader = self._prepare_dataloader(x_train, y_train, classifier_batch_size, max_length) + dataloader = self._prepare_dataloader(x_train, y_train, batch_size, max_length) criterion = self.model_head.get_loss_fn() - optimizer = self._prepare_optimizer(head_learning_rate, body_classifier_learning_rate, l2_weight) + optimizer = self._prepare_optimizer(head_learning_rate, body_learning_rate, l2_weight) scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5) - for epoch_idx in trange(classifier_num_epochs, desc="Epoch", disable=not show_progress_bar): + for epoch_idx in trange(num_epochs, desc="Epoch", disable=not show_progress_bar): for batch in tqdm(dataloader, 
desc="Iteration", disable=not show_progress_bar, leave=False): features, labels = batch optimizer.zero_grad() @@ -372,16 +371,16 @@ def _prepare_dataloader( def _prepare_optimizer( self, head_learning_rate: float, - body_classifier_learning_rate: Optional[float], + body_learning_rate: Optional[float], l2_weight: float, ) -> torch.optim.Optimizer: - body_classifier_learning_rate = body_classifier_learning_rate or head_learning_rate + body_learning_rate = body_learning_rate or head_learning_rate l2_weight = l2_weight or self.l2_weight optimizer = torch.optim.AdamW( [ { "params": self.model_body.parameters(), - "lr": body_classifier_learning_rate, + "lr": body_learning_rate, "weight_decay": l2_weight, }, {"params": self.model_head.parameters(), "lr": head_learning_rate, "weight_decay": l2_weight}, diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index a34f5615..e034a776 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -324,7 +324,14 @@ def train_classifier(self, x_train: List[str], y_train: List[int], args: Optiona self.model.fit( x_train, y_train, - **args.to_dict(), + num_epochs=args.classifier_num_epochs, + batch_size=args.classifier_batch_size, + body_learning_rate=args.body_classifier_learning_rate, + head_learning_rate=args.head_learning_rate, + l2_weight=args.l2_weight, + max_length=args.max_length, + show_progress_bar=args.show_progress_bar, + end_to_end=args.end_to_end, ) def evaluate(self): From 12d326e01a2f965c138da0f94b2a569743bea0ea Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 6 Feb 2023 16:01:07 +0100 Subject: [PATCH 12/77] Update DeprecationWarnings to include timeline Also add DeprecationWarning for DistillationSetFitTrainer --- src/setfit/__init__.py | 3 ++- src/setfit/modeling.py | 3 ++- src/setfit/trainer.py | 5 ++++- src/setfit/trainer_distillation.py | 7 +++++++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/setfit/__init__.py b/src/setfit/__init__.py index ce9f19fc..03b96bd9 100644 --- 
a/src/setfit/__init__.py +++ b/src/setfit/__init__.py @@ -8,5 +8,6 @@ from .trainer_distillation import DistillationSetFitTrainer, DistillationTrainer -# Ensure that DeprecationWarnings are always shown +# Ensure that DeprecationWarnings are shown by default, as recommended by +# https://docs.python.org/3/library/warnings.html#overriding-the-default-filter warnings.filterwarnings("default", category=DeprecationWarning) diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index f12eb6f9..44c6d367 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -401,7 +401,8 @@ def unfreeze( ) -> None: if keep_body_frozen is not None: warnings.warn( - '`keep_body_frozen` is deprecated. Please either pass "head", "body" or no arguments to unfreeze both.', + "`keep_body_frozen` is deprecated and will be removed in v2.0.0 of SetFit. " + 'Please either pass "head", "body" or no arguments to unfreeze both.', DeprecationWarning, stacklevel=2, ) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index e034a776..bd007594 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -494,7 +494,10 @@ def __init__( samples_per_label: int = 2, ): warnings.warn( - "`SetFitTrainer` has been deprecated. Please use `Trainer` instead.", DeprecationWarning, stacklevel=2 + "`SetFitTrainer` has been deprecated and will be removed in v2.0.0 of SetFit. 
" + " Please use `Trainer` instead.", + DeprecationWarning, + stacklevel=2, ) args = TrainingArguments( num_iterations=num_iterations, diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index dedf616f..42864e58 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -1,4 +1,5 @@ import math +import warnings from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union import numpy as np @@ -180,6 +181,12 @@ def __init__( use_amp: bool = False, warmup_proportion: float = 0.1, ): + warnings.warn( + "`DistillationSetFitTrainer` has been deprecated and will be removed in v2.0.0 of SetFit. " + "Please use `DistillationTrainer` instead.", + DeprecationWarning, + stacklevel=2, + ) args = TrainingArguments( num_iterations=num_iterations, num_epochs=num_epochs, From fc246cc0e4b43a7881e42a0b9641d78a1a42b0a4 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 6 Feb 2023 16:21:43 +0100 Subject: [PATCH 13/77] Convert training_argument imports to relative imports --- src/setfit/trainer.py | 3 +-- src/setfit/trainer_distillation.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index bd007594..7a31b5ef 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -17,11 +17,10 @@ from torch.utils.data import DataLoader from transformers.trainer_utils import HPSearchBackend, default_compute_objective, number_of_arguments, set_seed -from setfit.training_args import TrainingArguments - from . 
import logging from .integrations import default_hp_search_backend, is_optuna_available, run_hp_search_optuna from .modeling import SupConLoss, sentence_pairs_generation, sentence_pairs_generation_multilabel +from .training_args import TrainingArguments from .utils import BestRun, default_hp_space_optuna diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index 42864e58..c54f3957 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -8,11 +8,10 @@ from sentence_transformers.datasets import SentenceLabelDataset from torch.utils.data import DataLoader -from setfit.training_args import TrainingArguments - from . import logging from .modeling import SupConLoss, sentence_pairs_generation_cos_sim from .trainer import Trainer +from .training_args import TrainingArguments if TYPE_CHECKING: From 57aa54f5191661756bf56507b0ac6f77b847f81b Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 6 Feb 2023 16:24:56 +0100 Subject: [PATCH 14/77] Make conditional explicit Co-authored-by: lewtun --- src/setfit/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 7a31b5ef..599e4c4c 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -165,7 +165,7 @@ def apply_hyperparameters(self, params: Dict[str, Any], final_model: bool = Fals final_model (`bool`, *optional*, defaults to `False`): If `True`, replace the `model_init()` function with a fixed model based on the parameters. 
""" - if self.args: + if self.args is not None: self.args = self.args.update(params, ignore_extra=True) else: self.args = TrainingArguments.from_dict(params, ignore_extra=True) From 7ebdf9302fab7a30deae9c934a1e7242b9844c2a Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 6 Feb 2023 16:25:13 +0100 Subject: [PATCH 15/77] Make conditional explicit Co-authored-by: lewtun --- src/setfit/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 599e4c4c..44506edd 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -219,7 +219,7 @@ def train( trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, **kwargs, ): - if kwargs: + if kwargs is not None: warnings.warn( f"`{self.__class__.__name__}.train` does not accept keyword arguments anymore. " f"Please provide training arguments via a `TrainingArguments` instance to the `{self.__class__.__name__}` " From 46952933c431cdb1642f952af902e7521d725fc1 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 6 Feb 2023 16:32:22 +0100 Subject: [PATCH 16/77] Use assertEqual rather than assert --- tests/test_training_args.py | 86 ++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/tests/test_training_args.py b/tests/test_training_args.py index 5ad6a850..24250c54 100644 --- a/tests/test_training_args.py +++ b/tests/test_training_args.py @@ -21,24 +21,24 @@ def test_training_args_batch_sizes(self): batch_size_C = 6 args = TrainingArguments(batch_size=batch_size_A) - assert args.batch_size == (batch_size_A, batch_size_A) - assert args.embedding_batch_size == batch_size_A - assert args.classifier_batch_size == batch_size_A + self.assertEqual(args.batch_size, (batch_size_A, batch_size_A)) + self.assertEqual(args.embedding_batch_size, batch_size_A) + self.assertEqual(args.classifier_batch_size, batch_size_A) args = 
TrainingArguments(batch_size=(batch_size_A, batch_size_B)) - assert args.batch_size == (batch_size_A, batch_size_B) - assert args.embedding_batch_size == batch_size_A - assert args.classifier_batch_size == batch_size_B + self.assertEqual(args.batch_size, (batch_size_A, batch_size_B)) + self.assertEqual(args.embedding_batch_size, batch_size_A) + self.assertEqual(args.classifier_batch_size, batch_size_B) args = TrainingArguments(batch_size=(batch_size_A, batch_size_B), embedding_batch_size=batch_size_C) - assert args.batch_size == (batch_size_A, batch_size_B) - assert args.embedding_batch_size == batch_size_C - assert args.classifier_batch_size == batch_size_B + self.assertEqual(args.batch_size, (batch_size_A, batch_size_B)) + self.assertEqual(args.embedding_batch_size, batch_size_C) + self.assertEqual(args.classifier_batch_size, batch_size_B) args = TrainingArguments(batch_size=batch_size_A, embedding_batch_size=batch_size_C) - assert args.batch_size == (batch_size_A, batch_size_A) - assert args.embedding_batch_size == batch_size_C - assert args.classifier_batch_size == batch_size_A + self.assertEqual(args.batch_size, (batch_size_A, batch_size_A)) + self.assertEqual(args.embedding_batch_size, batch_size_C) + self.assertEqual(args.classifier_batch_size, batch_size_A) def test_training_args_num_epochs(self): num_epochs_A = 12 @@ -46,24 +46,24 @@ def test_training_args_num_epochs(self): num_epochs_C = 6 args = TrainingArguments(num_epochs=num_epochs_A) - assert args.num_epochs == (num_epochs_A, num_epochs_A) - assert args.embedding_num_epochs == num_epochs_A - assert args.classifier_num_epochs == num_epochs_A + self.assertEqual(args.num_epochs, (num_epochs_A, num_epochs_A)) + self.assertEqual(args.embedding_num_epochs, num_epochs_A) + self.assertEqual(args.classifier_num_epochs, num_epochs_A) args = TrainingArguments(num_epochs=(num_epochs_A, num_epochs_B)) - assert args.num_epochs == (num_epochs_A, num_epochs_B) - assert args.embedding_num_epochs == num_epochs_A - 
assert args.classifier_num_epochs == num_epochs_B + self.assertEqual(args.num_epochs, (num_epochs_A, num_epochs_B)) + self.assertEqual(args.embedding_num_epochs, num_epochs_A) + self.assertEqual(args.classifier_num_epochs, num_epochs_B) args = TrainingArguments(num_epochs=(num_epochs_A, num_epochs_B), embedding_num_epochs=num_epochs_C) - assert args.num_epochs == (num_epochs_A, num_epochs_B) - assert args.embedding_num_epochs == num_epochs_C - assert args.classifier_num_epochs == num_epochs_B + self.assertEqual(args.num_epochs, (num_epochs_A, num_epochs_B)) + self.assertEqual(args.embedding_num_epochs, num_epochs_C) + self.assertEqual(args.classifier_num_epochs, num_epochs_B) args = TrainingArguments(num_epochs=num_epochs_A, embedding_num_epochs=num_epochs_C) - assert args.num_epochs == (num_epochs_A, num_epochs_A) - assert args.embedding_num_epochs == num_epochs_C - assert args.classifier_num_epochs == num_epochs_A + self.assertEqual(args.num_epochs, (num_epochs_A, num_epochs_A)) + self.assertEqual(args.embedding_num_epochs, num_epochs_C) + self.assertEqual(args.classifier_num_epochs, num_epochs_A) def test_training_args_learning_rates(self): learning_rate_A = 1e-2 @@ -73,24 +73,24 @@ def test_training_args_learning_rates(self): base = TrainingArguments() args = TrainingArguments(body_learning_rate=learning_rate_A) - assert args.body_learning_rate == (learning_rate_A, learning_rate_A) - assert args.body_embedding_learning_rate == learning_rate_A - assert args.body_classifier_learning_rate == learning_rate_A - assert args.head_learning_rate == base.head_learning_rate + self.assertEqual(args.body_learning_rate, (learning_rate_A, learning_rate_A)) + self.assertEqual(args.body_embedding_learning_rate, learning_rate_A) + self.assertEqual(args.body_classifier_learning_rate, learning_rate_A) + self.assertEqual(args.head_learning_rate, base.head_learning_rate) args = TrainingArguments(body_learning_rate=(learning_rate_A, learning_rate_B)) - assert args.body_learning_rate 
== (learning_rate_A, learning_rate_B) - assert args.body_embedding_learning_rate == learning_rate_A - assert args.body_classifier_learning_rate == learning_rate_B - assert args.head_learning_rate == base.head_learning_rate + self.assertEqual(args.body_learning_rate, (learning_rate_A, learning_rate_B)) + self.assertEqual(args.body_embedding_learning_rate, learning_rate_A) + self.assertEqual(args.body_classifier_learning_rate, learning_rate_B) + self.assertEqual(args.head_learning_rate, base.head_learning_rate) args = TrainingArguments( body_learning_rate=(learning_rate_A, learning_rate_B), head_learning_rate=learning_rate_C ) - assert args.body_learning_rate == (learning_rate_A, learning_rate_B) - assert args.body_embedding_learning_rate == learning_rate_A - assert args.body_classifier_learning_rate == learning_rate_B - assert args.head_learning_rate == learning_rate_C + self.assertEqual(args.body_learning_rate, (learning_rate_A, learning_rate_B)) + self.assertEqual(args.body_embedding_learning_rate, learning_rate_A) + self.assertEqual(args.body_classifier_learning_rate, learning_rate_B) + self.assertEqual(args.head_learning_rate, learning_rate_C) args = TrainingArguments( body_learning_rate=learning_rate_A, @@ -98,16 +98,16 @@ def test_training_args_learning_rates(self): head_learning_rate=learning_rate_C, ) # Perhaps not ideal, but body_learning_rate is never used directly: - assert args.body_learning_rate == (learning_rate_A, learning_rate_A) - assert args.body_embedding_learning_rate == learning_rate_B - assert args.body_classifier_learning_rate == learning_rate_A - assert args.head_learning_rate == learning_rate_C + self.assertEqual(args.body_learning_rate, (learning_rate_A, learning_rate_A)) + self.assertEqual(args.body_embedding_learning_rate, learning_rate_B) + self.assertEqual(args.body_classifier_learning_rate, learning_rate_A) + self.assertEqual(args.head_learning_rate, learning_rate_C) args = TrainingArguments( 
body_classifier_learning_rate=learning_rate_A, body_embedding_learning_rate=learning_rate_B, head_learning_rate=learning_rate_C, ) - assert args.body_embedding_learning_rate == learning_rate_B - assert args.body_classifier_learning_rate == learning_rate_A - assert args.head_learning_rate == learning_rate_C + self.assertEqual(args.body_embedding_learning_rate, learning_rate_B) + self.assertEqual(args.body_classifier_learning_rate, learning_rate_A) + self.assertEqual(args.head_learning_rate, learning_rate_C) From 4c6d0fdd9338524110df5a0b24f2e137517d51f0 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 6 Feb 2023 16:33:25 +0100 Subject: [PATCH 17/77] Remove training_arguments from test func names --- tests/test_training_args.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_training_args.py b/tests/test_training_args.py index 24250c54..a573e10e 100644 --- a/tests/test_training_args.py +++ b/tests/test_training_args.py @@ -6,7 +6,7 @@ class TestTrainingArguments(TestCase): - def test_training_args_raises_error_with_wrong_warmup_proportion(self): + def test_raises_error_with_wrong_warmup_proportion(self): # warmup_proportion must not be > 1.0 with pytest.raises(ValueError): TrainingArguments(warmup_proportion=1.1) @@ -15,7 +15,7 @@ def test_training_args_raises_error_with_wrong_warmup_proportion(self): with pytest.raises(ValueError): TrainingArguments(warmup_proportion=-0.1) - def test_training_args_batch_sizes(self): + def test_batch_sizes(self): batch_size_A = 12 batch_size_B = 4 batch_size_C = 6 @@ -40,7 +40,7 @@ def test_training_args_batch_sizes(self): self.assertEqual(args.embedding_batch_size, batch_size_C) self.assertEqual(args.classifier_batch_size, batch_size_A) - def test_training_args_num_epochs(self): + def test_num_epochs(self): num_epochs_A = 12 num_epochs_B = 4 num_epochs_C = 6 @@ -65,7 +65,7 @@ def test_training_args_num_epochs(self): self.assertEqual(args.embedding_num_epochs, num_epochs_C) 
self.assertEqual(args.classifier_num_epochs, num_epochs_A) - def test_training_args_learning_rates(self): + def test_learning_rates(self): learning_rate_A = 1e-2 learning_rate_B = 1e-3 learning_rate_C = 1e-4 From 5937ec25b010171122ce074f5a9cb031f36de7b9 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 6 Feb 2023 17:47:12 +0100 Subject: [PATCH 18/77] Replace loss_class on Trainer with loss on TrainArgs --- src/setfit/trainer.py | 21 ++++++++------------- src/setfit/trainer_distillation.py | 18 ++++++++---------- src/setfit/training_args.py | 12 +++++++----- tests/test_trainer.py | 3 +-- tests/test_trainer_distillation.py | 3 --- 5 files changed, 24 insertions(+), 33 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 44506edd..f31b2aba 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -92,7 +92,6 @@ def __init__( eval_dataset: Optional["Dataset"] = None, model_init: Optional[Callable[[], "SetFitModel"]] = None, metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy", - loss_class=losses.CosineSimilarityLoss, column_mapping: Optional[Dict[str, str]] = None, ): self.args = args @@ -100,7 +99,6 @@ def __init__( self.eval_dataset = eval_dataset self.model_init = model_init self.metric = metric - self.loss_class = loss_class self.column_mapping = column_mapping if model is None: @@ -248,9 +246,6 @@ def train( x_train: List[str] = train_dataset["text"] y_train: List[int] = train_dataset["label"] - if self.loss_class is None: - logger.warning("No `loss_class` detected! 
Using `CosineSimilarityLoss` as the default.") - self.loss_class = losses.CosineSimilarityLoss self.train_embeddings(x_train, y_train, args) self.train_classifier(x_train, y_train, args) @@ -259,7 +254,7 @@ def train_embeddings(self, x_train: List[str], y_train: List[int], args: Optiona args = args or self.args or TrainingArguments() # sentence-transformers adaptation - if self.loss_class in [ + if args.loss in [ losses.BatchAllTripletLoss, losses.BatchHardTripletLoss, losses.BatchSemiHardTripletLoss, @@ -272,15 +267,15 @@ def train_embeddings(self, x_train: List[str], y_train: List[int], args: Optiona batch_size = min(args.embedding_batch_size, len(train_data_sampler)) train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=True) - if self.loss_class is losses.BatchHardSoftMarginTripletLoss: - train_loss = self.loss_class( + if args.loss is losses.BatchHardSoftMarginTripletLoss: + train_loss = args.loss( model=self.model.model_body, distance_metric=args.distance_metric, ) - elif self.loss_class is SupConLoss: - train_loss = self.loss_class(model=self.model.model_body) + elif args.loss is SupConLoss: + train_loss = args.loss(model=self.model.model_body) else: - train_loss = self.loss_class( + train_loss = args.loss( model=self.model.model_body, distance_metric=args.distance_metric, margin=args.margin, @@ -298,7 +293,7 @@ def train_embeddings(self, x_train: List[str], y_train: List[int], args: Optiona batch_size = args.embedding_batch_size train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size) - train_loss = self.loss_class(self.model.model_body) + train_loss = args.loss(self.model.model_body) total_train_steps = len(train_dataloader) * args.embedding_num_epochs logger.info("***** Running training *****") @@ -510,6 +505,7 @@ def __init__( distance_metric=distance_metric, margin=margin, samples_per_label=samples_per_label, + loss=loss_class, ) super().__init__( model=model, @@ -518,6 +514,5 @@ def __init__( 
eval_dataset=eval_dataset, model_init=model_init, metric=metric, - loss_class=loss_class, column_mapping=column_mapping, ) diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index c54f3957..0a2cbae9 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -72,7 +72,6 @@ def __init__( eval_dataset: Optional["Dataset"] = None, model_init: Optional[Callable[[], "SetFitModel"]] = None, metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy", - loss_class: torch.nn.Module = losses.CosineSimilarityLoss, column_mapping: Optional[Dict[str, str]] = None, ) -> None: super().__init__( @@ -82,7 +81,6 @@ def __init__( eval_dataset=eval_dataset, model_init=model_init, metric=metric, - loss_class=loss_class, column_mapping=column_mapping, ) @@ -98,7 +96,7 @@ def train_embeddings( args = args or self.args or TrainingArguments() # sentence-transformers adaptation - if self.loss_class in [ + if args.loss in [ losses.BatchAllTripletLoss, losses.BatchHardTripletLoss, losses.BatchSemiHardTripletLoss, @@ -111,15 +109,15 @@ def train_embeddings( batch_size = min(args.embedding_batch_size, len(train_data_sampler)) train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=True) - if self.loss_class is losses.BatchHardSoftMarginTripletLoss: - train_loss = self.loss_class( + if args.loss is losses.BatchHardSoftMarginTripletLoss: + train_loss = args.loss( model=self.student_model.model_body, distance_metric=args.distance_metric, ) - elif self.loss_class is SupConLoss: - train_loss = self.loss_class(model=self.student_model) + elif args.loss is SupConLoss: + train_loss = args.loss(model=self.student_model) else: - train_loss = self.loss_class( + train_loss = args.loss( model=self.student_model.model_body, distance_metric=args.distance_metric, margin=args.margin, @@ -141,7 +139,7 @@ def train_embeddings( batch_size = args.embedding_batch_size train_dataloader = 
DataLoader(train_examples, shuffle=True, batch_size=batch_size) - train_loss = self.loss_class(self.student_model.model_body) + train_loss = args.loss(self.student_model.model_body) total_train_steps = len(train_dataloader) * args.embedding_num_epochs logger.info("***** Running training *****") @@ -195,6 +193,7 @@ def __init__( seed=seed, use_amp=use_amp, warmup_proportion=warmup_proportion, + loss=loss_class, ) super().__init__( teacher_model=teacher_model, @@ -204,6 +203,5 @@ def __init__( eval_dataset=eval_dataset, model_init=model_init, metric=metric, - loss_class=loss_class, column_mapping=column_mapping, ) diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 333a66ea..8e357f56 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -3,9 +3,9 @@ import inspect from copy import copy from dataclasses import dataclass, field, fields -from typing import Any, Callable, Dict, Tuple, Union +from typing import Any, Callable, Dict, Optional, Tuple, Union -from sentence_transformers.losses.BatchHardTripletLoss import BatchHardTripletLossDistanceFunction +from sentence_transformers import losses @dataclass @@ -34,16 +34,18 @@ class TrainingArguments: seed: int = 42 use_amp: bool = False warmup_proportion: float = 0.1 - distance_metric: Callable = BatchHardTripletLossDistanceFunction.cosine_distance + distance_metric: Callable = losses.BatchHardTripletLossDistanceFunction.cosine_distance margin: float = 0.25 samples_per_label: int = 2 show_progress_bar: bool = True - l2_weight: float = None - max_length: int = None + l2_weight: Optional[float] = None + max_length: Optional[int] = None end_to_end: bool = False + loss: Callable = losses.CosineSimilarityLoss + def __post_init__(self): # Set `self.embedding_batch_size` and `self.classifier_batch_size` using values from `self.batch_size` if isinstance(self.batch_size, int): diff --git a/tests/test_trainer.py b/tests/test_trainer.py index b40ec5f1..b274d862 100644 --- 
a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -364,13 +364,12 @@ def hp_name(trial): def test_trainer_works_with_non_default_loss_class(loss_class): dataset = Dataset.from_dict({"text": ["a 1", "b 1", "c 1", "a 2", "b 2", "c 2"], "label": [0, 1, 2, 0, 1, 2]}) model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") - args = TrainingArguments(num_iterations=1) + args = TrainingArguments(num_iterations=1, loss=loss_class) trainer = Trainer( model=model, args=args, train_dataset=dataset, eval_dataset=dataset, - loss_class=loss_class, ) trainer.train() # no asserts here because this is a regression test - we only test if an exception is raised diff --git a/tests/test_trainer_distillation.py b/tests/test_trainer_distillation.py index df2bad4c..2216b1ad 100644 --- a/tests/test_trainer_distillation.py +++ b/tests/test_trainer_distillation.py @@ -2,7 +2,6 @@ import pytest from datasets import Dataset -from sentence_transformers.losses import CosineSimilarityLoss from setfit import DistillationTrainer, Trainer from setfit.modeling import SetFitModel @@ -22,7 +21,6 @@ def test_trainer_works_with_default_columns(self): model=self.teacher_model, train_dataset=dataset, eval_dataset=dataset, - loss_class=CosineSimilarityLoss, metric="accuracy", ) # Teacher Train and evaluate @@ -35,7 +33,6 @@ def test_trainer_works_with_default_columns(self): train_dataset=dataset, student_model=self.student_model, eval_dataset=dataset, - loss_class=CosineSimilarityLoss, metric="accuracy", ) From f1e3de974d0bbee814d3ae6e597976187b3a4baf Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 6 Feb 2023 17:48:02 +0100 Subject: [PATCH 19/77] Removed dead class argument --- src/setfit/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index f31b2aba..8d71b1ae 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -112,7 +112,6 @@ def __init__( self.model = model self.hp_search_backend = None - 
self._freeze = True # If True, will train the body only; otherwise, train the body and head def _validate_column_mapping(self, dataset: "Dataset") -> None: """ From 6051095ddb7127657d297237322c114bdc69c474 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 6 Feb 2023 18:02:23 +0100 Subject: [PATCH 20/77] Move SupConLoss to losses.py Also remove unused SetFitBaseModel and SKLearnWrapper --- src/setfit/losses.py | 100 ++++++++++++++++++++++ src/setfit/modeling.py | 133 ----------------------------- src/setfit/trainer.py | 3 +- src/setfit/trainer_distillation.py | 3 +- src/setfit/utils.py | 2 +- tests/test_deprecated_trainer.py | 3 +- tests/test_trainer.py | 3 +- 7 files changed, 109 insertions(+), 138 deletions(-) create mode 100644 src/setfit/losses.py diff --git a/src/setfit/losses.py b/src/setfit/losses.py new file mode 100644 index 00000000..369c8451 --- /dev/null +++ b/src/setfit/losses.py @@ -0,0 +1,100 @@ +import torch +from torch import nn + + +class SupConLoss(nn.Module): + """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf. + + It also supports the unsupervised contrastive loss in SimCLR. + """ + + def __init__(self, model, temperature=0.07, contrast_mode="all", base_temperature=0.07): + super(SupConLoss, self).__init__() + self.model = model + self.temperature = temperature + self.contrast_mode = contrast_mode + self.base_temperature = base_temperature + + def forward(self, sentence_features, labels=None, mask=None): + """Computes loss for model. + + If both `labels` and `mask` are None, it degenerates to SimCLR unsupervised loss: + https://arxiv.org/pdf/2002.05709.pdf + + Args: + features: hidden vector of shape [bsz, n_views, ...]. + labels: ground truth of shape [bsz]. + mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j + has the same class as sample i. Can be asymmetric. + + Returns: + A loss scalar. 
+ """ + features = self.model(sentence_features[0])["sentence_embedding"] + + # Normalize embeddings + features = torch.nn.functional.normalize(features, p=2, dim=1) + + # Add n_views dimension + features = torch.unsqueeze(features, 1) + + device = features.device + + if len(features.shape) < 3: + raise ValueError("`features` needs to be [bsz, n_views, ...]," "at least 3 dimensions are required") + if len(features.shape) > 3: + features = features.view(features.shape[0], features.shape[1], -1) + + batch_size = features.shape[0] + if labels is not None and mask is not None: + raise ValueError("Cannot define both `labels` and `mask`") + elif labels is None and mask is None: + mask = torch.eye(batch_size, dtype=torch.float32).to(device) + elif labels is not None: + labels = labels.contiguous().view(-1, 1) + if labels.shape[0] != batch_size: + raise ValueError("Num of labels does not match num of features") + mask = torch.eq(labels, labels.T).float().to(device) + else: + mask = mask.float().to(device) + + contrast_count = features.shape[1] + contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0) + if self.contrast_mode == "one": + anchor_feature = features[:, 0] + anchor_count = 1 + elif self.contrast_mode == "all": + anchor_feature = contrast_feature + anchor_count = contrast_count + else: + raise ValueError("Unknown mode: {}".format(self.contrast_mode)) + + # Compute logits + anchor_dot_contrast = torch.div(torch.matmul(anchor_feature, contrast_feature.T), self.temperature) + # For numerical stability + logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True) + logits = anchor_dot_contrast - logits_max.detach() + + # Tile mask + mask = mask.repeat(anchor_count, contrast_count) + # Mask-out self-contrast cases + logits_mask = torch.scatter( + torch.ones_like(mask), + 1, + torch.arange(batch_size * anchor_count).view(-1, 1).to(device), + 0, + ) + mask = mask * logits_mask + + # Compute log_prob + exp_logits = torch.exp(logits) * logits_mask + 
log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True)) + + # Compute mean of log-likelihood over positive + mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1) + + # Loss + loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos + loss = loss.view(anchor_count, batch_size).mean() + + return loss diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index 44c6d367..9fee15d5 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -90,15 +90,6 @@ """ -class SetFitBaseModel: - def __init__(self, model, max_seq_length: int, add_normalization_layer: bool) -> None: - self.model = SentenceTransformer(model) - self.model.max_seq_length = max_seq_length - - if add_normalization_layer: - self.model._modules["2"] = models.Normalize() - - class SetFitHead(models.Dense): """ A SetFit head that supports multi-class classification for end-to-end training. @@ -586,104 +577,6 @@ def _from_pretrained( ) -class SupConLoss(nn.Module): - """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf. - - It also supports the unsupervised contrastive loss in SimCLR. - """ - - def __init__(self, model, temperature=0.07, contrast_mode="all", base_temperature=0.07): - super(SupConLoss, self).__init__() - self.model = model - self.temperature = temperature - self.contrast_mode = contrast_mode - self.base_temperature = base_temperature - - def forward(self, sentence_features, labels=None, mask=None): - """Computes loss for model. - - If both `labels` and `mask` are None, it degenerates to SimCLR unsupervised loss: - https://arxiv.org/pdf/2002.05709.pdf - - Args: - features: hidden vector of shape [bsz, n_views, ...]. - labels: ground truth of shape [bsz]. - mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j - has the same class as sample i. Can be asymmetric. - - Returns: - A loss scalar. 
- """ - features = self.model(sentence_features[0])["sentence_embedding"] - - # Normalize embeddings - features = torch.nn.functional.normalize(features, p=2, dim=1) - - # Add n_views dimension - features = torch.unsqueeze(features, 1) - - device = features.device - - if len(features.shape) < 3: - raise ValueError("`features` needs to be [bsz, n_views, ...]," "at least 3 dimensions are required") - if len(features.shape) > 3: - features = features.view(features.shape[0], features.shape[1], -1) - - batch_size = features.shape[0] - if labels is not None and mask is not None: - raise ValueError("Cannot define both `labels` and `mask`") - elif labels is None and mask is None: - mask = torch.eye(batch_size, dtype=torch.float32).to(device) - elif labels is not None: - labels = labels.contiguous().view(-1, 1) - if labels.shape[0] != batch_size: - raise ValueError("Num of labels does not match num of features") - mask = torch.eq(labels, labels.T).float().to(device) - else: - mask = mask.float().to(device) - - contrast_count = features.shape[1] - contrast_feature = torch.cat(torch.unbind(features, dim=1), dim=0) - if self.contrast_mode == "one": - anchor_feature = features[:, 0] - anchor_count = 1 - elif self.contrast_mode == "all": - anchor_feature = contrast_feature - anchor_count = contrast_count - else: - raise ValueError("Unknown mode: {}".format(self.contrast_mode)) - - # Compute logits - anchor_dot_contrast = torch.div(torch.matmul(anchor_feature, contrast_feature.T), self.temperature) - # For numerical stability - logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True) - logits = anchor_dot_contrast - logits_max.detach() - - # Tile mask - mask = mask.repeat(anchor_count, contrast_count) - # Mask-out self-contrast cases - logits_mask = torch.scatter( - torch.ones_like(mask), - 1, - torch.arange(batch_size * anchor_count).view(-1, 1).to(device), - 0, - ) - mask = mask * logits_mask - - # Compute log_prob - exp_logits = torch.exp(logits) * logits_mask - 
log_prob = logits - torch.log(exp_logits.sum(1, keepdim=True)) - - # Compute mean of log-likelihood over positive - mean_log_prob_pos = (mask * log_prob).sum(1) / mask.sum(1) - - # Loss - loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos - loss = loss.view(anchor_count, batch_size).mean() - - return loss - - def sentence_pairs_generation(sentences, labels, pairs): # Initialize two empty lists to hold the (sentence, sentence) pairs and # labels to indicate if a pair is positive or negative @@ -754,29 +647,3 @@ def sentence_pairs_generation_cos_sim(sentences, pairs, cos_sim_matrix): pairs.append(InputExample(texts=[current_sentence, paired_sentence], label=cos_sim)) return pairs - - -class SKLearnWrapper: - def __init__(self, st_model=None, clf=None): - self.st_model = st_model - self.clf = clf - - def fit(self, x_train, y_train): - embeddings = self.st_model.encode(x_train) - self.clf.fit(embeddings, y_train) - - def predict(self, x_test): - embeddings = self.st_model.encode(x_test) - return self.clf.predict(embeddings) - - def predict_proba(self, x_test): - embeddings = self.st_model.encode(x_test) - return self.clf.predict_proba(embeddings) - - def save(self, path): - self.st_model.save(path=path) - joblib.dump(self.clf, f"{path}/setfit_head.pkl") - - def load(self, path): - self.st_model = SentenceTransformer(model_name_or_path=path) - self.clf = joblib.load(f"{path}/setfit_head.pkl") diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 8d71b1ae..57cd88b4 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -19,7 +19,8 @@ from . 
import logging from .integrations import default_hp_search_backend, is_optuna_available, run_hp_search_optuna -from .modeling import SupConLoss, sentence_pairs_generation, sentence_pairs_generation_multilabel +from .losses import SupConLoss +from .modeling import sentence_pairs_generation, sentence_pairs_generation_multilabel from .training_args import TrainingArguments from .utils import BestRun, default_hp_space_optuna diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index 0a2cbae9..a299741a 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -9,7 +9,8 @@ from torch.utils.data import DataLoader from . import logging -from .modeling import SupConLoss, sentence_pairs_generation_cos_sim +from .losses import SupConLoss +from .modeling import sentence_pairs_generation_cos_sim from .trainer import Trainer from .training_args import TrainingArguments diff --git a/src/setfit/utils.py b/src/setfit/utils.py index 409edb05..4620ca61 100644 --- a/src/setfit/utils.py +++ b/src/setfit/utils.py @@ -7,7 +7,7 @@ from sentence_transformers import losses from .data import create_fewshot_splits, create_fewshot_splits_multilabel -from .modeling import SupConLoss +from .losses import SupConLoss SEC_TO_NS_SCALE = 1000000000 diff --git a/tests/test_deprecated_trainer.py b/tests/test_deprecated_trainer.py index d4708f93..2a19163f 100644 --- a/tests/test_deprecated_trainer.py +++ b/tests/test_deprecated_trainer.py @@ -8,7 +8,8 @@ from transformers.utils.hp_naming import TrialShortNamer from setfit import logging -from setfit.modeling import SetFitModel, SupConLoss +from setfit.losses import SupConLoss +from setfit.modeling import SetFitModel from setfit.trainer import SetFitTrainer from setfit.utils import BestRun diff --git a/tests/test_trainer.py b/tests/test_trainer.py index b274d862..ed66286d 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -8,7 +8,8 @@ from transformers.utils.hp_naming import 
TrialShortNamer from setfit import logging -from setfit.modeling import SetFitModel, SupConLoss +from setfit.losses import SupConLoss +from setfit.modeling import SetFitModel from setfit.trainer import Trainer from setfit.training_args import TrainingArguments from setfit.utils import BestRun From bddd46a776883996da6fc80d38a4b1272882acdf Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Tue, 7 Feb 2023 08:06:34 +0100 Subject: [PATCH 21/77] Add deprecation to Trainer.(un)freeze --- src/setfit/trainer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 57cd88b4..f12a47ab 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -204,11 +204,23 @@ def call_model_init(self, params: Optional[Dict[str, Any]] = None): return model def freeze(self, component: Optional[Literal["body", "head"]] = None) -> None: + warnings.warn( + f"`{self.__class__.__name__}.freeze` is deprecated and will be removed in v2.0.0 of SetFit. " + "Please use `SetFitModel.freeze` directly instead.", + DeprecationWarning, + stacklevel=2, + ) return self.model.freeze(component) def unfreeze( self, component: Optional[Literal["body", "head"]] = None, keep_body_frozen: Optional[bool] = None ) -> None: + warnings.warn( + f"`{self.__class__.__name__}.unfreeze` is deprecated and will be removed in v2.0.0 of SetFit. 
" + "Please use `SetFitModel.unfreeze` directly instead.", + DeprecationWarning, + stacklevel=2, + ) return self.model.unfreeze(component, keep_body_frozen=keep_body_frozen) def train( From fa8a077164842e6eb8cecec751c861f770c0f224 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Tue, 7 Feb 2023 08:18:35 +0100 Subject: [PATCH 22/77] Prevent warning from always triggering --- src/setfit/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index f12a47ab..e12da914 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -229,7 +229,7 @@ def train( trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, **kwargs, ): - if kwargs is not None: + if len(kwargs): warnings.warn( f"`{self.__class__.__name__}.train` does not accept keyword arguments anymore. " f"Please provide training arguments via a `TrainingArguments` instance to the `{self.__class__.__name__}` " From 85a3684d05f928c92ed0c102776e37cf4ae39a35 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Tue, 7 Feb 2023 08:18:47 +0100 Subject: [PATCH 23/77] Export TrainingArguments in __init__ --- src/setfit/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/setfit/__init__.py b/src/setfit/__init__.py index 03b96bd9..017d7ff8 100644 --- a/src/setfit/__init__.py +++ b/src/setfit/__init__.py @@ -6,7 +6,7 @@ from .modeling import SetFitHead, SetFitModel from .trainer import SetFitTrainer, Trainer from .trainer_distillation import DistillationSetFitTrainer, DistillationTrainer - +from .training_args import TrainingArguments # Ensure that DeprecationWarnings are shown by default, as recommended by # https://docs.python.org/3/library/warnings.html#overriding-the-default-filter From ca625a22598e275e7ec7d70f9b91417e2f4fccf3 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Tue, 7 Feb 2023 10:28:21 +0100 Subject: [PATCH 24/77] Update & add important missing docstrings --- src/setfit/__init__.py | 1 + 
src/setfit/modeling.py | 111 ++++++++++++++++++++++++++--- src/setfit/trainer.py | 105 +++++++++++++++++---------- src/setfit/trainer_distillation.py | 43 +++++------ src/setfit/training_args.py | 77 +++++++++++++++++--- 5 files changed, 256 insertions(+), 81 deletions(-) diff --git a/src/setfit/__init__.py b/src/setfit/__init__.py index 017d7ff8..cb9af3c4 100644 --- a/src/setfit/__init__.py +++ b/src/setfit/__init__.py @@ -8,6 +8,7 @@ from .trainer_distillation import DistillationSetFitTrainer, DistillationTrainer from .training_args import TrainingArguments + # Ensure that DeprecationWarnings are shown by default, as recommended by # https://docs.python.org/3/library/warnings.html#overriding-the-default-filter warnings.filterwarnings("default", category=DeprecationWarning) diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index 9fee15d5..43c26f9e 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -2,7 +2,7 @@ import warnings from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union # Google Colab runs on Python 3.7, so we need this to be compatible @@ -28,10 +28,6 @@ from .data import SetFitDataset -if TYPE_CHECKING: - from numpy import ndarray - - logging.set_verbosity_info() logger = logging.get_logger(__name__) @@ -276,11 +272,31 @@ def fit( batch_size: Optional[int] = None, body_learning_rate: Optional[float] = None, head_learning_rate: Optional[float] = None, + end_to_end: bool = False, l2_weight: Optional[float] = None, max_length: Optional[int] = None, show_progress_bar: bool = True, - end_to_end: bool = False, ) -> None: + """Train the classifier head, only used if a differentiable PyTorch head is used. + + Args: + x_train (`List[str]`): A list of training sentences. + y_train (`Union[List[int], List[List[int]]]`): A list of labels corresponding to the training sentences. 
+ num_epochs (`int`): The number of epochs to train for. + batch_size (`int`, *optional*): The batch size to use. + body_learning_rate (`float`, *optional*): The learning rate for the `SentenceTransformer` body + in the `AdamW` optimizer. Disregarded if `end_to_end=False`. + head_learning_rate (`float`, *optional*): The learning rate for the differentiable torch head + in the `AdamW` optimizer. + end_to_end (`bool`, defaults to `False`): If True, train the entire model end-to-end. + Otherwise, freeze the `SentenceTransformer` body and only train the head. + l2_weight (`float`, *optional*): The l2 weight for both the model body and head + in the `AdamW` optimizer. + max_length (`int`, *optional*): The maximum token length a tokenizer can generate. If not provided, + the maximum length for the `SentenceTransformer` body is used. + show_progress_bar (`bool`, defaults to `True`): Whether to display a progress bar for the training + epochs and iterations. + """ if self.has_differentiable_head: # train with pyTorch device = self.model_body.device self.model_body.train() @@ -381,6 +397,12 @@ def _prepare_optimizer( return optimizer def freeze(self, component: Optional[Literal["body", "head"]] = None) -> None: + """Freeze the model body and/or the head, preventing further training on that component until unfrozen. + + Args: + component (`Literal["body", "head"]`, *optional*): Either "body" or "head" to freeze that component. + If no component is provided, freeze both. Defaults to None. + """ if component is None or component == "body": self._freeze_or_not(self.model_body, to_freeze=True) @@ -390,6 +412,13 @@ def freeze(self, component: Optional[Literal["body", "head"]] = None) -> None: def unfreeze( self, component: Optional[Literal["body", "head"]] = None, keep_body_frozen: Optional[bool] = None ) -> None: + """Unfreeze the model body and/or the head, allowing further training on that component. 
+ + Args: + component (`Literal["body", "head"]`, *optional*): Either "body" or "head" to unfreeze that component. + If no component is provided, unfreeze both. Defaults to None. + keep_body_frozen (`bool`, *optional*): Deprecated argument, use `component` instead. + """ if keep_body_frozen is not None: warnings.warn( "`keep_body_frozen` is deprecated and will be removed in v2.0.0 of SetFit. " @@ -409,15 +438,40 @@ def unfreeze( self._freeze_or_not(self.model_head, to_freeze=False) def _freeze_or_not(self, model: nn.Module, to_freeze: bool) -> None: + """Set `requires_grad=not to_freeze` for all parameters in `model`""" for param in model.parameters(): param.requires_grad = not to_freeze - def encode(self, inputs: List[str]) -> Union[torch.Tensor, "ndarray"]: + def encode(self, inputs: List[str]) -> Union[torch.Tensor, np.ndarray]: + """Convert input sentences to embeddings using the `SentenceTransformer` body. + + Args: + inputs (`List[str]`): The input sentences to embed. + + Returns: + Union[torch.Tensor, np.ndarray]: A matrix with shape [INPUT_LENGTH, EMBEDDING_SIZE], as a + torch Tensor if this model has a differentiable Torch head, or otherwise as a numpy array. + """ return self.model_body.encode( inputs, normalize_embeddings=self.normalize_embeddings, convert_to_tensor=self.has_differentiable_head ) - def predict(self, inputs: List[str], as_numpy: bool = False) -> Union[torch.Tensor, "ndarray"]: + def predict(self, inputs: List[str], as_numpy: bool = False) -> Union[torch.Tensor, np.ndarray]: + """Predict the various classes. + + Args: + inputs (`List[str]`): The input sentences to predict classes for. + as_numpy (`bool`, defaults to `False`): Whether to output as numpy array instead. + + Example: + >>> model = SetFitModel.from_pretrained(...) 
+ >>> model.predict(["What a boring display", "Exhilarating through and through", "I'm wowed!"]) + tensor([0, 1, 1], dtype=torch.int32) + + Returns: + `Union[torch.Tensor, np.ndarray]`: A vector with equal length to the inputs, denoting + to which class each input is predicted to belong. + """ embeddings = self.encode(inputs) outputs = self.model_head.predict(embeddings) @@ -428,7 +482,24 @@ def predict(self, inputs: List[str], as_numpy: bool = False) -> Union[torch.Tens return outputs - def predict_proba(self, inputs: List[str], as_numpy: bool = False) -> Union[torch.Tensor, "ndarray"]: + def predict_proba(self, inputs: List[str], as_numpy: bool = False) -> Union[torch.Tensor, np.ndarray]: + """Predict the probabilities of the various classes. + + Args: + inputs (`List[str]`): The input sentences to predict class probabilities for. + as_numpy (`bool`, defaults to `False`): Whether to output as numpy array instead. + + Example: + >>> model = SetFitModel.from_pretrained(...) + >>> model.predict_proba(["What a boring display", "Exhilarating through and through", "I'm wowed!"]) + tensor([[0.9367, 0.0633], + [0.0627, 0.9373], + [0.0890, 0.9110]], dtype=torch.float64) + + Returns: + `Union[torch.Tensor, np.ndarray]`: A matrix with shape [INPUT_LENGTH, NUM_CLASSES] denoting + probabilities of predicting an input as a class. + """ embeddings = self.encode(inputs) outputs = self.model_head.predict_proba(embeddings) @@ -445,6 +516,12 @@ def to(self, device: Union[str, torch.device]) -> "SetFitModel": Args: device (Union[str, torch.device]): The identifier of the device to move the model to. + Example: + + >>> model = SetFitModel.from_pretrained(...) + >>> model.to("cpu") + >>> model(["cats are cute", "dogs are loyal"]) + Returns: SetFitModel: Returns the original model, but now on the desired device. 
""" @@ -472,7 +549,21 @@ def create_model_card(self, path: str, model_name: Optional[str] = "SetFit Model with open(os.path.join(path, "README.md"), "w", encoding="utf-8") as f: f.write(model_card_content) - def __call__(self, inputs): + def __call__(self, inputs: List[str]) -> torch.Tensor: + """Predict the various classes. + + Args: + inputs (`List[str]`): The input sentences to predict classes for. + + Example: + >>> model = SetFitModel.from_pretrained(...) + >>> model(["What a boring display", "Exhilarating through and through", "I'm wowed!"]) + tensor([0, 1, 1], dtype=torch.int32) + + Returns: + `torch.Tensor`: A vector with equal length to the inputs, denoting to which class each + input is predicted to belong. + """ return self.predict(inputs) def _save_pretrained(self, save_directory: str) -> None: diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index e12da914..17b18f41 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -41,48 +41,24 @@ class Trainer: Args: model (`SetFitModel`, *optional*): The model to train. If not provided, a `model_init` must be passed. + args (`TrainingArguments`, *optional*): + The training arguments to use. train_dataset (`Dataset`): The training dataset. eval_dataset (`Dataset`, *optional*): The evaluation dataset. model_init (`Callable[[], SetFitModel]`, *optional*): - A function that instantiates the model to be used. If provided, each call to [`~SetFitTrainer.train`] will start - from a new instance of the model as given by this function when a `trial` is passed. + A function that instantiates the model to be used. If provided, each call to + [`~SetFitTrainer.train`] will start from a new instance of the model as given by this + function when a `trial` is passed. metric (`str` or `Callable`, *optional*, defaults to `"accuracy"`): - The metric to use for evaluation. If a string is provided, we treat it as the metric name and load it with default settings. + The metric to use for evaluation. 
If a string is provided, we treat it as the metric + name and load it with default settings. If a callable is provided, it must take two arguments (`y_pred`, `y_test`). - loss_class (`nn.Module`, *optional*, defaults to `CosineSimilarityLoss`): - The loss function to use for contrastive training. - num_iterations (`int`, *optional*, defaults to `20`): - The number of iterations to generate sentence pairs for. - This argument is ignored if triplet loss is used. - It is only used in conjunction with `CosineSimilarityLoss`. - num_epochs (`int`, *optional*, defaults to `1`): - The number of epochs to train the Sentence Transformer body for. - learning_rate (`float`, *optional*, defaults to `2e-5`): - The learning rate to use for contrastive training. - batch_size (`int`, *optional*, defaults to `16`): - The batch size to use for contrastive training. - seed (`int`, *optional*, defaults to 42): - Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the - [`~SetTrainer.model_init`] function to instantiate the model if it has some randomly initialized parameters. column_mapping (`Dict[str, str]`, *optional*): - A mapping from the column names in the dataset to the column names expected by the model. The expected format is a dictionary with the following format: {"text_column_name": "text", "label_column_name: "label"}. - use_amp (`bool`, *optional*, defaults to `False`): - Use Automatic Mixed Precision (AMP). Only for Pytorch >= 1.6.0 - warmup_proportion (`float`, *optional*, defaults to `0.1`): - Proportion of the warmup in the total training steps. - Must be greater than or equal to 0.0 and less than or equal to 1.0. - distance_metric (`Callable`, defaults to `BatchHardTripletLossDistanceFunction.cosine_distance`): - Function that returns a distance between two embeddings. - It is set for the triplet loss and - is ignored for `CosineSimilarityLoss` and `SupConLoss`. 
- margin (`float`, defaults to `0.25`): Margin for the triplet loss. - Negative samples should be at least margin further apart from the anchor than the positive. - This is ignored for `CosineSimilarityLoss`, `BatchHardSoftMarginTripletLoss` and `SupConLoss`. - samples_per_label (`int`, defaults to `2`): Number of consecutive, random and unique samples drawn per label. - This is only relevant for triplet loss and ignored for `CosineSimilarityLoss`. - Batch size should be a multiple of samples_per_label. + A mapping from the column names in the dataset to the column names expected by the model. + The expected format is a dictionary with the following format: + `{"text_column_name": "text", "label_column_name: "label"}`. """ def __init__( @@ -106,10 +82,10 @@ def __init__( if model_init is not None: model = self.call_model_init() else: - raise RuntimeError("`SetFitTrainer` requires either a `model` or `model_init` argument") + raise RuntimeError("`SetFitTrainer` requires either a `model` or `model_init` argument.") else: if model_init is not None: - raise RuntimeError("`SetFitTrainer` requires either a `model` or `model_init` argument, but not both") + raise RuntimeError("`SetFitTrainer` requires either a `model` or `model_init` argument, but not both.") self.model = model self.hp_search_backend = None @@ -204,6 +180,14 @@ def call_model_init(self, params: Optional[Dict[str, Any]] = None): return model def freeze(self, component: Optional[Literal["body", "head"]] = None) -> None: + """Freeze the model body and/or the head, preventing further training on that component until unfrozen. + + This method is deprecated, use `SetFitModel.freeze` instead. + + Args: + component (`Literal["body", "head"]`, *optional*): Either "body" or "head" to freeze that component. + If no component is provided, freeze both. Defaults to None. + """ warnings.warn( f"`{self.__class__.__name__}.freeze` is deprecated and will be removed in v2.0.0 of SetFit. 
" "Please use `SetFitModel.freeze` directly instead.", @@ -215,6 +199,15 @@ def freeze(self, component: Optional[Literal["body", "head"]] = None) -> None: def unfreeze( self, component: Optional[Literal["body", "head"]] = None, keep_body_frozen: Optional[bool] = None ) -> None: + """Unfreeze the model body and/or the head, allowing further training on that component. + + This method is deprecated, use `SetFitModel.unfreeze` instead. + + Args: + component (`Literal["body", "head"]`, *optional*): Either "body" or "head" to unfreeze that component. + If no component is provided, unfreeze both. Defaults to None. + keep_body_frozen (`bool`, *optional*): Deprecated argument, use `component` instead. + """ warnings.warn( f"`{self.__class__.__name__}.unfreeze` is deprecated and will be removed in v2.0.0 of SetFit. " "Please use `SetFitModel.unfreeze` directly instead.", @@ -229,6 +222,15 @@ def train( trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, **kwargs, ): + """ + Main training entry point. + + Args: + args (`TrainingArguments`, *optional*): + Temporarily change the training arguments for this training call. + trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): + The trial run or the hyperparameter dictionary for hyperparameter search. + """ if len(kwargs): warnings.warn( f"`{self.__class__.__name__}.train` does not accept keyword arguments anymore. " @@ -262,7 +264,18 @@ def train( self.train_embeddings(x_train, y_train, args) self.train_classifier(x_train, y_train, args) - def train_embeddings(self, x_train: List[str], y_train: List[int], args: Optional[TrainingArguments] = None): + def train_embeddings( + self, x_train: List[str], y_train: Union[List[int], List[List[int]]], args: Optional[TrainingArguments] = None + ): + """ + Method to perform the embedding phase: finetuning the `SentenceTransformer` body. + + Args: + x_train (`List[str]`): A list of training sentences. 
+ y_train (`Union[List[int], List[List[int]]]`): A list of labels corresponding to the training sentences. + args (`TrainingArguments`, *optional*): + Temporarily change the training arguments for this training call. + """ args = args or self.args or TrainingArguments() # sentence-transformers adaptation @@ -324,7 +337,18 @@ def train_embeddings(self, x_train: List[str], y_train: List[int], args: Optiona use_amp=args.use_amp, ) - def train_classifier(self, x_train: List[str], y_train: List[int], args: Optional[TrainingArguments] = None): + def train_classifier( + self, x_train: List[str], y_train: Union[List[int], List[List[int]]], args: Optional[TrainingArguments] = None + ): + """ + Method to perform the classifier phase: fitting a classifier head. + + Args: + x_train (`List[str]`): A list of training sentences. + y_train (`Union[List[int], List[List[int]]]`): A list of labels corresponding to the training sentences. + args (`TrainingArguments`, *optional*): + Temporarily change the training arguments for this training call. + """ args = args or self.args or TrainingArguments() self.model.fit( @@ -479,6 +503,11 @@ def push_to_hub( class SetFitTrainer(Trainer): + """ + `SetFitTrainer` has been deprecated and will be removed in v2.0.0 of SetFit. + Please use `Trainer` instead. + """ + def __init__( self, model: Optional["SetFitModel"] = None, diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index a299741a..588578c9 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -30,38 +30,26 @@ class DistillationTrainer(Trainer): Args: teacher_model (`SetFitModel`): The teacher model to mimic. + student_model (`SetFitModel`, *optional*): + The model to train. If not provided, a `model_init` must be passed. + args (`TrainingArguments`, *optional*): + The training arguments to use. train_dataset (`Dataset`): The training dataset. - student_model (`SetFitModel`): - The student model to train. 
If not provided, a `model_init` must be passed. eval_dataset (`Dataset`, *optional*): The evaluation dataset. model_init (`Callable[[], SetFitModel]`, *optional*): - A function that instantiates the model to be used. If provided, each call to [`~DistillationSetFitTrainer.train`] will start - from a new instance of the model as given by this function when a `trial` is passed. + A function that instantiates the model to be used. If provided, each call to + [`~SetFitTrainer.train`] will start from a new instance of the model as given by this + function when a `trial` is passed. metric (`str` or `Callable`, *optional*, defaults to `"accuracy"`): - The metric to use for evaluation. If a string is provided, we treat it as the metric name and load it with default settings. + The metric to use for evaluation. If a string is provided, we treat it as the metric + name and load it with default settings. If a callable is provided, it must take two arguments (`y_pred`, `y_test`). - loss_class (`nn.Module`, *optional*, defaults to `CosineSimilarityLoss`): - The loss function to use for contrastive training. - num_iterations (`int`, *optional*, defaults to `20`): - The number of iterations to generate sentence pairs for. - num_epochs (`int`, *optional*, defaults to `1`): - The number of epochs to train the Sentence Transformer body for. - learning_rate (`float`, *optional*, defaults to `2e-5`): - The learning rate to use for contrastive training. - batch_size (`int`, *optional*, defaults to `16`): - The batch size to use for contrastive training. - seed (`int`, *optional*, defaults to 42): - Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the - [`~SetTrainer.model_init`] function to instantiate the model if it has some randomly initialized parameters. column_mapping (`Dict[str, str]`, *optional*): - A mapping from the column names in the dataset to the column names expected by the model. 
The expected format is a dictionary with the following format: {"text_column_name": "text", "label_column_name: "label"}.
-        use_amp (`bool`, *optional*, defaults to `False`):
-            Use Automatic Mixed Precision (AMP). Only for Pytorch >= 1.6.0
-        warmup_proportion (`float`, *optional*, defaults to `0.1`):
-            Proportion of the warmup in the total training steps.
-            Must be greater than or equal to 0.0 and less than or equal to 1.0.
+            A mapping from the column names in the dataset to the column names expected by the model.
+            The expected format is a dictionary with the following format:
+            `{"text_column_name": "text", "label_column_name": "label"}`.
     """
 
     def __init__(
@@ -91,7 +79,7 @@ def __init__(
     def train_embeddings(
         self,
         x_train: List[str],
-        y_train: List[int],
+        y_train: Union[List[int], List[List[int]]],
         args: Optional[TrainingArguments] = None,
     ):
         args = args or self.args or TrainingArguments()
@@ -161,6 +149,11 @@ def train_embeddings(
 class DistillationSetFitTrainer(DistillationTrainer):
+    """
+    `DistillationSetFitTrainer` has been deprecated and will be removed in v2.0.0 of SetFit.
+    Please use `DistillationTrainer` instead.
+    """
+
     def __init__(
         self,
         teacher_model: "SetFitModel",
diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py
index 8e357f56..3d9a371f 100644
--- a/src/setfit/training_args.py
+++ b/src/setfit/training_args.py
@@ -10,6 +10,65 @@
 @dataclass
 class TrainingArguments:
+    """
+    TrainingArguments is the subset of the arguments which relate to the training loop itself.
+
+    Parameters:
+        batch_size (`Union[int, Tuple[int, int]]`, defaults to `(16, 2)`):
+            Set the batch sizes for the embedding and classifier training phases respectively,
+            or set both if an integer is provided.
+            Note that the batch size for the classifier is only used with a differentiable PyTorch head.
+        num_epochs (`Union[int, Tuple[int, int]]`, defaults to `(1, 16)`):
+            Set the number of epochs for the embedding and classifier training phases respectively,
+            or set both if an integer is provided.
+            Note that the number of epochs for the classifier is only used with a differentiable PyTorch head.
+        num_iterations (`int`, defaults to `20`):
+            The number of iterations to generate sentence pairs for.
+            This argument is ignored if triplet loss is used.
+            It is only used in conjunction with `CosineSimilarityLoss`.
+        body_learning_rate (`Union[float, Tuple[float, float]]`, defaults to `(2e-5, 1e-5)`):
+            Set the learning rate for the `SentenceTransformer` body for the embedding and classifier
+            training phases respectively, or set both if a float is provided.
+            Note that the body learning rate for the classifier is only used with a differentiable PyTorch
+            head *and* if `end_to_end=True`.
+        head_learning_rate (`float`, defaults to `1e-2`):
+            Set the learning rate for the head for the classifier training phase.
+        loss (`nn.Module`, defaults to `CosineSimilarityLoss`):
+            The loss function to use for contrastive training of the embedding training phase.
+        distance_metric (`Callable`, defaults to `BatchHardTripletLossDistanceFunction.cosine_distance`):
+            Function that returns a distance between two embeddings.
+            It is set for the triplet loss and ignored for `CosineSimilarityLoss` and `SupConLoss`.
+        margin (`float`, defaults to `0.25`):
+            Margin for the triplet loss.
+            Negative samples should be at least margin further apart from the anchor than the positive.
+            It is ignored for `CosineSimilarityLoss`, `BatchHardSoftMarginTripletLoss` and `SupConLoss`.
+        end_to_end (`bool`, defaults to `False`):
+            If True, train the entire model end-to-end during the classifier training phase.
+            Otherwise, freeze the `SentenceTransformer` body and only train the head.
+            Only used with a differentiable PyTorch head.
+        use_amp (`bool`, defaults to `False`):
+            Whether to use Automatic Mixed Precision (AMP) during the embedding training phase.
+            Only for Pytorch >= 1.6.0
+        warmup_proportion (`float`, defaults to `0.1`):
+            Proportion of the warmup in the total training steps.
+            Must be greater than or equal to 0.0 and less than or equal to 1.0.
+        l2_weight (`float`, *optional*):
+            Optional l2 weight for both the model body and head, passed to the `AdamW` optimizer in the
+            classifier training phase if a differentiable PyTorch head is used.
+        max_length (`int`, *optional*):
+            The maximum token length a tokenizer can generate. If not provided, the maximum length for
+            the `SentenceTransformer` body is used.
+        samples_per_label (`int`, defaults to `2`): Number of consecutive, random and unique samples drawn per label.
+            This is only relevant for triplet loss and ignored for `CosineSimilarityLoss`.
+            Batch size should be a multiple of samples_per_label.
+        show_progress_bar (`bool`, defaults to `True`):
+            Whether to display a progress bar for the training epochs and iterations.
+        seed (`int`, defaults to `42`):
+            Random seed that will be set at the beginning of training. To ensure reproducibility across
+            runs, use the [`~SetFitTrainer.model_init`] function to instantiate the model if it has some
+            randomly initialized parameters.
+ """ + # batch_size is only used to conveniently set `embedding_batch_size` and `classifier_batch_size` # which are used in practice batch_size: Union[int, Tuple[int, int]] = field(default=(16, 2), repr=False) @@ -31,20 +90,22 @@ class TrainingArguments: body_classifier_learning_rate: float = None head_learning_rate: float = 1e-2 - seed: int = 42 - use_amp: bool = False - warmup_proportion: float = 0.1 + # Loss-related arguments + loss: Callable = losses.CosineSimilarityLoss distance_metric: Callable = losses.BatchHardTripletLossDistanceFunction.cosine_distance margin: float = 0.25 - samples_per_label: int = 2 - show_progress_bar: bool = True + end_to_end: bool = field(default=False) + + use_amp: bool = False + warmup_proportion: float = 0.1 l2_weight: Optional[float] = None max_length: Optional[int] = None + samples_per_label: int = 2 - end_to_end: bool = False - - loss: Callable = losses.CosineSimilarityLoss + # Arguments that do not affect performance + show_progress_bar: bool = True + seed: int = 42 def __post_init__(self): # Set `self.embedding_batch_size` and `self.classifier_batch_size` using values from `self.batch_size` From 68e9094c730b71eaa7725b0e9da466f089caaf3e Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 8 Feb 2023 11:26:23 +0100 Subject: [PATCH 25/77] Use standard dataclass initialization for SetFitModel --- src/setfit/modeling.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index 43c26f9e..79256557 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -242,22 +242,11 @@ def __repr__(self): class SetFitModel(PyTorchModelHubMixin): """A SetFit model with integration to the Hugging Face Hub.""" - def __init__( - self, - model_body: Optional[SentenceTransformer] = None, - model_head: Optional[Union[SetFitHead, LogisticRegression]] = None, - multi_target_strategy: Optional[str] = None, - l2_weight: float = 1e-2, - normalize_embeddings: bool 
= False,
-    ) -> None:
-        super(SetFitModel, self).__init__()
-        self.model_body = model_body
-        self.model_head = model_head
-
-        self.multi_target_strategy = multi_target_strategy
-        self.l2_weight = l2_weight
-
-        self.normalize_embeddings = normalize_embeddings
+    model_body: Optional[SentenceTransformer] = None
+    model_head: Optional[Union[SetFitHead, LogisticRegression]] = None
+    multi_target_strategy: Optional[str] = None
+    l2_weight: float = 1e-2
+    normalize_embeddings: bool = False
 
     @property
     def has_differentiable_head(self) -> bool:

From 19a6fc8c3a2728b3a034d36d66110d2961776107 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 15 Feb 2023 12:05:55 +0100
Subject: [PATCH 26/77] Merge branch 'main' of https://github.com/huggingface/setfit
 into refactor_v2

---
 scripts/setfit/run_zeroshot.py   |  2 +-
 setup.py                         | 13 ++------
 src/setfit/__init__.py           |  2 +-
 src/setfit/data.py               | 24 +++++++++-----
 src/setfit/modeling.py           | 10 +++---
 src/setfit/trainer.py            | 27 ++++++++++++---
 tests/test_deprecated_trainer.py | 57 ++++++++++++++++++++++++++++----
 tests/test_trainer.py            | 53 +++++++++++++++++++++++++----
 8 files changed, 145 insertions(+), 43 deletions(-)

diff --git a/scripts/setfit/run_zeroshot.py b/scripts/setfit/run_zeroshot.py
index 231fe33b..cf65f0f6 100644
--- a/scripts/setfit/run_zeroshot.py
+++ b/scripts/setfit/run_zeroshot.py
@@ -92,7 +92,7 @@ def main():
 
     metric = DEV_DATASET_TO_METRIC.get(args.eval_dataset, TEST_DATASET_TO_METRIC.get(args.eval_dataset, "accuracy"))
 
-    if args.reference_dataset is None:
+    if args.reference_dataset is None and args.candidate_labels is None:
         args.reference_dataset = args.eval_dataset
 
     train_data = get_templated_dataset(
diff --git a/setup.py b/setup.py
index de78d342..ab703e04 100644
--- a/setup.py
+++ b/setup.py
@@ -10,26 +10,17 @@
 MAINTAINER_EMAIL = "lewis@huggingface.co"
 
 INTEGRATIONS_REQUIRE = ["optuna"]
-
 REQUIRED_PKGS = ["datasets>=2.3.0", "sentence-transformers>=2.2.1", "evaluate>=0.3.0"]
-
 QUALITY_REQUIRE = ["black", "flake8", 
"isort", "tabulate"] - ONNX_REQUIRE = ["onnxruntime", "onnx", "skl2onnx"] - OPENVINO_REQUIRE = ["hummingbird-ml", "openvino>=2022.3"] - TESTS_REQUIRE = ["pytest", "pytest-cov"] + ONNX_REQUIRE + OPENVINO_REQUIRE - -COMPAT_TESTS_REQUIRE = [requirement.replace(">=", "==") for requirement in REQUIRED_PKGS] + TESTS_REQUIRE - EXTRAS_REQUIRE = { "optuna": INTEGRATIONS_REQUIRE, "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, "onnx": ONNX_REQUIRE, "openvino": ONNX_REQUIRE + OPENVINO_REQUIRE, - "compat_tests": COMPAT_TESTS_REQUIRE, } @@ -38,11 +29,11 @@ def combine_requirements(base_keys): EXTRAS_REQUIRE["dev"] = combine_requirements([k for k in EXTRAS_REQUIRE]) - +EXTRAS_REQUIRE["compat_tests"] = [requirement.replace(">=", "==") for requirement in REQUIRED_PKGS] + TESTS_REQUIRE setup( name="setfit", - version="0.6.0.dev0", + version="0.7.0.dev0", description="Efficient few-shot learning with Sentence Transformers", long_description=README_TEXT, long_description_content_type="text/markdown", diff --git a/src/setfit/__init__.py b/src/setfit/__init__.py index b4d87dcd..0c79e2b0 100644 --- a/src/setfit/__init__.py +++ b/src/setfit/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.0.dev0" +__version__ = "0.7.0.dev0" import warnings diff --git a/src/setfit/data.py b/src/setfit/data.py index 966d249d..ce809bbd 100644 --- a/src/setfit/data.py +++ b/src/setfit/data.py @@ -3,9 +3,14 @@ import pandas as pd import torch -from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset +from datasets import Dataset, DatasetDict, load_dataset from torch.utils.data import Dataset as TorchDataset +from . 
import logging


logging.set_verbosity_info()
logger = logging.get_logger(__name__)
 
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizerBase
@@ -160,14 +165,15 @@ def create_samples(df: pd.DataFrame, sample_size: int, seed: int) -> pd.DataFram
 
 def sample_dataset(dataset: Dataset, label_column: str = "label", num_samples: int = 8, seed: int = 42) -> Dataset:
     """Samples a Dataset to create an equal number of samples per class (when possible)."""
     shuffled_dataset = dataset.shuffle(seed=seed)
-    num_labels = len(dataset.unique(label_column))
-    samples = []
-    for label in range(num_labels):
-        data = shuffled_dataset.filter(lambda example: int(example[label_column]) == label)
-        num_label_samples = min(len(data), num_samples)
-        samples.append(data.select([i for i in range(num_label_samples)]))
-
-    all_samples = concatenate_datasets(samples)
+
+    df = shuffled_dataset.to_pandas()
+    df = df.groupby(label_column)
+
+    # sample num_samples, or at least as much as possible
+    df = df.apply(lambda x: x.sample(min(num_samples, len(x)), random_state=seed))
+    df = df.reset_index(drop=True)
+
+    all_samples = Dataset.from_pandas(df)
     return all_samples.shuffle(seed=seed)
 
 
diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py
index 79256557..0d744384 100644
--- a/src/setfit/modeling.py
+++ b/src/setfit/modeling.py
@@ -662,19 +662,21 @@ def sentence_pairs_generation(sentences, labels, pairs):
     # labels to indicate if a pair is positive or negative
 
     num_classes = np.unique(labels)
-    idx = [np.where(labels == i)[0] for i in num_classes]
+    label_to_idx = {x: i for i, x in enumerate(num_classes)}
+    positive_idxs = [np.where(labels == i)[0] for i in num_classes]
+    negative_idxs = [np.where(labels != i)[0] for i in num_classes]
 
     for first_idx in range(len(sentences)):
         current_sentence = sentences[first_idx]
         label = labels[first_idx]
-        second_idx = np.random.choice(idx[np.where(num_classes == label)[0][0]])
+        second_idx = np.random.choice(positive_idxs[label_to_idx[label]])
         positive_sentence = 
sentences[second_idx] # Prepare a positive pair and update the sentences and labels # lists, respectively pairs.append(InputExample(texts=[current_sentence, positive_sentence], label=1.0)) - negative_idx = np.where(labels != label)[0] - negative_sentence = sentences[np.random.choice(negative_idx)] + third_idx = np.random.choice(negative_idxs[label_to_idx[label]]) + negative_sentence = sentences[third_idx] # Prepare a negative pair of sentences and update our lists pairs.append(InputExample(texts=[current_sentence, negative_sentence], label=0.0)) # Return a 2-tuple of our sentence pairs and labels diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 17b18f41..58735588 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -11,10 +11,12 @@ import evaluate import numpy as np +from datasets import DatasetDict from sentence_transformers import InputExample, losses from sentence_transformers.datasets import SentenceLabelDataset from sentence_transformers.losses.BatchHardTripletLoss import BatchHardTripletLossDistanceFunction from torch.utils.data import DataLoader +from tqdm.auto import trange from transformers.trainer_utils import HPSearchBackend, default_compute_objective, number_of_arguments, set_seed from . import logging @@ -97,9 +99,23 @@ def _validate_column_mapping(self, dataset: "Dataset") -> None: required_columns = {"text", "label"} column_names = set(dataset.column_names) if self.column_mapping is None and not required_columns.issubset(column_names): - raise ValueError( - f"A column mapping must be provided when the dataset does not contain the following columns: {required_columns}" - ) + # Issue #226: load_dataset will automatically assign points to "train" if no split is specified + if column_names == {"train"} and isinstance(dataset, DatasetDict): + raise ValueError( + "SetFit expected a Dataset, but it got a DatasetDict with the split ['train']. " + "Did you mean to select the training split with dataset['train']?" 
+ ) + elif isinstance(dataset, DatasetDict): + raise ValueError( + f"SetFit expected a Dataset, but it got a DatasetDict with the splits {sorted(column_names)}. " + "Did you mean to select one of these splits from the dataset?" + ) + else: + raise ValueError( + f"SetFit expected the dataset to have the columns {sorted(required_columns)}, " + f"but only the columns {sorted(column_names)} were found. " + "Either make sure these columns are present, or specify which columns to use with column_mapping in SetFitTrainer." + ) if self.column_mapping is not None: missing_columns = required_columns.difference(self.column_mapping.values()) if missing_columns: @@ -108,7 +124,8 @@ def _validate_column_mapping(self, dataset: "Dataset") -> None: ) if not set(self.column_mapping.keys()).issubset(column_names): raise ValueError( - f"The following columns are missing from the dataset: {set(self.column_mapping.keys()).difference(column_names)}. Please provide a mapping for all required columns." + f"The column mapping expected the columns {sorted(self.column_mapping.keys())} in the dataset, " + f"but the dataset had the columns {sorted(column_names)}." 
) def _apply_column_mapping(self, dataset: "Dataset", column_mapping: Dict[str, str]) -> "Dataset": @@ -308,7 +325,7 @@ def train_embeddings( else: train_examples = [] - for _ in range(args.num_iterations): + for _ in trange(args.num_iterations, desc="Generating Training Pairs", disable=not args.show_progress_bar): if self.model.multi_target_strategy is not None: train_examples = sentence_pairs_generation_multilabel( np.array(x_train), np.array(y_train), train_examples diff --git a/tests/test_deprecated_trainer.py b/tests/test_deprecated_trainer.py index 2a19163f..3bbcf587 100644 --- a/tests/test_deprecated_trainer.py +++ b/tests/test_deprecated_trainer.py @@ -1,8 +1,11 @@ +import pathlib +import re +import tempfile from unittest import TestCase import evaluate import pytest -from datasets import Dataset +from datasets import Dataset, load_dataset from sentence_transformers import losses from transformers.testing_utils import require_optuna from transformers.utils.hp_naming import TrialShortNamer @@ -75,23 +78,65 @@ def test_trainer_raises_error_with_missing_label(self): trainer.train() def test_trainer_raises_error_with_missing_text(self): + """If the required columns are missing from the dataset, the library should throw an error and list the columns found.""" dataset = Dataset.from_dict({"label": [0, 1, 2], "extra_column": ["d", "e", "f"]}) trainer = SetFitTrainer( model=self.model, train_dataset=dataset, eval_dataset=dataset, num_iterations=self.num_iterations ) - with pytest.raises(ValueError): - trainer.train() + expected_message = re.escape( + "SetFit expected the dataset to have the columns ['label', 'text'], " + "but only the columns ['extra_column', 'label'] were found. " + "Either make sure these columns are present, or specify which columns to use with column_mapping in SetFitTrainer." 
+ ) + with pytest.raises(ValueError, match=expected_message): + trainer._validate_column_mapping(trainer.train_dataset) - def test_column_mapping_with_missing_text(self): + def test_column_mapping_raises_error_when_mapped_columns_missing(self): + """If the columns specified in the column mapping are missing from the dataset, the library should throw an error and list the columns found.""" dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) trainer = SetFitTrainer( model=self.model, train_dataset=dataset, eval_dataset=dataset, num_iterations=self.num_iterations, - column_mapping={"label_new": "label"}, + column_mapping={"text_new": "text", "label_new": "label"}, ) - with pytest.raises(ValueError): + expected_message = re.escape( + "The column mapping expected the columns ['label_new', 'text_new'] in the dataset, " + "but the dataset had the columns ['extra_column', 'text'].", + ) + with pytest.raises(ValueError, match=expected_message): + trainer._validate_column_mapping(trainer.train_dataset) + + def test_trainer_raises_error_when_dataset_not_split(self): + """Verify that an error is raised if we pass an unsplit dataset to the trainer.""" + dataset = Dataset.from_dict({"text": ["a", "b", "c", "d"], "label": [0, 0, 1, 1]}).train_test_split( + test_size=0.5 + ) + trainer = SetFitTrainer( + model=self.model, train_dataset=dataset, eval_dataset=dataset, num_iterations=self.num_iterations + ) + expected_message = re.escape( + "SetFit expected a Dataset, but it got a DatasetDict with the splits ['test', 'train']. 
" + "Did you mean to select one of these splits from the dataset?", + ) + with pytest.raises(ValueError, match=expected_message): + trainer._validate_column_mapping(trainer.train_dataset) + + def test_trainer_raises_error_when_dataset_is_dataset_dict_with_train(self): + """Verify that a useful error is raised if we pass an unsplit dataset with only a `train` split to the trainer.""" + with tempfile.TemporaryDirectory() as tmpdirname: + path = pathlib.Path(tmpdirname) / "test_dataset_dict_with_train.csv" + path.write_text("label,text\n1,good\n0,terrible\n") + dataset = load_dataset("csv", data_files=str(path)) + trainer = SetFitTrainer( + model=self.model, train_dataset=dataset, eval_dataset=dataset, num_iterations=self.num_iterations + ) + expected_message = re.escape( + "SetFit expected a Dataset, but it got a DatasetDict with the split ['train']. " + "Did you mean to select the training split with dataset['train']?", + ) + with pytest.raises(ValueError, match=expected_message): trainer._validate_column_mapping(trainer.train_dataset) def test_column_mapping_multilabel(self): diff --git a/tests/test_trainer.py b/tests/test_trainer.py index ed66286d..a964aaf2 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -1,8 +1,11 @@ +import pathlib +import re +import tempfile from unittest import TestCase import evaluate import pytest -from datasets import Dataset +from datasets import Dataset, load_dataset from sentence_transformers import losses from transformers.testing_utils import require_optuna from transformers.utils.hp_naming import TrialShortNamer @@ -72,21 +75,59 @@ def test_trainer_raises_error_with_missing_label(self): trainer.train() def test_trainer_raises_error_with_missing_text(self): + """If the required columns are missing from the dataset, the library should throw an error and list the columns found.""" dataset = Dataset.from_dict({"label": [0, 1, 2], "extra_column": ["d", "e", "f"]}) trainer = Trainer(model=self.model, args=self.args, 
train_dataset=dataset, eval_dataset=dataset) - with pytest.raises(ValueError): - trainer.train() + expected_message = re.escape( + "SetFit expected the dataset to have the columns ['label', 'text'], " + "but only the columns ['extra_column', 'label'] were found. " + "Either make sure these columns are present, or specify which columns to use with column_mapping in SetFitTrainer." + ) + with pytest.raises(ValueError, match=expected_message): + trainer._validate_column_mapping(trainer.train_dataset) - def test_column_mapping_with_missing_text(self): + def test_column_mapping_raises_error_when_mapped_columns_missing(self): + """If the columns specified in the column mapping are missing from the dataset, the library should throw an error and list the columns found.""" dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) trainer = Trainer( model=self.model, args=self.args, train_dataset=dataset, eval_dataset=dataset, - column_mapping={"label_new": "label"}, + column_mapping={"text_new": "text", "label_new": "label"}, ) - with pytest.raises(ValueError): + expected_message = re.escape( + "The column mapping expected the columns ['label_new', 'text_new'] in the dataset, " + "but the dataset had the columns ['extra_column', 'text'].", + ) + with pytest.raises(ValueError, match=expected_message): + trainer._validate_column_mapping(trainer.train_dataset) + + def test_trainer_raises_error_when_dataset_not_split(self): + """Verify that an error is raised if we pass an unsplit dataset to the trainer.""" + dataset = Dataset.from_dict({"text": ["a", "b", "c", "d"], "label": [0, 0, 1, 1]}).train_test_split( + test_size=0.5 + ) + trainer = Trainer(model=self.model, args=self.args, train_dataset=dataset, eval_dataset=dataset) + expected_message = re.escape( + "SetFit expected a Dataset, but it got a DatasetDict with the splits ['test', 'train']. 
" + "Did you mean to select one of these splits from the dataset?", + ) + with pytest.raises(ValueError, match=expected_message): + trainer._validate_column_mapping(trainer.train_dataset) + + def test_trainer_raises_error_when_dataset_is_dataset_dict_with_train(self): + """Verify that a useful error is raised if we pass an unsplit dataset with only a `train` split to the trainer.""" + with tempfile.TemporaryDirectory() as tmpdirname: + path = pathlib.Path(tmpdirname) / "test_dataset_dict_with_train.csv" + path.write_text("label,text\n1,good\n0,terrible\n") + dataset = load_dataset("csv", data_files=str(path)) + trainer = Trainer(model=self.model, args=self.args, train_dataset=dataset, eval_dataset=dataset) + expected_message = re.escape( + "SetFit expected a Dataset, but it got a DatasetDict with the split ['train']. " + "Did you mean to select the training split with dataset['train']?", + ) + with pytest.raises(ValueError, match=expected_message): trainer._validate_column_mapping(trainer.train_dataset) def test_column_mapping_multilabel(self): From ca87c4266148267520b3122710ec2b49d04adc35 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 16 Feb 2023 13:07:35 +0100 Subject: [PATCH 27/77] Remove duplicate space in DeprecationWarning --- src/setfit/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 58735588..31cdfcce 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -547,7 +547,7 @@ def __init__( ): warnings.warn( "`SetFitTrainer` has been deprecated and will be removed in v2.0.0 of SetFit. 
" - " Please use `Trainer` instead.", + "Please use `Trainer` instead.", DeprecationWarning, stacklevel=2, ) From cc5282fe30117d3e9c8d10832e711a25131f1b44 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 3 Mar 2023 16:28:58 +0100 Subject: [PATCH 28/77] No longer require labeled data for DistillationTrainer This bug was pointed out by #320 --- src/setfit/trainer.py | 9 +- src/setfit/trainer_distillation.py | 139 +++++++++++------- tests/test_deprecated_trainer_distillation.py | 29 +++- tests/test_trainer_distillation.py | 29 +++- 4 files changed, 137 insertions(+), 69 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 31cdfcce..0fff050c 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -63,6 +63,8 @@ class Trainer: `{"text_column_name": "text", "label_column_name: "label"}`. """ + _REQUIRED_COLUMNS = {"text", "label"} + def __init__( self, model: Optional["SetFitModel"] = None, @@ -96,9 +98,8 @@ def _validate_column_mapping(self, dataset: "Dataset") -> None: """ Validates the provided column mapping against the dataset. """ - required_columns = {"text", "label"} column_names = set(dataset.column_names) - if self.column_mapping is None and not required_columns.issubset(column_names): + if self.column_mapping is None and not self._REQUIRED_COLUMNS.issubset(column_names): # Issue #226: load_dataset will automatically assign points to "train" if no split is specified if column_names == {"train"} and isinstance(dataset, DatasetDict): raise ValueError( @@ -112,12 +113,12 @@ def _validate_column_mapping(self, dataset: "Dataset") -> None: ) else: raise ValueError( - f"SetFit expected the dataset to have the columns {sorted(required_columns)}, " + f"SetFit expected the dataset to have the columns {sorted(self._REQUIRED_COLUMNS)}, " f"but only the columns {sorted(column_names)} were found. " "Either make sure these columns are present, or specify which columns to use with column_mapping in SetFitTrainer." 
) if self.column_mapping is not None: - missing_columns = required_columns.difference(self.column_mapping.values()) + missing_columns = self._REQUIRED_COLUMNS.difference(self.column_mapping.values()) if missing_columns: raise ValueError( f"The following columns are missing from the column mapping: {missing_columns}. Please provide a mapping for all required columns." diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index 588578c9..41bfa26d 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -1,21 +1,21 @@ import math import warnings -from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union import numpy as np import torch -from sentence_transformers import InputExample, losses, util -from sentence_transformers.datasets import SentenceLabelDataset +from sentence_transformers import losses, util from torch.utils.data import DataLoader +from transformers.trainer_utils import set_seed from . import logging -from .losses import SupConLoss from .modeling import sentence_pairs_generation_cos_sim from .trainer import Trainer from .training_args import TrainingArguments if TYPE_CHECKING: + import optuna from datasets import Dataset from .modeling import SetFitModel @@ -52,6 +52,8 @@ class DistillationTrainer(Trainer): `{"text_column_name": "text", "label_column_name: "label"}`. """ + _REQUIRED_COLUMNS = {"text"} + def __init__( self, teacher_model: "SetFitModel", @@ -76,59 +78,81 @@ def __init__( self.teacher_model = teacher_model self.student_model = self.model + def train( + self, + args: Optional[TrainingArguments] = None, + trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, + **kwargs, + ) -> None: + """ + Main training entry point. + + Args: + args (`TrainingArguments`, *optional*): + Temporarily change the training arguments for this training call. 
+ trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): + The trial run or the hyperparameter dictionary for hyperparameter search. + """ + if len(kwargs): + warnings.warn( + f"`{self.__class__.__name__}.train` does not accept keyword arguments anymore. " + f"Please provide training arguments via a `TrainingArguments` instance to the `{self.__class__.__name__}` " + f"initialisation or the `{self.__class__.__name__}.train` method.", + DeprecationWarning, + stacklevel=2, + ) + + args = args or self.args or TrainingArguments() + + set_seed(args.seed) # Seed must be set before instantiating the model when using model_init. + + if trial: # Trial and model initialization + self._hp_search_setup(trial) # sets trainer parameters and initializes model + + if self.train_dataset is None: + raise ValueError( + f"Training requires a `train_dataset` given to the `{self.__class__.__name__}` initialization." + ) + + self._validate_column_mapping(self.train_dataset) + train_dataset = self.train_dataset + if self.column_mapping is not None: + logger.info("Applying column mapping to training dataset") + train_dataset = self._apply_column_mapping(self.train_dataset, self.column_mapping) + + x_train: List[str] = train_dataset["text"] + + self.train_embeddings(x_train, args) + self.train_classifier(x_train, args) + def train_embeddings( self, x_train: List[str], - y_train: Union[List[int], List[List[int]]], args: Optional[TrainingArguments] = None, - ): + ) -> None: + """ + Method to perform the embedding phase: finetuning the student its `SentenceTransformer` body. + + Args: + x_train (`List[str]`): A list of training sentences. + args (`TrainingArguments`, *optional*): + Temporarily change the training arguments for this training call. 
+ """ args = args or self.args or TrainingArguments() - # sentence-transformers adaptation - if args.loss in [ - losses.BatchAllTripletLoss, - losses.BatchHardTripletLoss, - losses.BatchSemiHardTripletLoss, - losses.BatchHardSoftMarginTripletLoss, - SupConLoss, - ]: - train_examples = [InputExample(texts=[text], label=label) for text, label in zip(x_train, y_train)] - train_data_sampler = SentenceLabelDataset(train_examples) - - batch_size = min(args.embedding_batch_size, len(train_data_sampler)) - train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=True) - - if args.loss is losses.BatchHardSoftMarginTripletLoss: - train_loss = args.loss( - model=self.student_model.model_body, - distance_metric=args.distance_metric, - ) - elif args.loss is SupConLoss: - train_loss = args.loss(model=self.student_model) - else: - train_loss = args.loss( - model=self.student_model.model_body, - distance_metric=args.distance_metric, - margin=args.margin, - ) - else: - train_examples = [] - - # **************** student training ********************* - # Only this snippet differs from Trainer.train_embeddings - x_train_embd_student = self.teacher_model.model_body.encode(x_train) - y_train = self.teacher_model.model_head.predict(x_train_embd_student) - - cos_sim_matrix = util.cos_sim(x_train_embd_student, x_train_embd_student) - - train_examples = [] - for _ in range(args.num_iterations): - train_examples = sentence_pairs_generation_cos_sim(np.array(x_train), train_examples, cos_sim_matrix) - # **************** student training END ***************** - - batch_size = args.embedding_batch_size - train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size) - train_loss = args.loss(self.student_model.model_body) + # **************** student training ********************* + x_train_embd_student = self.teacher_model.model_body.encode(x_train) + + cos_sim_matrix = util.cos_sim(x_train_embd_student, x_train_embd_student) + + train_examples = [] 
+ for _ in range(args.num_iterations): + train_examples = sentence_pairs_generation_cos_sim(np.array(x_train), train_examples, cos_sim_matrix) + # **************** student training END ***************** + + batch_size = args.embedding_batch_size + train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size) + train_loss = args.loss(self.student_model.model_body) total_train_steps = len(train_dataloader) * args.embedding_num_epochs logger.info("***** Running training *****") @@ -147,6 +171,19 @@ def train_embeddings( use_amp=args.use_amp, ) + def train_classifier(self, x_train: List[str], args: Optional[TrainingArguments] = None) -> None: + """ + Method to perform the classifier phase: fitting the student classifier head. + + Args: + x_train (`List[str]`): A list of training sentences. + args (`TrainingArguments`, *optional*): + Temporarily change the training arguments for this training call. + """ + x_train_embd_student = self.teacher_model.model_body.encode(x_train) + y_train = self.teacher_model.model_head.predict(x_train_embd_student) + return super().train_classifier(x_train, y_train, args) + class DistillationSetFitTrainer(DistillationTrainer): """ diff --git a/tests/test_deprecated_trainer_distillation.py b/tests/test_deprecated_trainer_distillation.py index 4257a42e..5d59c4f5 100644 --- a/tests/test_deprecated_trainer_distillation.py +++ b/tests/test_deprecated_trainer_distillation.py @@ -26,7 +26,6 @@ def test_trainer_works_with_default_columns(self): ) # Teacher Train and evaluate teacher_trainer.train() - metrics = teacher_trainer.evaluate() teacher_model = teacher_trainer.model student_trainer = DistillationSetFitTrainer( @@ -45,16 +44,32 @@ def test_trainer_works_with_default_columns(self): self.assertEqual(metrics["accuracy"], 1.0) def test_trainer_raises_error_with_missing_label(self): - dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) - trainer = DistillationSetFitTrainer( + 
labeled_dataset = Dataset.from_dict( + {"text": ["a", "b", "c"], "label": [0, 1, 2], "extra_column": ["d", "e", "f"]} + ) + # train a teacher model + teacher_trainer = SetFitTrainer( + model=self.teacher_model, + train_dataset=labeled_dataset, + eval_dataset=labeled_dataset, + metric="accuracy", + num_iterations=self.num_iterations, + ) + # Teacher Train and evaluate + teacher_trainer.train() + + unlabeled_dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) + student_trainer = DistillationSetFitTrainer( teacher_model=self.teacher_model, - train_dataset=dataset, student_model=self.student_model, - eval_dataset=dataset, + train_dataset=unlabeled_dataset, + eval_dataset=labeled_dataset, num_iterations=self.num_iterations, ) - with pytest.raises(ValueError): - trainer.train() + student_trainer.train() + metrics = student_trainer.evaluate() + print("Student results: ", metrics) + self.assertEqual(metrics["accuracy"], 1.0) def test_trainer_raises_error_with_missing_text(self): dataset = Dataset.from_dict({"label": [0, 1, 2], "extra_column": ["d", "e", "f"]}) diff --git a/tests/test_trainer_distillation.py b/tests/test_trainer_distillation.py index 2216b1ad..71b81e0a 100644 --- a/tests/test_trainer_distillation.py +++ b/tests/test_trainer_distillation.py @@ -25,7 +25,6 @@ def test_trainer_works_with_default_columns(self): ) # Teacher Train and evaluate teacher_trainer.train() - metrics = teacher_trainer.evaluate() teacher_model = teacher_trainer.model student_trainer = DistillationTrainer( @@ -43,16 +42,32 @@ def test_trainer_works_with_default_columns(self): self.assertEqual(metrics["accuracy"], 1.0) def test_trainer_raises_error_with_missing_label(self): - dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) - trainer = DistillationTrainer( + labeled_dataset = Dataset.from_dict( + {"text": ["a", "b", "c"], "label": [0, 1, 2], "extra_column": ["d", "e", "f"]} + ) + # train a teacher model + 
teacher_trainer = Trainer( + model=self.teacher_model, + train_dataset=labeled_dataset, + eval_dataset=labeled_dataset, + metric="accuracy", + args=self.args, + ) + # Teacher Train and evaluate + teacher_trainer.train() + + unlabeled_dataset = Dataset.from_dict({"text": ["a", "b", "c"], "extra_column": ["d", "e", "f"]}) + student_trainer = DistillationTrainer( teacher_model=self.teacher_model, - train_dataset=dataset, student_model=self.student_model, - eval_dataset=dataset, + train_dataset=unlabeled_dataset, + eval_dataset=labeled_dataset, args=self.args, ) - with pytest.raises(ValueError): - trainer.train() + student_trainer.train() + metrics = student_trainer.evaluate() + print("Student results: ", metrics) + self.assertEqual(metrics["accuracy"], 1.0) def test_trainer_raises_error_with_missing_text(self): dataset = Dataset.from_dict({"label": [0, 1, 2], "extra_column": ["d", "e", "f"]}) From 36cbbfec240f3fa294a78d1b6841f35c95facf70 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 6 Mar 2023 10:51:53 +0100 Subject: [PATCH 29/77] Update docs for v1.0.0 --- README.md | 118 ++++++++++++++++------------------ docs/source/en/quickstart.mdx | 112 +++++++++++++++----------------- 2 files changed, 106 insertions(+), 124 deletions(-) diff --git a/README.md b/README.md index 31b3d390..71bddb45 100644 --- a/README.md +++ b/README.md @@ -46,9 +46,7 @@ Here is an end-to-end example using a classification head from `scikit-learn`: ```python from datasets import load_dataset -from sentence_transformers.losses import CosineSimilarityLoss - -from setfit import SetFitModel, SetFitTrainer, sample_dataset +from setfit import SetFitModel, Trainer, TrainingArguments, sample_dataset # Load a dataset from the Hugging Face Hub @@ -61,17 +59,19 @@ eval_dataset = dataset["validation"] # Load a SetFit model from Hub model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2") -# Create trainer -trainer = SetFitTrainer( +args = TrainingArguments( + batch_size=16, 
+ num_iterations=20, # The number of text pairs to generate for contrastive learning + num_epochs=1 # The number of epochs to use for contrastive learning +) + +trainer = Trainer( model=model, + args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, - loss_class=CosineSimilarityLoss, metric="accuracy", - batch_size=16, - num_iterations=20, # The number of text pairs to generate for contrastive learning - num_epochs=1, # The number of epochs to use for contrastive learning - column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer + column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer ) # Train and evaluate @@ -81,7 +81,7 @@ metrics = trainer.evaluate() # Push model to the Hub trainer.push_to_hub("my-awesome-setfit-model") -# Download from Hub and run inference +# Download from Hub model = SetFitModel.from_pretrained("lewtun/my-awesome-setfit-model") # Run inference preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"]) @@ -92,9 +92,7 @@ Here is an end-to-end example using `SetFitHead`: ```python from datasets import load_dataset -from sentence_transformers.losses import CosineSimilarityLoss - -from setfit import SetFitModel, SetFitTrainer, sample_dataset +from setfit import SetFitModel, Trainer, TrainingArguments, sample_dataset # Load a dataset from the Hugging Face Hub @@ -103,6 +101,7 @@ dataset = load_dataset("sst2") # Simulate the few-shot regime by sampling 8 examples per class train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=8) eval_dataset = dataset["validation"] +num_classes = 2 # Load a SetFit model from Hub model = SetFitModel.from_pretrained( @@ -111,36 +110,26 @@ model = SetFitModel.from_pretrained( head_params={"out_features": num_classes}, ) -# Create trainer -trainer = SetFitTrainer( +args = TrainingArguments( + body_learning_rate=2e-5, + head_learning_rate=1e-2, + 
batch_size=16, + num_iterations=20, # The number of text pairs to generate for contrastive learning + num_epochs=(1, 25), # For finetuning the embeddings and training the classifier, respectively + l2_weight=0.0, + end_to_end=False, # Don't train the classifier end-to-end, i.e. only train the head +) + +trainer = Trainer( model=model, train_dataset=train_dataset, eval_dataset=eval_dataset, - loss_class=CosineSimilarityLoss, metric="accuracy", - batch_size=16, - num_iterations=20, # The number of text pairs to generate for contrastive learning - num_epochs=1, # The number of epochs to use for contrastive learning - column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer + column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer ) # Train and evaluate -trainer.freeze() # Freeze the head -trainer.train() # Train only the body - -# Unfreeze the head and freeze the body -> head-only training -trainer.unfreeze(keep_body_frozen=True) -# or -# Unfreeze the head and unfreeze the body -> end-to-end training -trainer.unfreeze(keep_body_frozen=False) - -trainer.train( - num_epochs=25, # The number of epochs to train the head or the whole model (body and head) - batch_size=16, - body_learning_rate=1e-5, # The body's learning rate - learning_rate=1e-2, # The head's learning rate - l2_weight=0.0, # Weight decay on **both** the body and head. If `None`, will use 0.01. -) +trainer.train() metrics = trainer.evaluate() # Push model to the Hub @@ -175,7 +164,7 @@ This will initialise a multilabel classification head from `sklearn` - the follo * `multi-output`: uses a `MultiOutputClassifier` head. * `classifier-chain`: uses a `ClassifierChain` head. -From here, you can instantiate a `SetFitTrainer` using the same example above, and train it as usual. +From here, you can instantiate a `Trainer` using the same example above, and train it as usual. 
#### Example using the differentiable `SetFitHead`: @@ -196,7 +185,6 @@ model = SetFitModel.from_pretrained( SetFit can also be applied to scenarios where no labels are available. To do so, create a synthetic dataset of training examples: ```python -from datasets import Dataset from setfit import get_templated_dataset candidate_labels = ["negative", "positive"] @@ -206,22 +194,22 @@ train_dataset = get_templated_dataset(candidate_labels=candidate_labels, sample_ This will create examples of the form `"This sentence is {}"`, where the `{}` is filled in with one of the candidate labels. From here you can train a SetFit model as usual: ```python -from setfit import SetFitModel, SetFitTrainer +from setfit import SetFitModel, Trainer model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2") -trainer = SetFitTrainer( +trainer = Trainer( model=model, train_dataset=train_dataset ) trainer.train() ``` -We find this approach typically outperforms the [zero-shot pipeline](https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/pipelines#transformers.ZeroShotClassificationPipeline) in 🤗 Transformers (based on MNLI with Bart), while being 5x faster to generate predictions with. +We find this approach typically outperforms the [zero-shot pipeline](https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/pipelines#transformers.ZeroShotClassificationPipeline) in 🤗 Transformers (based on MNLI with BART), while being 5x faster to generate predictions with. ### Running hyperparameter search -`SetFitTrainer` provides a `hyperparameter_search()` method that you can use to find good hyperparameters for your data. To use this feature, first install the `optuna` backend: +`Trainer` provides a `hyperparameter_search()` method that you can use to find good hyperparameters for your data. 
To use this feature, first install the `optuna` backend: ```bash python -m pip install setfit[optuna] @@ -267,23 +255,23 @@ def hp_space(trial): # Training parameters **Note:** In practice, we found `num_iterations` to be the most important hyperparameter for the contrastive learning process. -The next step is to instantiate a `SetFitTrainer` and call `hyperparameter_search()`: +The next step is to instantiate a `Trainer` and call `hyperparameter_search()`: ```python from datasets import Dataset -from setfit import SetFitTrainer +from setfit import Trainer dataset = Dataset.from_dict( - {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} - ) + {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} +) -trainer = SetFitTrainer( +trainer = Trainer( train_dataset=dataset, eval_dataset=dataset, model_init=model_init, column_mapping={"text_new": "text", "label_new": "label"}, ) -best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, n_trials=20) +best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, n_trials=5) ``` Finally, you can apply the hyperparameters you found to the trainer, and lock in the optimal model, before training for @@ -300,9 +288,8 @@ If you have access to unlabeled data, you can use knowledge distillation to comp ```python from datasets import load_dataset -from sentence_transformers.losses import CosineSimilarityLoss - -from setfit import SetFitModel, SetFitTrainer, DistillationSetFitTrainer, sample_dataset +from setfit import SetFitModel, Trainer, DistillationTrainer, sample_dataset +from setfit.training_args import TrainingArguments # Load a dataset from the Hugging Face Hub dataset = load_dataset("ag_news") @@ -320,34 +307,37 @@ teacher_model = SetFitModel.from_pretrained( ) # Create trainer for teacher model -teacher_trainer = SetFitTrainer( +teacher_trainer = Trainer( model=teacher_model, train_dataset=train_dataset_teacher, 
eval_dataset=eval_dataset, - loss_class=CosineSimilarityLoss, ) # Train teacher model teacher_trainer.train() +teacher_metrics = teacher_trainer.evaluate() # Load small student model student_model = SetFitModel.from_pretrained("paraphrase-MiniLM-L3-v2") +args = TrainingArguments( + batch_size=16, + num_iterations=20, + num_epochs=1 +) + # Create trainer for knowledge distillation -student_trainer = DistillationSetFitTrainer( +student_trainer = DistillationTrainer( teacher_model=teacher_model, - train_dataset=train_dataset_student, student_model=student_model, + args=args, + train_dataset=train_dataset_student, eval_dataset=eval_dataset, - loss_class=CosineSimilarityLoss, - metric="accuracy", - batch_size=16, - num_iterations=20, - num_epochs=1, ) # Train student with knowledge distillation student_trainer.train() +student_metrics = student_trainer.evaluate() ``` @@ -402,7 +392,8 @@ make style && make quality ## Citation -```@misc{https://doi.org/10.48550/arxiv.2209.11055, +``` +@misc{https://doi.org/10.48550/arxiv.2209.11055, doi = {10.48550/ARXIV.2209.11055}, url = {https://arxiv.org/abs/2209.11055}, author = {Tunstall, Lewis and Reimers, Nils and Jo, Unso Eun Seo and Bates, Luke and Korat, Daniel and Wasserblat, Moshe and Pereg, Oren}, @@ -410,5 +401,6 @@ make style && make quality title = {Efficient Few-Shot Learning Without Prompts}, publisher = {arXiv}, year = {2022}, - copyright = {Creative Commons Attribution 4.0 International}} + copyright = {Creative Commons Attribution 4.0 International} +} ``` diff --git a/docs/source/en/quickstart.mdx b/docs/source/en/quickstart.mdx index cc10ba5b..74bd75de 100644 --- a/docs/source/en/quickstart.mdx +++ b/docs/source/en/quickstart.mdx @@ -18,9 +18,7 @@ Here is an end-to-end example using a classification head from `scikit-learn`: ```python from datasets import load_dataset -from sentence_transformers.losses import CosineSimilarityLoss - -from setfit import SetFitModel, SetFitTrainer, sample_dataset +from setfit import 
SetFitModel, Trainer, TrainingArguments, sample_dataset # Load a dataset from the Hugging Face Hub @@ -33,17 +31,19 @@ eval_dataset = dataset["validation"] # Load a SetFit model from Hub model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2") -# Create trainer -trainer = SetFitTrainer( +args = TrainingArguments( + batch_size=16, + num_iterations=20, # The number of text pairs to generate for contrastive learning + num_epochs=1 # The number of epochs to use for contrastive learning +) + +trainer = Trainer( model=model, + args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, - loss_class=CosineSimilarityLoss, metric="accuracy", - batch_size=16, - num_iterations=20, # The number of text pairs to generate for contrastive learning - num_epochs=1, # The number of epochs to use for contrastive learning - column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer + column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer ) # Train and evaluate @@ -53,7 +53,7 @@ metrics = trainer.evaluate() # Push model to the Hub trainer.push_to_hub("my-awesome-setfit-model") -# Download from Hub and run inference +# Download from Hub model = SetFitModel.from_pretrained("lewtun/my-awesome-setfit-model") # Run inference preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"]) @@ -64,9 +64,7 @@ Here is an end-to-end example using `SetFitHead`: ```python from datasets import load_dataset -from sentence_transformers.losses import CosineSimilarityLoss - -from setfit import SetFitModel, SetFitTrainer, sample_dataset +from setfit import SetFitModel, Trainer, TrainingArguments, sample_dataset # Load a dataset from the Hugging Face Hub @@ -75,6 +73,7 @@ dataset = load_dataset("sst2") # Simulate the few-shot regime by sampling 8 examples per class train_dataset = sample_dataset(dataset["train"], label_column="label", 
num_samples=8) eval_dataset = dataset["validation"] +num_classes = 2 # Load a SetFit model from Hub model = SetFitModel.from_pretrained( @@ -83,36 +82,26 @@ model = SetFitModel.from_pretrained( head_params={"out_features": num_classes}, ) -# Create trainer -trainer = SetFitTrainer( +args = TrainingArguments( + body_learning_rate=2e-5, + head_learning_rate=1e-2, + batch_size=16, + num_iterations=20, # The number of text pairs to generate for contrastive learning + num_epochs=(1, 25), # For finetuning the embeddings and training the classifier, respectively + l2_weight=0.0, + end_to_end=False, # Don't train the classifier end-to-end, i.e. only train the head +) + +trainer = Trainer( model=model, train_dataset=train_dataset, eval_dataset=eval_dataset, - loss_class=CosineSimilarityLoss, metric="accuracy", - batch_size=16, - num_iterations=20, # The number of text pairs to generate for contrastive learning - num_epochs=1, # The number of epochs to use for contrastive learning - column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer + column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer ) # Train and evaluate -trainer.freeze() # Freeze the head -trainer.train() # Train only the body - -# Unfreeze the head and freeze the body -> head-only training -trainer.unfreeze(keep_body_frozen=True) -# or -# Unfreeze the head and unfreeze the body -> end-to-end training -trainer.unfreeze(keep_body_frozen=False) - -trainer.train( - num_epochs=25, # The number of epochs to train the head or the whole model (body and head) - batch_size=16, - body_learning_rate=1e-5, # The body's learning rate - learning_rate=1e-2, # The head's learning rate - l2_weight=0.0, # Weight decay on **both** the body and head. If `None`, will use 0.01. 
-) +trainer.train() metrics = trainer.evaluate() # Push model to the Hub @@ -147,7 +136,7 @@ This will initialise a multilabel classification head from `sklearn` - the follo * `multi-output`: uses a `MultiOutputClassifier` head. * `classifier-chain`: uses a `ClassifierChain` head. -From here, you can instantiate a `SetFitTrainer` using the same example above, and train it as usual. +From here, you can instantiate a `Trainer` using the same example above, and train it as usual. #### Example using the differentiable `SetFitHead`: @@ -168,7 +157,6 @@ model = SetFitModel.from_pretrained( SetFit can also be applied to scenarios where no labels are available. To do so, create a synthetic dataset of training examples: ```python -from datasets import Dataset from setfit import get_templated_dataset candidate_labels = ["negative", "positive"] @@ -178,22 +166,22 @@ train_dataset = get_templated_dataset(candidate_labels=candidate_labels, sample_ This will create examples of the form `"This sentence is {}"`, where the `{}` is filled in with one of the candidate labels. From here you can train a SetFit model as usual: ```python -from setfit import SetFitModel, SetFitTrainer +from setfit import SetFitModel, Trainer model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2") -trainer = SetFitTrainer( +trainer = Trainer( model=model, train_dataset=train_dataset ) trainer.train() ``` -We find this approach typically outperforms the [zero-shot pipeline](https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/pipelines#transformers.ZeroShotClassificationPipeline) in 🤗 Transformers (based on MNLI with Bart), while being 5x faster to generate predictions with. +We find this approach typically outperforms the [zero-shot pipeline](https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/pipelines#transformers.ZeroShotClassificationPipeline) in 🤗 Transformers (based on MNLI with BART), while being 5x faster to generate predictions with. 
### Running hyperparameter search -`SetFitTrainer` provides a `hyperparameter_search()` method that you can use to find good hyperparameters for your data. To use this feature, first install the `optuna` backend: +`Trainer` provides a `hyperparameter_search()` method that you can use to find good hyperparameters for your data. To use this feature, first install the `optuna` backend: ```bash python -m pip install setfit[optuna] @@ -239,23 +227,23 @@ def hp_space(trial): # Training parameters **Note:** In practice, we found `num_iterations` to be the most important hyperparameter for the contrastive learning process. -The next step is to instantiate a `SetFitTrainer` and call `hyperparameter_search()`: +The next step is to instantiate a `Trainer` and call `hyperparameter_search()`: ```python from datasets import Dataset -from setfit import SetFitTrainer +from setfit import Trainer dataset = Dataset.from_dict( - {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} - ) + {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} +) -trainer = SetFitTrainer( +trainer = Trainer( train_dataset=dataset, eval_dataset=dataset, model_init=model_init, column_mapping={"text_new": "text", "label_new": "label"}, ) -best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, n_trials=20) +best_run = trainer.hyperparameter_search(direction="maximize", hp_space=hp_space, n_trials=5) ``` Finally, you can apply the hyperparameters you found to the trainer, and lock in the optimal model, before training for @@ -272,9 +260,8 @@ If you have access to unlabeled data, you can use knowledge distillation to comp ```python from datasets import load_dataset -from sentence_transformers.losses import CosineSimilarityLoss - -from setfit import SetFitModel, SetFitTrainer, DistillationSetFitTrainer, sample_dataset +from setfit import SetFitModel, Trainer, DistillationTrainer, sample_dataset +from 
setfit.training_args import TrainingArguments # Load a dataset from the Hugging Face Hub dataset = load_dataset("ag_news") @@ -292,32 +279,35 @@ teacher_model = SetFitModel.from_pretrained( ) # Create trainer for teacher model -teacher_trainer = SetFitTrainer( +teacher_trainer = Trainer( model=teacher_model, train_dataset=train_dataset_teacher, eval_dataset=eval_dataset, - loss_class=CosineSimilarityLoss, ) # Train teacher model teacher_trainer.train() +teacher_metrics = teacher_trainer.evaluate() # Load small student model student_model = SetFitModel.from_pretrained("paraphrase-MiniLM-L3-v2") +args = TrainingArguments( + batch_size=16, + num_iterations=20, + num_epochs=1 +) + # Create trainer for knowledge distillation -student_trainer = DistillationSetFitTrainer( +student_trainer = DistillationTrainer( teacher_model=teacher_model, - train_dataset=train_dataset_student, student_model=student_model, + args=args, + train_dataset=train_dataset_student, eval_dataset=eval_dataset, - loss_class=CosineSimilarityLoss, - metric="accuracy", - batch_size=16, - num_iterations=20, - num_epochs=1, ) # Train student with knowledge distillation student_trainer.train() +student_metrics = student_trainer.evaluate() ``` \ No newline at end of file From deb57fffd03b733e049b8d4750a6dec3799c04b9 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 6 Mar 2023 11:04:11 +0100 Subject: [PATCH 30/77] Remove references of SetFitTrainer --- README.md | 2 +- docs/source/en/api/trainer.mdx | 8 ++++---- docs/source/en/quickstart.mdx | 2 +- src/setfit/integrations.py | 4 ++-- src/setfit/trainer.py | 10 +++++----- src/setfit/trainer_distillation.py | 2 +- src/setfit/utils.py | 2 +- tests/exporters/test_onnx.py | 29 ++++++++++++++--------------- tests/test_trainer.py | 10 +++++----- tests/test_trainer_distillation.py | 2 +- 10 files changed, 35 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 71bddb45..93bc9efe 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ The 
examples below provide a quick overview on the various features supported in `setfit` is integrated with the [Hugging Face Hub](https://huggingface.co/) and provides two main classes: * `SetFitModel`: a wrapper that combines a pretrained body from `sentence_transformers` and a classification head from either [`scikit-learn`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) or [`SetFitHead`](https://github.com/huggingface/setfit/blob/main/src/setfit/modeling.py) (a differentiable head built upon `PyTorch` with similar APIs to `sentence_transformers`). -* `SetFitTrainer`: a helper class that wraps the fine-tuning process of SetFit. +* `Trainer`: a helper class that wraps the fine-tuning process of SetFit. Here is an end-to-end example using a classification head from `scikit-learn`: diff --git a/docs/source/en/api/trainer.mdx b/docs/source/en/api/trainer.mdx index a51df833..4b605dc8 100644 --- a/docs/source/en/api/trainer.mdx +++ b/docs/source/en/api/trainer.mdx @@ -1,8 +1,8 @@ -# SetFitTrainer +# Trainer -[[autodoc]] SetFitTrainer +[[autodoc]] Trainer -# DistillationSetFitTrainer +# DistillationTrainer -[[autodoc]] DistillationSetFitTrainer \ No newline at end of file +[[autodoc]] DistillationTrainer \ No newline at end of file diff --git a/docs/source/en/quickstart.mdx b/docs/source/en/quickstart.mdx index 74bd75de..9e46933b 100644 --- a/docs/source/en/quickstart.mdx +++ b/docs/source/en/quickstart.mdx @@ -11,7 +11,7 @@ The examples below provide a quick overview on the various features supported in `setfit` is integrated with the [Hugging Face Hub](https://huggingface.co/) and provides two main classes: * `SetFitModel`: a wrapper that combines a pretrained body from `sentence_transformers` and a classification head from either [`scikit-learn`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) or 
[`SetFitHead`](https://github.com/huggingface/setfit/blob/main/src/setfit/modeling.py) (a differentiable head built upon `PyTorch` with similar APIs to `sentence_transformers`). -* `SetFitTrainer`: a helper class that wraps the fine-tuning process of SetFit. +* `Trainer`: a helper class that wraps the fine-tuning process of SetFit. Here is an end-to-end example using a classification head from `scikit-learn`: diff --git a/src/setfit/integrations.py b/src/setfit/integrations.py index 94d7161e..44b4858a 100644 --- a/src/setfit/integrations.py +++ b/src/setfit/integrations.py @@ -5,7 +5,7 @@ if TYPE_CHECKING: - from .trainer import SetFitTrainer + from .trainer import Trainer def is_optuna_available(): @@ -17,7 +17,7 @@ def default_hp_search_backend(): return "optuna" -def run_hp_search_optuna(trainer: "SetFitTrainer", n_trials: int, direction: str, **kwargs) -> BestRun: +def run_hp_search_optuna(trainer: "Trainer", n_trials: int, direction: str, **kwargs) -> BestRun: import optuna # Heavily inspired by transformers.integrations.run_hp_search_optuna diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 0fff050c..aa2d08d9 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -51,7 +51,7 @@ class Trainer: The evaluation dataset. model_init (`Callable[[], SetFitModel]`, *optional*): A function that instantiates the model to be used. If provided, each call to - [`~SetFitTrainer.train`] will start from a new instance of the model as given by this + [`~Trainer.train`] will start from a new instance of the model as given by this function when a `trial` is passed. metric (`str` or `Callable`, *optional*, defaults to `"accuracy"`): The metric to use for evaluation. 
If a string is provided, we treat it as the metric @@ -86,10 +86,10 @@ def __init__( if model_init is not None: model = self.call_model_init() else: - raise RuntimeError("`SetFitTrainer` requires either a `model` or `model_init` argument.") + raise RuntimeError("`Trainer` requires either a `model` or `model_init` argument.") else: if model_init is not None: - raise RuntimeError("`SetFitTrainer` requires either a `model` or `model_init` argument, but not both.") + raise RuntimeError("`Trainer` requires either a `model` or `model_init` argument, but not both.") self.model = model self.hp_search_backend = None @@ -115,7 +115,7 @@ def _validate_column_mapping(self, dataset: "Dataset") -> None: raise ValueError( f"SetFit expected the dataset to have the columns {sorted(self._REQUIRED_COLUMNS)}, " f"but only the columns {sorted(column_names)} were found. " - "Either make sure these columns are present, or specify which columns to use with column_mapping in SetFitTrainer." + "Either make sure these columns are present, or specify which columns to use with column_mapping in Trainer." ) if self.column_mapping is not None: missing_columns = self._REQUIRED_COLUMNS.difference(self.column_mapping.values()) @@ -432,7 +432,7 @@ def hyperparameter_search( - To use this method, you need to have provided a `model_init` when initializing your [`SetFitTrainer`]: we need to + To use this method, you need to have provided a `model_init` when initializing your [`Trainer`]: we need to reinitialize the model at each new run. diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index 41bfa26d..f0ecb91e 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -40,7 +40,7 @@ class DistillationTrainer(Trainer): The evaluation dataset. model_init (`Callable[[], SetFitModel]`, *optional*): A function that instantiates the model to be used. 
If provided, each call to - [`~SetFitTrainer.train`] will start from a new instance of the model as given by this + [`~DistillationTrainer.train`] will start from a new instance of the model as given by this function when a `trial` is passed. metric (`str` or `Callable`, *optional*, defaults to `"accuracy"`): The metric to use for evaluation. If a string is provided, we treat it as the metric diff --git a/src/setfit/utils.py b/src/setfit/utils.py index 352f9c03..d75dc7cf 100644 --- a/src/setfit/utils.py +++ b/src/setfit/utils.py @@ -135,7 +135,7 @@ def summary(self) -> None: class BestRun(NamedTuple): """ - The best run found by a hyperparameter search (see [`~SetFitTrainer.hyperparameter_search`]). + The best run found by a hyperparameter search (see [`~Trainer.hyperparameter_search`]). Parameters: run_id (`str`): diff --git a/tests/exporters/test_onnx.py b/tests/exporters/test_onnx.py index 9eb68578..6b49b96b 100644 --- a/tests/exporters/test_onnx.py +++ b/tests/exporters/test_onnx.py @@ -8,7 +8,8 @@ from setfit import SetFitModel from setfit.data import get_templated_dataset from setfit.exporters.onnx import export_onnx -from setfit.trainer import SetFitTrainer +from setfit.trainer import Trainer +from setfit.training_args import TrainingArguments def test_export_onnx_sklearn_head(): @@ -63,25 +64,23 @@ def test_export_onnx_torch_head(out_features): model_path, use_differentiable_head=True, head_params={"out_features": out_features} ) - trainer = SetFitTrainer( + args = TrainingArguments( + num_iterations=15, + num_epochs=(1, 15), + batch_size=16, + body_learning_rate=(2e-5, 1e-5), + head_learning_rate=1e-2, + l2_weight=0.0, + end_to_end=True, + ) + trainer = Trainer( model=model, + args=args, train_dataset=dataset, eval_dataset=dataset, - num_iterations=15, column_mapping={"text": "text", "label": "label"}, ) - # Train and evaluate - trainer.freeze() # Freeze the head - trainer.train() # Train only the body - # Unfreeze the head and unfreeze the body -> 
end-to-end training - trainer.unfreeze(keep_body_frozen=False) - trainer.train( - num_epochs=15, - batch_size=16, - body_learning_rate=1e-5, - learning_rate=1e-2, - l2_weight=0.0, - ) + trainer.train() # Export the sklearn based model output_path = "model.onnx" diff --git a/tests/test_trainer.py b/tests/test_trainer.py index a964aaf2..33c6e14b 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -22,7 +22,7 @@ logging.enable_propagation() -class SetFitTrainerTest(TestCase): +class TrainerTest(TestCase): def setUp(self): self.model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") self.args = TrainingArguments(num_iterations=1) @@ -81,7 +81,7 @@ def test_trainer_raises_error_with_missing_text(self): expected_message = re.escape( "SetFit expected the dataset to have the columns ['label', 'text'], " "but only the columns ['extra_column', 'label'] were found. " - "Either make sure these columns are present, or specify which columns to use with column_mapping in SetFitTrainer." + "Either make sure these columns are present, or specify which columns to use with column_mapping in Trainer." 
) with pytest.raises(ValueError, match=expected_message): trainer._validate_column_mapping(trainer.train_dataset) @@ -205,7 +205,7 @@ def test_raise_when_metric_value_is_invalid(self): trainer.evaluate() -class SetFitTrainerDifferentiableHeadTest(TestCase): +class TrainerDifferentiableHeadTest(TestCase): def setUp(self): self.dataset = Dataset.from_dict( {"text_new": ["a", "b", "c"], "label_new": [0, 1, 2], "extra_column": ["d", "e", "f"]} @@ -262,7 +262,7 @@ def test_trainer_max_length_is_smaller_than_max_acceptable_length(self): raise AssertionError(e) -class SetFitTrainerMultilabelTest(TestCase): +class TrainerMultilabelTest(TestCase): def setUp(self): self.model = SetFitModel.from_pretrained( "sentence-transformers/paraphrase-albert-small-v2", multi_target_strategy="one-vs-rest" @@ -302,7 +302,7 @@ def compute_metrics(y_pred, y_test): ) -class SetFitTrainerMultilabelDifferentiableTest(TestCase): +class TrainerMultilabelDifferentiableTest(TestCase): def setUp(self): self.model = SetFitModel.from_pretrained( "sentence-transformers/paraphrase-albert-small-v2", diff --git a/tests/test_trainer_distillation.py b/tests/test_trainer_distillation.py index 71b81e0a..a9d80c47 100644 --- a/tests/test_trainer_distillation.py +++ b/tests/test_trainer_distillation.py @@ -8,7 +8,7 @@ from setfit.training_args import TrainingArguments -class DistillationSetFitTrainerTest(TestCase): +class DistillationTrainerTest(TestCase): def setUp(self): self.teacher_model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") self.student_model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-MiniLM-L3-v2") From 46922d5d72be1ab400733a9ce3f0abf49e5133e3 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Mon, 6 Mar 2023 11:13:31 +0100 Subject: [PATCH 31/77] Update expected test output --- tests/test_deprecated_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_deprecated_trainer.py b/tests/test_deprecated_trainer.py 
index 3bbcf587..b8f122d3 100644 --- a/tests/test_deprecated_trainer.py +++ b/tests/test_deprecated_trainer.py @@ -86,7 +86,7 @@ def test_trainer_raises_error_with_missing_text(self): expected_message = re.escape( "SetFit expected the dataset to have the columns ['label', 'text'], " "but only the columns ['extra_column', 'label'] were found. " - "Either make sure these columns are present, or specify which columns to use with column_mapping in SetFitTrainer." + "Either make sure these columns are present, or specify which columns to use with column_mapping in Trainer." ) with pytest.raises(ValueError, match=expected_message): trainer._validate_column_mapping(trainer.train_dataset) From b0f9f582e3782d3aa3c54341fb48c588c29b25dd Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 19 Apr 2023 09:04:34 +0200 Subject: [PATCH 32/77] Remove unused pipeline --- src/setfit/pipeline.py | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 src/setfit/pipeline.py diff --git a/src/setfit/pipeline.py b/src/setfit/pipeline.py deleted file mode 100644 index 51e551ff..00000000 --- a/src/setfit/pipeline.py +++ /dev/null @@ -1,12 +0,0 @@ -from .modeling import SKLearnWrapper - - -class SetFitPipeline: - def __init__(self, model_name_or_path) -> None: - base_model = SKLearnWrapper() - base_model.load(model_name_or_path) - self.model = base_model - - def __call__(self, inputs, *args, **kwargs): - model_outputs = self.model.predict(inputs) - return model_outputs From 339f332e820c4d1f78194abbee4269db19a584d4 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 19 Apr 2023 09:05:35 +0200 Subject: [PATCH 33/77] Execute deprecations --- src/setfit/data.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/setfit/data.py b/src/setfit/data.py index ce809bbd..8128e376 100644 --- a/src/setfit/data.py +++ b/src/setfit/data.py @@ -21,15 +21,6 @@ SAMPLE_SIZES = [2, 4, 8, 16, 32, 64] -def get_augmented_samples(*args, **kwargs) -> None: - warnings.warn( - 
"`get_augmented_samples` has been deprecated and will be removed in v1.0.0 of SetFit. " - "Please use `get_templated_dataset` instead.", - DeprecationWarning, - stacklevel=2, - ) - - def get_templated_dataset( dataset: Optional[Dataset] = None, candidate_labels: Optional[List[str]] = None, @@ -115,15 +106,6 @@ def get_templated_dataset( return dataset -def add_templated_examples(*args, **kwargs) -> None: - warnings.warn( - "`add_templated_examples` has been deprecated and will be removed in v1.0.0 of SetFit. " - "Please use `get_templated_dataset` instead.", - DeprecationWarning, - stacklevel=2, - ) - - def get_candidate_labels(dataset_name: str, label_names_column: str = "label_text") -> List[str]: dataset = load_dataset(dataset_name, split="train") From 9e0bf78d1c86188e14f17023bbbf4819abce98e9 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 19 Apr 2023 09:16:53 +0200 Subject: [PATCH 34/77] Stop importing now-removed function --- src/setfit/__init__.py | 2 +- src/setfit/data.py | 1 - src/setfit/modeling.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/setfit/__init__.py b/src/setfit/__init__.py index 02dad09b..c36d630d 100644 --- a/src/setfit/__init__.py +++ b/src/setfit/__init__.py @@ -2,7 +2,7 @@ import warnings -from .data import add_templated_examples, get_templated_dataset, sample_dataset +from .data import get_templated_dataset, sample_dataset from .modeling import SetFitHead, SetFitModel from .trainer import SetFitTrainer, Trainer from .trainer_distillation import DistillationSetFitTrainer, DistillationTrainer diff --git a/src/setfit/data.py b/src/setfit/data.py index 8128e376..44507018 100644 --- a/src/setfit/data.py +++ b/src/setfit/data.py @@ -1,4 +1,3 @@ -import warnings from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import pandas as pd diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index e5f23cf1..1a0b947e 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -1,6 +1,6 @@ 
import os -import warnings import tempfile +import warnings from dataclasses import dataclass from pathlib import Path from typing import Dict, List, Optional, Tuple, Union From ecabbcfbdc842cbc2b3c7592a933d9fb9efd7bd5 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 6 Jul 2023 22:22:57 +0200 Subject: [PATCH 35/77] Initial setup for logging & callbacks --- .gitignore | 4 + src/setfit/sentence_transformer.py | 296 +++++++++++++++++++++++++++++ src/setfit/trainer.py | 179 +++++++++++++---- src/setfit/training_args.py | 116 +++++++++++ 4 files changed, 553 insertions(+), 42 deletions(-) create mode 100644 src/setfit/sentence_transformer.py diff --git a/.gitignore b/.gitignore index a13745c3..6e89ff50 100644 --- a/.gitignore +++ b/.gitignore @@ -149,3 +149,7 @@ scripts/tfew/run_tmux.sh # macOS .DS_Store .vscode/settings.json + +# Common SetFit Trainer logging folders +wandb +runs/ \ No newline at end of file diff --git a/src/setfit/sentence_transformer.py b/src/setfit/sentence_transformer.py new file mode 100644 index 00000000..b24b64a1 --- /dev/null +++ b/src/setfit/sentence_transformer.py @@ -0,0 +1,296 @@ +import time +from typing import Callable, Dict, Iterable, Tuple, Type +from sentence_transformers import SentenceTransformer +from sentence_transformers.evaluation import SentenceEvaluator +from sentence_transformers.util import batch_to_device +from torch.utils.data import DataLoader +from torch import nn +from torch.optim import Optimizer +import torch +from tqdm.autonotebook import trange, tqdm +from transformers.trainer_callback import TrainerState, TrainerControl, CallbackHandler +from transformers.trainer_utils import speed_metrics + +from setfit.training_args import TrainingArguments + + +def log(args: TrainingArguments, callback_handler: CallbackHandler, state: TrainerState, control: TrainerControl, logs: Dict[str, float]) -> None: + """ + Log `logs` on the various objects watching training. 
+ + Subclass and override this method to inject custom behavior. + + Args: + logs (`Dict[str, float]`): + The values to log. + """ + if state.epoch is not None: + logs["epoch"] = round(state.epoch, 2) + + output = {**logs, **{"step": state.global_step}} + state.log_history.append(output) + return callback_handler.on_log(args, state, control, logs) + + +def fit( + model_body: SentenceTransformer, + train_dataloader: DataLoader, + eval_dataloader: DataLoader, + loss_func: nn.Module, + args: TrainingArguments, + callback_handler: CallbackHandler, + state: TrainerState, + control: TrainerControl, + # evaluator: SentenceEvaluator = None, # <- remove + # epochs: int = 1, # <- remove + # steps_per_epoch=None, # <- remove? + scheduler: str = "WarmupLinear", + warmup_steps: int = 10000, + optimizer_class: Type[Optimizer] = torch.optim.AdamW, + optimizer_params: Dict[str, object] = {"lr": 2e-5}, + weight_decay: float = 0.01, + output_path: str = None, + save_best_model: bool = True, + max_grad_norm: float = 1, + use_amp: bool = False, + # callback: Callable[[float, int, int], None] = None, # <- remove + show_progress_bar: bool = True, + checkpoint_path: str = None, # <- remove + checkpoint_save_steps: int = 500, # <- remove + checkpoint_save_total_limit: int = 0, # <- remove +): + """ + Train the model with the given training objective + Each training objective is sampled in turn for one batch. + We sample only as many batches from each objective as there are in the smallest one + to make sure of equal training with each dataset. + + :param train_objectives: Tuples of (DataLoader, LossFunction). Pass more than one for multi-task learning + :param evaluator: An evaluator (sentence_transformers.evaluation) evaluates the model performance during training on held-out dev data. It is used to determine the best model that is saved to disc. + :param epochs: Number of epochs for training + :param steps_per_epoch: Number of training steps per epoch. 
If set to None (default), one epoch is equal to the DataLoader size from train_objectives. + :param scheduler: Learning rate scheduler. Available schedulers: constantlr, warmupconstant, warmuplinear, warmupcosine, warmupcosinewithhardrestarts + :param warmup_steps: Behavior depends on the scheduler. For WarmupLinear (default), the learning rate is increased from 0 up to the maximal learning rate. After these many training steps, the learning rate is decreased linearly back to zero. + :param optimizer_class: Optimizer + :param optimizer_params: Optimizer parameters + :param weight_decay: Weight decay for model parameters + :param evaluation_steps: If > 0, evaluate the model using evaluator after each number of training steps + :param output_path: Storage path for the model and evaluation files + :param save_best_model: If true, the best model (according to evaluator) is stored at output_path + :param max_grad_norm: Used for gradient normalization. + :param use_amp: Use Automatic Mixed Precision (AMP). Only for Pytorch >= 1.6.0 + :param callback: Callback function that is invoked after each evaluation. 
+ It must accept the following three parameters in this order: + `score`, `epoch`, `steps` + :param show_progress_bar: If True, output a tqdm progress bar + :param checkpoint_path: Folder to save checkpoints during training + :param checkpoint_save_steps: Will save a checkpoint after so many steps + :param checkpoint_save_total_limit: Total number of checkpoints to store + """ + + """ + ##Add info to model card + # info_loss_functions = "\n".join(["- {} with {} training examples".format(str(loss), len(dataloader)) for dataloader, loss in train_objectives]) + info_loss_functions = [] + for dataloader, loss in train_objectives: + info_loss_functions.extend(ModelCardTemplate.get_train_objective_info(dataloader, loss)) + info_loss_functions = "\n\n".join([text for text in info_loss_functions]) + + info_fit_parameters = json.dumps( + { + "evaluator": fullname(evaluator), + "epochs": epochs, + "steps_per_epoch": steps_per_epoch, + "scheduler": scheduler, + "warmup_steps": warmup_steps, + "optimizer_class": str(optimizer_class), + "optimizer_params": optimizer_params, + "weight_decay": weight_decay, + "evaluation_steps": evaluation_steps, + "max_grad_norm": max_grad_norm, + }, + indent=4, + sort_keys=True, + ) + self._model_card_text = None + self._model_card_vars["{TRAINING_SECTION}"] = ModelCardTemplate.__TRAINING_SECTION__.replace( + "{LOSS_FUNCTIONS}", info_loss_functions + ).replace("{FIT_PARAMETERS}", info_fit_parameters) + """ + # TODO: Loading best model + # TODO: Saving/checkpointing + # TODO: args.gradient_accumulation_steps + # TODO: fp16/bf16, etc. + + state.epoch = 0 + start_time = time.time() + # TODO: Add max_steps via args.max_steps here? 
+ state.max_steps = len(train_dataloader) * args.embedding_num_epochs + control = callback_handler.on_train_begin(args, state, control) + + if use_amp: + from torch.cuda.amp import autocast + + scaler = torch.cuda.amp.GradScaler() + + model_body.to(model_body._target_device) + + # Use smart batching + train_dataloader.collate_fn = model_body.smart_batching_collate + if eval_dataloader: + eval_dataloader.collate_fn = model_body.smart_batching_collate + + loss_func.to(model_body._target_device) + + model_body.best_score = -9999999 + + steps_per_epoch = len(train_dataloader) + num_train_steps = int(steps_per_epoch * args.embedding_num_epochs) + + # Prepare optimizers + param_optimizer = list(loss_func.named_parameters()) + + no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + "weight_decay": weight_decay, + }, + {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] + + optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params) + scheduler_obj = model_body._get_scheduler( + optimizer, scheduler=scheduler, warmup_steps=warmup_steps, t_total=num_train_steps + ) + + data_iterator = iter(train_dataloader) + + skip_scheduler = False + for epoch in range(args.embedding_num_epochs): + control = callback_handler.on_epoch_begin(args, state, control) + + training_steps = 0 + + loss_func.zero_grad() + loss_func.train() + + for step in range(steps_per_epoch): + control = callback_handler.on_step_begin(args, state, control) + + try: + data = next(data_iterator) + except StopIteration: + data_iterator = iter(train_dataloader) + data = next(data_iterator) + + features, labels = data + labels = labels.to(model_body._target_device) + features = list(map(lambda batch: batch_to_device(batch, model_body._target_device), features)) + + if use_amp: + with autocast(): + loss_value = 
loss_func(features, labels) + + scale_before_step = scaler.get_scale() + scaler.scale(loss_value).backward() + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(loss_func.parameters(), max_grad_norm) + scaler.step(optimizer) + scaler.update() + + skip_scheduler = scaler.get_scale() != scale_before_step + else: + loss_value = loss_func(features, labels) + loss_value.backward() + torch.nn.utils.clip_grad_norm_(loss_func.parameters(), max_grad_norm) + optimizer.step() + + optimizer.zero_grad() + + if not skip_scheduler: + scheduler_obj.step() + + training_steps += 1 + + state.global_step += 1 + state.epoch = epoch + (step + 1) / steps_per_epoch + control = callback_handler.on_step_end(args, state, control) + + if control.should_log: + learning_rate = scheduler_obj.get_last_lr()[0] + metrics = {"embedding_loss": round(loss_value.item(), 4), "learning_rate": learning_rate} + control = log(args, callback_handler, state, control, metrics) + + if control.should_evaluate: + # self._eval_during_training(evaluator, output_path, save_best_model, epoch, training_steps, callback) + eval_loss = evaluate_with_loss(model_body, eval_dataloader, loss_func, show_progress_bar, use_amp) + learning_rate = scheduler_obj.get_last_lr()[0] + metrics = {"eval_embedding_loss": round(eval_loss, 4), "learning_rate": learning_rate} + control = log(args, callback_handler, state, control, metrics) + control = callback_handler.on_evaluate(args, state, control, metrics) + if state.best_metric is None or eval_loss < state.best_metric: + state.best_metric = eval_loss + + loss_func.zero_grad() + loss_func.train() + + if ( + checkpoint_path is not None + and checkpoint_save_steps is not None + and checkpoint_save_steps > 0 + and state.global_step % checkpoint_save_steps == 0 + ): + model_body._save_checkpoint(checkpoint_path, checkpoint_save_total_limit, state.global_step) + + if control.should_epoch_stop or control.should_training_stop: + break + + control = callback_handler.on_epoch_end(args, 
state, control) + + if control.should_training_stop: + break + + if output_path is not None: # No evaluator, but output path: save final model version + model_body.save(output_path) + + if checkpoint_path is not None: + model_body._save_checkpoint(checkpoint_path, checkpoint_save_total_limit, state.global_step) + + control = callback_handler.on_train_end(args, state, control) + + num_train_samples = state.max_steps * args.embedding_batch_size # * args.gradient_accumulation_steps + metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=state.max_steps) + # TODO: This isn't always printed + log(args, callback_handler, state, control, metrics) + + # eval_start_time = time.time() + # num_eval_samples = len(eval_dataloader) # args.max_steps * args.embedding_batch_size # * args.gradient_accumulation_steps + # num_eval_steps = num_eval_samples * args.embedding_num_epochs + # metrics.update(speed_metrics("eval", eval_start_time, num_samples=num_eval_samples, num_steps=num_eval_steps)) + + +def evaluate_with_loss(model_body: SentenceTransformer, eval_dataloader: DataLoader, loss_func: nn.Module, show_progress_bar: bool, use_amp: bool): + model_body.eval() + + if use_amp: + from torch.cuda.amp import autocast + + scaler = torch.cuda.amp.GradScaler() + + losses = [] + for data in tqdm(iter(eval_dataloader), leave=False): + features, labels = data + labels = labels.to(model_body._target_device) + features = list(map(lambda batch: batch_to_device(batch, model_body._target_device), features)) + + if use_amp: + with autocast(): + loss_value = loss_func(features, labels) + + losses.append(scaler.scale(loss_value).item()) + else: + losses.append(loss_func(features, labels).item()) + + model_body.train() + return sum(losses) / len(losses) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 5fb4b37f..32ad05c3 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -2,6 +2,8 @@ import warnings from typing import TYPE_CHECKING, Any, 
Callable, Dict, List, Optional, Union +from setfit import sentence_transformer + # Google Colab runs on Python 3.7, so we need this to be compatible try: @@ -18,6 +20,18 @@ from torch.utils.data import DataLoader from tqdm.auto import trange from transformers.trainer_utils import HPSearchBackend, default_compute_objective, number_of_arguments, set_seed +import transformers +from transformers.trainer_callback import ( + CallbackHandler, + DefaultFlowCallback, + PrinterCallback, + ProgressCallback, + TrainerCallback, + TrainerState, + TrainerControl, +) +from transformers.integrations import get_reporting_integration_callbacks, get_available_reporting_integrations +from transformers.utils.import_utils import is_in_notebook from . import logging from .integrations import default_hp_search_backend, is_optuna_available, run_hp_search_optuna @@ -37,6 +51,15 @@ logger = logging.get_logger(__name__) +DEFAULT_CALLBACKS = [DefaultFlowCallback] +DEFAULT_PROGRESS_CALLBACK = ProgressCallback + +if is_in_notebook(): + from transformers.utils.notebook import NotebookProgressCallback + + DEFAULT_PROGRESS_CALLBACK = NotebookProgressCallback + + class Trainer: """Trainer to train a SetFit model. 
@@ -77,9 +100,10 @@ def __init__( model_init: Optional[Callable[[], "SetFitModel"]] = None, metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy", metric_kwargs: Optional[Dict[str, Any]] = None, + callbacks: Optional[List[TrainerCallback]] = None, column_mapping: Optional[Dict[str, str]] = None, ): - self.args = args + self.args = args or TrainingArguments() self.train_dataset = train_dataset self.eval_dataset = eval_dataset self.model_init = model_init @@ -99,6 +123,57 @@ def __init__( self.model = model self.hp_search_backend = None + # Setup the callbacks + default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to) + callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks + # TODO: Observe optimizer and scheduler by wrapping SentenceTransformer._get_scheduler + self.callback_handler = CallbackHandler( + callbacks, self.model.model_body, self.model.model_body.tokenizer, None, None + ) + self.state = TrainerState() + self.control = TrainerControl() + self.add_callback(DEFAULT_PROGRESS_CALLBACK if self.args.show_progress_bar else PrinterCallback) + + self.control = self.callback_handler.on_init_end(args, self.state, self.control) + + def add_callback(self, callback): + """ + Add a callback to the current list of [`~transformer.TrainerCallback`]. + + Args: + callback (`type` or [`~transformer.TrainerCallback`]): + A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the + first case, will instantiate a member of that class. + """ + self.callback_handler.add_callback(callback) + + def pop_callback(self, callback): + """ + Remove a callback from the current list of [`~transformer.TrainerCallback`] and returns it. + + If the callback is not found, returns `None` (and no error is raised). 
+ + Args: + callback (`type` or [`~transformers.TrainerCallback`]): + A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the + first case, will pop the first member of that class found in the list of callbacks. + + Returns: + [`~transformers.TrainerCallback`]: The callback removed, if found. + """ + return self.callback_handler.pop_callback(callback) + + def remove_callback(self, callback): + """ + Remove a callback from the current list of [`~transformers.TrainerCallback`]. + + Args: + callback (`type` or [`~transformers.TrainerCallback`]): + A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the + first case, will remove the first member of that class found in the list of callbacks. + """ + self.callback_handler.remove_callback(callback) + def _validate_column_mapping(self, dataset: "Dataset") -> None: """ Validates the provided column mapping against the dataset. @@ -275,20 +350,28 @@ def train( f"Training requires a `train_dataset` given to the `{self.__class__.__name__}` initialization." 
) - self._validate_column_mapping(self.train_dataset) - train_dataset = self.train_dataset - if self.column_mapping is not None: - logger.info("Applying column mapping to training dataset") - train_dataset = self._apply_column_mapping(self.train_dataset, self.column_mapping) + parameters = [] + for dataset, dataset_name in [(self.train_dataset, "training"), (self.eval_dataset, "evaluation")]: + if dataset is None: + continue - x_train: List[str] = train_dataset["text"] - y_train: List[int] = train_dataset["label"] + self._validate_column_mapping(dataset) + if self.column_mapping is not None: + logger.info(f"Applying column mapping to {dataset_name} dataset") + dataset = self._apply_column_mapping(dataset, self.column_mapping) - self.train_embeddings(x_train, y_train, args) - self.train_classifier(x_train, y_train, args) + parameters.extend([dataset["text"], dataset["label"]]) + + self.train_embeddings(*parameters, args=args) + self.train_classifier(*parameters[:2], args=args) def train_embeddings( - self, x_train: List[str], y_train: Union[List[int], List[List[int]]], args: Optional[TrainingArguments] = None + self, + x_train: List[str], + y_train: Union[List[int], List[List[int]]], + x_eval: List[str], + y_eval: Union[List[int], List[List[int]]], + args: Optional[TrainingArguments] = None, ) -> None: """ Method to perform the embedding phase: finetuning the `SentenceTransformer` body. 
@@ -301,6 +384,36 @@ def train_embeddings( """ args = args or self.args or TrainingArguments() + train_dataloader, loss_func, batch_size = self.get_dataloader(x_train, y_train, args=args) + if x_eval is not None: + eval_dataloader, _, _ = self.get_dataloader(x_eval, y_eval, args=args) + else: + eval_dataloader = None + + total_train_steps = len(train_dataloader) * args.embedding_num_epochs + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataloader)}") + logger.info(f" Num epochs = {args.embedding_num_epochs}") + logger.info(f" Total optimization steps = {total_train_steps}") + logger.info(f" Total train batch size = {batch_size}") + + warmup_steps = math.ceil(total_train_steps * args.warmup_proportion) + sentence_transformer.fit( + self.model.model_body, + train_dataloader=train_dataloader, + loss_func=loss_func, + eval_dataloader=eval_dataloader, + args=args, + callback_handler=self.callback_handler, + state=self.state, + control=self.control, + optimizer_params={"lr": args.body_embedding_learning_rate}, + warmup_steps=warmup_steps, + show_progress_bar=args.show_progress_bar, + use_amp=args.use_amp, + ) + + def get_dataloader(self, x: List[str], y: Union[List[int], List[List[int]]], args: TrainingArguments): # sentence-transformers adaptation if args.loss in [ losses.BatchAllTripletLoss, @@ -309,56 +422,38 @@ def train_embeddings( losses.BatchHardSoftMarginTripletLoss, SupConLoss, ]: - train_examples = [InputExample(texts=[text], label=label) for text, label in zip(x_train, y_train)] - train_data_sampler = SentenceLabelDataset(train_examples, samples_per_label=args.samples_per_label) + examples = [InputExample(texts=[text], label=label) for text, label in zip(x, y)] + data_sampler = SentenceLabelDataset(examples, samples_per_label=args.samples_per_label) - batch_size = min(args.embedding_batch_size, len(train_data_sampler)) - train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=True) + 
batch_size = min(args.embedding_batch_size, len(data_sampler)) + dataloader = DataLoader(data_sampler, batch_size=batch_size, drop_last=True) if args.loss is losses.BatchHardSoftMarginTripletLoss: - train_loss = args.loss( + loss = args.loss( model=self.model.model_body, distance_metric=args.distance_metric, ) elif args.loss is SupConLoss: - train_loss = args.loss(model=self.model.model_body) + loss = args.loss(model=self.model.model_body) else: - train_loss = args.loss( + loss = args.loss( model=self.model.model_body, distance_metric=args.distance_metric, margin=args.margin, ) else: - train_examples = [] + examples = [] for _ in trange(args.num_iterations, desc="Generating Training Pairs", disable=not args.show_progress_bar): if self.model.multi_target_strategy is not None: - train_examples = sentence_pairs_generation_multilabel( - np.array(x_train), np.array(y_train), train_examples - ) + examples = sentence_pairs_generation_multilabel(np.array(x), np.array(y), examples) else: - train_examples = sentence_pairs_generation(np.array(x_train), np.array(y_train), train_examples) + examples = sentence_pairs_generation(np.array(x), np.array(y), examples) batch_size = args.embedding_batch_size - train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size) - train_loss = args.loss(self.model.model_body) - - total_train_steps = len(train_dataloader) * args.embedding_num_epochs - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_examples)}") - logger.info(f" Num epochs = {args.embedding_num_epochs}") - logger.info(f" Total optimization steps = {total_train_steps}") - logger.info(f" Total train batch size = {batch_size}") - - warmup_steps = math.ceil(total_train_steps * args.warmup_proportion) - self.model.model_body.fit( - train_objectives=[(train_dataloader, train_loss)], - epochs=args.embedding_num_epochs, - optimizer_params={"lr": args.body_embedding_learning_rate}, - warmup_steps=warmup_steps, - 
show_progress_bar=args.show_progress_bar, - use_amp=args.use_amp, - ) + dataloader = DataLoader(examples, shuffle=True, batch_size=batch_size) + loss = args.loss(self.model.model_body) + return dataloader, loss, batch_size def train_classifier( self, x_train: List[str], y_train: Union[List[int], List[List[int]]], args: Optional[TrainingArguments] = None diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index ed3cbbdd..2a1aeb3b 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -3,9 +3,15 @@ import inspect from copy import copy from dataclasses import dataclass, field, fields +import json from typing import Any, Callable, Dict, Optional, Tuple, Union from sentence_transformers import losses +import torch +from transformers.integrations import get_available_reporting_integrations +from transformers.training_args import default_logdir +from transformers.utils import is_torch_available +from transformers import IntervalStrategy @dataclass @@ -67,6 +73,53 @@ class TrainingArguments: Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the [`~SetTrainer.model_init`] function to instantiate the model if it has some randomly initialized parameters. + report_to (`str` or `List[str]`, *optional*, defaults to `"all"`): + The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`, + `"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`,`"clearml"` and `"wandb"`. Use `"all"` to report to + all integrations installed, `"none"` for no integrations. + run_name (`str`, *optional*): + A descriptor for the run. Typically used for [wandb](https://www.wandb.com/) and + [mlflow](https://www.mlflow.org/) logging. + logging_dir (`str`, *optional*): + [TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to + *runs/**CURRENT_DATETIME_HOSTNAME***. 
+ logging_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + The logging strategy to adopt during training. Possible values are: + + - `"no"`: No logging is done during training. + - `"epoch"`: Logging is done at the end of each epoch. + - `"steps"`: Logging is done every `logging_steps`. + + logging_first_step (`bool`, *optional*, defaults to `False`): + Whether to log and evaluate the first `global_step` or not. + logging_steps (`int`, *optional*, defaults to 500): + Number of update steps between two logs if `logging_strategy="steps"`. + evaluation_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): + The evaluation strategy to adopt during training. Possible values are: + + - `"no"`: No evaluation is done during training. + - `"steps"`: Evaluation is done (and logged) every `eval_steps`. + - `"epoch"`: Evaluation is done at the end of each epoch. + + eval_steps (`int`, *optional*): + Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same + value as `logging_steps` if not set. + eval_delay (`float`, *optional*): + Number of epochs or steps to wait for before the first evaluation can be performed, depending on the + evaluation_strategy. + + save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + The checkpoint save strategy to adopt during training. Possible values are: + + - `"no"`: No save is done during training. + - `"epoch"`: Save is done at the end of each epoch. + - `"steps"`: Save is done every `save_steps`. + save_steps (`int`, *optional*, defaults to 500): + Number of updates steps before two checkpoint saves if `save_strategy="steps"`. + save_total_limit (`int`, *optional*): + If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in + `output_dir`. 
+ """ # batch_size is only used to conveniently set `embedding_batch_size` and `classifier_batch_size` @@ -107,6 +160,26 @@ class TrainingArguments: show_progress_bar: bool = True seed: int = 42 + # Logging & callbacks + report_to: str = "all" + run_name: Optional[str] = None + logging_dir: Optional[str] = None + logging_strategy: str = "steps" + logging_first_step: bool = False + logging_steps: int = 5 + + evaluation_strategy: str = "steps" + eval_steps: Optional[int] = None + eval_delay: int = 0 + + save_strategy: str = "steps" + save_steps: int = 500 + save_total_limit: Optional[int] = None + + load_best_model_at_end: bool = True + metric_for_best_model: str = field(default="embedding_loss", repr=False) + greater_is_better: bool = field(default=False, repr=False) + def __post_init__(self) -> None: # Set `self.embedding_batch_size` and `self.classifier_batch_size` using values from `self.batch_size` if isinstance(self.batch_size, int): @@ -138,6 +211,29 @@ def __post_init__(self) -> None: f"warmup_proportion must be greater than or equal to 0.0 and less than or equal to 1.0! 
But it was: {self.warmup_proportion}" ) + if self.report_to in (None, "all"): + self.report_to = get_available_reporting_integrations() + + if self.logging_dir is None: + self.logging_dir = default_logdir() + + self.logging_strategy = IntervalStrategy(self.logging_strategy) + self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy) + + # eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero + if self.evaluation_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0): + if self.logging_steps > 0: + self.eval_steps = self.logging_steps + else: + raise ValueError( + f"evaluation strategy {self.evaluation_strategy} requires either non-zero `eval_steps` or" + " `logging_steps`" + ) + + # logging_steps must be non-zero for logging_strategy that is other than 'no' + if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0: + raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps") + def to_dict(self) -> Dict[str, Any]: # filter out fields that are defined as field(init=False) return {field.name: getattr(self, field.name) for field in fields(self) if field.init} @@ -153,3 +249,23 @@ def copy(self) -> TrainingArguments: def update(self, arguments: Dict[str, Any], ignore_extra: bool = False) -> TrainingArguments: return TrainingArguments.from_dict({**self.to_dict(), **arguments}, ignore_extra=ignore_extra) + + def to_json_string(self): + """ + Serializes this instance to a JSON string. 
+ """ + # TODO: This needs to be improved + return json.dumps({key: str(value) for key, value in self.to_dict().items()}, indent=2) + + def to_sanitized_dict(self) -> Dict[str, Any]: + """ + Sanitized serialization to use with TensorBoard’s hparams + """ + d = self.to_dict() + d = {**d, **{"train_batch_size": self.embedding_batch_size, "eval_batch_size": self.embedding_batch_size}} + + valid_types = [bool, int, float, str] + if is_torch_available(): + valid_types.append(torch.Tensor) + + return {k: v if type(v) in valid_types else str(v) for k, v in d.items()} From 6e6720b8054b7f92ca48cf47a544b7a84f6cf249 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 6 Jul 2023 23:28:31 +0200 Subject: [PATCH 36/77] Move sentence-transformer training into trainer.py --- src/setfit/sentence_transformer.py | 296 ----------------------------- src/setfit/trainer.py | 239 ++++++++++++++++++++--- src/setfit/training_args.py | 12 +- 3 files changed, 222 insertions(+), 325 deletions(-) delete mode 100644 src/setfit/sentence_transformer.py diff --git a/src/setfit/sentence_transformer.py b/src/setfit/sentence_transformer.py deleted file mode 100644 index b24b64a1..00000000 --- a/src/setfit/sentence_transformer.py +++ /dev/null @@ -1,296 +0,0 @@ -import time -from typing import Callable, Dict, Iterable, Tuple, Type -from sentence_transformers import SentenceTransformer -from sentence_transformers.evaluation import SentenceEvaluator -from sentence_transformers.util import batch_to_device -from torch.utils.data import DataLoader -from torch import nn -from torch.optim import Optimizer -import torch -from tqdm.autonotebook import trange, tqdm -from transformers.trainer_callback import TrainerState, TrainerControl, CallbackHandler -from transformers.trainer_utils import speed_metrics - -from setfit.training_args import TrainingArguments - - -def log(args: TrainingArguments, callback_handler: CallbackHandler, state: TrainerState, control: TrainerControl, logs: Dict[str, float]) -> None: - 
""" - Log `logs` on the various objects watching training. - - Subclass and override this method to inject custom behavior. - - Args: - logs (`Dict[str, float]`): - The values to log. - """ - if state.epoch is not None: - logs["epoch"] = round(state.epoch, 2) - - output = {**logs, **{"step": state.global_step}} - state.log_history.append(output) - return callback_handler.on_log(args, state, control, logs) - - -def fit( - model_body: SentenceTransformer, - train_dataloader: DataLoader, - eval_dataloader: DataLoader, - loss_func: nn.Module, - args: TrainingArguments, - callback_handler: CallbackHandler, - state: TrainerState, - control: TrainerControl, - # evaluator: SentenceEvaluator = None, # <- remove - # epochs: int = 1, # <- remove - # steps_per_epoch=None, # <- remove? - scheduler: str = "WarmupLinear", - warmup_steps: int = 10000, - optimizer_class: Type[Optimizer] = torch.optim.AdamW, - optimizer_params: Dict[str, object] = {"lr": 2e-5}, - weight_decay: float = 0.01, - output_path: str = None, - save_best_model: bool = True, - max_grad_norm: float = 1, - use_amp: bool = False, - # callback: Callable[[float, int, int], None] = None, # <- remove - show_progress_bar: bool = True, - checkpoint_path: str = None, # <- remove - checkpoint_save_steps: int = 500, # <- remove - checkpoint_save_total_limit: int = 0, # <- remove -): - """ - Train the model with the given training objective - Each training objective is sampled in turn for one batch. - We sample only as many batches from each objective as there are in the smallest one - to make sure of equal training with each dataset. - - :param train_objectives: Tuples of (DataLoader, LossFunction). Pass more than one for multi-task learning - :param evaluator: An evaluator (sentence_transformers.evaluation) evaluates the model performance during training on held-out dev data. It is used to determine the best model that is saved to disc. 
- :param epochs: Number of epochs for training - :param steps_per_epoch: Number of training steps per epoch. If set to None (default), one epoch is equal the DataLoader size from train_objectives. - :param scheduler: Learning rate scheduler. Available schedulers: constantlr, warmupconstant, warmuplinear, warmupcosine, warmupcosinewithhardrestarts - :param warmup_steps: Behavior depends on the scheduler. For WarmupLinear (default), the learning rate is increased from o up to the maximal learning rate. After these many training steps, the learning rate is decreased linearly back to zero. - :param optimizer_class: Optimizer - :param optimizer_params: Optimizer parameters - :param weight_decay: Weight decay for model parameters - :param evaluation_steps: If > 0, evaluate the model using evaluator after each number of training steps - :param output_path: Storage path for the model and evaluation files - :param save_best_model: If true, the best model (according to evaluator) is stored at output_path - :param max_grad_norm: Used for gradient normalization. - :param use_amp: Use Automatic Mixed Precision (AMP). Only for Pytorch >= 1.6.0 - :param callback: Callback function that is invoked after each evaluation. 
- It must accept the following three parameters in this order: - `score`, `epoch`, `steps` - :param show_progress_bar: If True, output a tqdm progress bar - :param checkpoint_path: Folder to save checkpoints during training - :param checkpoint_save_steps: Will save a checkpoint after so many steps - :param checkpoint_save_total_limit: Total number of checkpoints to store - """ - - """ - ##Add info to model card - # info_loss_functions = "\n".join(["- {} with {} training examples".format(str(loss), len(dataloader)) for dataloader, loss in train_objectives]) - info_loss_functions = [] - for dataloader, loss in train_objectives: - info_loss_functions.extend(ModelCardTemplate.get_train_objective_info(dataloader, loss)) - info_loss_functions = "\n\n".join([text for text in info_loss_functions]) - - info_fit_parameters = json.dumps( - { - "evaluator": fullname(evaluator), - "epochs": epochs, - "steps_per_epoch": steps_per_epoch, - "scheduler": scheduler, - "warmup_steps": warmup_steps, - "optimizer_class": str(optimizer_class), - "optimizer_params": optimizer_params, - "weight_decay": weight_decay, - "evaluation_steps": evaluation_steps, - "max_grad_norm": max_grad_norm, - }, - indent=4, - sort_keys=True, - ) - self._model_card_text = None - self._model_card_vars["{TRAINING_SECTION}"] = ModelCardTemplate.__TRAINING_SECTION__.replace( - "{LOSS_FUNCTIONS}", info_loss_functions - ).replace("{FIT_PARAMETERS}", info_fit_parameters) - """ - # TODO: Loading best model - # TODO: Saving/checkpointing - # TODO: args.gradient_accumulation_steps - # TODO: fp16/bf16, etc. - - state.epoch = 0 - start_time = time.time() - # TODO: Add max_steps via args.max_steps here? 
- state.max_steps = len(train_dataloader) * args.embedding_num_epochs - control = callback_handler.on_train_begin(args, state, control) - - if use_amp: - from torch.cuda.amp import autocast - - scaler = torch.cuda.amp.GradScaler() - - model_body.to(model_body._target_device) - - # Use smart batching - train_dataloader.collate_fn = model_body.smart_batching_collate - if eval_dataloader: - eval_dataloader.collate_fn = model_body.smart_batching_collate - - loss_func.to(model_body._target_device) - - model_body.best_score = -9999999 - - steps_per_epoch = len(train_dataloader) - num_train_steps = int(steps_per_epoch * args.embedding_num_epochs) - - # Prepare optimizers - param_optimizer = list(loss_func.named_parameters()) - - no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], - "weight_decay": weight_decay, - }, - {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, - ] - - optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params) - scheduler_obj = model_body._get_scheduler( - optimizer, scheduler=scheduler, warmup_steps=warmup_steps, t_total=num_train_steps - ) - - data_iterator = iter(train_dataloader) - - skip_scheduler = False - for epoch in range(args.embedding_num_epochs): - control = callback_handler.on_epoch_begin(args, state, control) - - training_steps = 0 - - loss_func.zero_grad() - loss_func.train() - - for step in range(steps_per_epoch): - control = callback_handler.on_step_begin(args, state, control) - - try: - data = next(data_iterator) - except StopIteration: - data_iterator = iter(train_dataloader) - data = next(data_iterator) - - features, labels = data - labels = labels.to(model_body._target_device) - features = list(map(lambda batch: batch_to_device(batch, model_body._target_device), features)) - - if use_amp: - with autocast(): - loss_value = 
loss_func(features, labels) - - scale_before_step = scaler.get_scale() - scaler.scale(loss_value).backward() - scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(loss_func.parameters(), max_grad_norm) - scaler.step(optimizer) - scaler.update() - - skip_scheduler = scaler.get_scale() != scale_before_step - else: - loss_value = loss_func(features, labels) - loss_value.backward() - torch.nn.utils.clip_grad_norm_(loss_func.parameters(), max_grad_norm) - optimizer.step() - - optimizer.zero_grad() - - if not skip_scheduler: - scheduler_obj.step() - - training_steps += 1 - - state.global_step += 1 - state.epoch = epoch + (step + 1) / steps_per_epoch - control = callback_handler.on_step_end(args, state, control) - - if control.should_log: - learning_rate = scheduler_obj.get_last_lr()[0] - metrics = {"embedding_loss": round(loss_value.item(), 4), "learning_rate": learning_rate} - control = log(args, callback_handler, state, control, metrics) - - if control.should_evaluate: - # self._eval_during_training(evaluator, output_path, save_best_model, epoch, training_steps, callback) - eval_loss = evaluate_with_loss(model_body, eval_dataloader, loss_func, show_progress_bar, use_amp) - learning_rate = scheduler_obj.get_last_lr()[0] - metrics = {"eval_embedding_loss": round(eval_loss, 4), "learning_rate": learning_rate} - control = log(args, callback_handler, state, control, metrics) - control = callback_handler.on_evaluate(args, state, control, metrics) - if state.best_metric is None or eval_loss < state.best_metric: - state.best_metric = eval_loss - - loss_func.zero_grad() - loss_func.train() - - if ( - checkpoint_path is not None - and checkpoint_save_steps is not None - and checkpoint_save_steps > 0 - and state.global_step % checkpoint_save_steps == 0 - ): - model_body._save_checkpoint(checkpoint_path, checkpoint_save_total_limit, state.global_step) - - if control.should_epoch_stop or control.should_training_stop: - break - - control = callback_handler.on_epoch_end(args, 
state, control) - - if control.should_training_stop: - break - - if output_path is not None: # No evaluator, but output path: save final model version - model_body.save(output_path) - - if checkpoint_path is not None: - model_body._save_checkpoint(checkpoint_path, checkpoint_save_total_limit, state.global_step) - - control = callback_handler.on_train_end(args, state, control) - - num_train_samples = state.max_steps * args.embedding_batch_size # * args.gradient_accumulation_steps - metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=state.max_steps) - # TODO: This isn't always printed - log(args, callback_handler, state, control, metrics) - - # eval_start_time = time.time() - # num_eval_samples = len(eval_dataloader) # args.max_steps * args.embedding_batch_size # * args.gradient_accumulation_steps - # num_eval_steps = num_eval_samples * args.embedding_num_epochs - # metrics.update(speed_metrics("eval", eval_start_time, num_samples=num_eval_samples, num_steps=num_eval_steps)) - - -def evaluate_with_loss(model_body: SentenceTransformer, eval_dataloader: DataLoader, loss_func: nn.Module, show_progress_bar: bool, use_amp: bool): - model_body.eval() - - if use_amp: - from torch.cuda.amp import autocast - - scaler = torch.cuda.amp.GradScaler() - - losses = [] - for data in tqdm(iter(eval_dataloader), leave=False): - features, labels = data - labels = labels.to(model_body._target_device) - features = list(map(lambda batch: batch_to_device(batch, model_body._target_device), features)) - - if use_amp: - with autocast(): - loss_value = loss_func(features, labels) - - losses.append(scaler.scale(loss_value).item()) - else: - losses.append(loss_func(features, labels).item()) - - model_body.train() - return sum(losses) / len(losses) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 32ad05c3..53b0d763 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -1,38 +1,41 @@ import math +import time import warnings from typing 
import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union -from setfit import sentence_transformer - - -# Google Colab runs on Python 3.7, so we need this to be compatible -try: - from typing import Literal -except ImportError: - from typing_extensions import Literal - import evaluate import numpy as np +import torch from datasets import DatasetDict -from sentence_transformers import InputExample, losses +from sentence_transformers import InputExample, SentenceTransformer, losses from sentence_transformers.datasets import SentenceLabelDataset from sentence_transformers.losses.BatchHardTripletLoss import BatchHardTripletLossDistanceFunction +from sentence_transformers.util import batch_to_device +from torch import nn +from torch.cuda.amp import autocast from torch.utils.data import DataLoader -from tqdm.auto import trange -from transformers.trainer_utils import HPSearchBackend, default_compute_objective, number_of_arguments, set_seed -import transformers +from tqdm.autonotebook import tqdm, trange +from transformers.integrations import get_reporting_integration_callbacks from transformers.trainer_callback import ( CallbackHandler, DefaultFlowCallback, PrinterCallback, ProgressCallback, TrainerCallback, - TrainerState, TrainerControl, + TrainerState, +) +from transformers.trainer_utils import ( + HPSearchBackend, + default_compute_objective, + number_of_arguments, + set_seed, + speed_metrics, ) -from transformers.integrations import get_reporting_integration_callbacks, get_available_reporting_integrations from transformers.utils.import_utils import is_in_notebook +from setfit.training_args import TrainingArguments + from . 
import logging from .integrations import default_hp_search_backend, is_optuna_available, run_hp_search_optuna from .losses import SupConLoss @@ -41,6 +44,13 @@ from .utils import BestRun, default_hp_space_optuna +# Google Colab runs on Python 3.7, so we need this to be compatible +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal + + if TYPE_CHECKING: import optuna from datasets import Dataset @@ -398,19 +408,13 @@ def train_embeddings( logger.info(f" Total train batch size = {batch_size}") warmup_steps = math.ceil(total_train_steps * args.warmup_proportion) - sentence_transformer.fit( + self._train_sentence_transformer( self.model.model_body, train_dataloader=train_dataloader, - loss_func=loss_func, eval_dataloader=eval_dataloader, args=args, - callback_handler=self.callback_handler, - state=self.state, - control=self.control, - optimizer_params={"lr": args.body_embedding_learning_rate}, + loss_func=loss_func, warmup_steps=warmup_steps, - show_progress_bar=args.show_progress_bar, - use_amp=args.use_amp, ) def get_dataloader(self, x: List[str], y: Union[List[int], List[List[int]]], args: TrainingArguments): @@ -455,6 +459,195 @@ def get_dataloader(self, x: List[str], y: Union[List[int], List[List[int]]], arg loss = args.loss(self.model.model_body) return dataloader, loss, batch_size + def log(self, args: TrainingArguments, logs: Dict[str, float]) -> None: + """ + Log `logs` on the various objects watching training. + + Subclass and override this method to inject custom behavior. + + Args: + logs (`Dict[str, float]`): + The values to log. 
+ """ + if self.state.epoch is not None: + logs["epoch"] = round(self.state.epoch, 2) + + output = {**logs, **{"step": self.state.global_step}} + self.state.log_history.append(output) + return self.callback_handler.on_log(args, self.state, self.control, logs) + + def _train_sentence_transformer( + self, + model_body: SentenceTransformer, + train_dataloader: DataLoader, + eval_dataloader: DataLoader, + args: TrainingArguments, + loss_func: nn.Module, + warmup_steps: int = 10000, + ) -> None: + """ + Train the model with the given training objective + Each training objective is sampled in turn for one batch. + We sample only as many batches from each objective as there are in the smallest one + to make sure of equal training with each dataset. + """ + # TODO: Loading best model + # TODO: Saving/checkpointing + # TODO: args.gradient_accumulation_steps + # TODO: fp16/bf16, etc. + + # Hardcoded training arguments + max_grad_norm = 1 + weight_decay = 0.01 + + self.state.epoch = 0 + start_time = time.time() + # TODO: Add max_steps via args.max_steps here? 
+ self.state.max_steps = len(train_dataloader) * args.embedding_num_epochs + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + + if args.use_amp: + scaler = torch.cuda.amp.GradScaler() + + model_body.to(model_body._target_device) + loss_func.to(model_body._target_device) + + # Use smart batching + train_dataloader.collate_fn = model_body.smart_batching_collate + if eval_dataloader: + eval_dataloader.collate_fn = model_body.smart_batching_collate + + steps_per_epoch = len(train_dataloader) + num_train_steps = int(steps_per_epoch * args.embedding_num_epochs) + + # Prepare optimizers + param_optimizer = list(loss_func.named_parameters()) + + no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + "weight_decay": weight_decay, + }, + {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] + + optimizer = torch.optim.AdamW(optimizer_grouped_parameters, **{"lr": args.body_embedding_learning_rate}) + scheduler_obj = model_body._get_scheduler( + optimizer, scheduler="WarmupLinear", warmup_steps=warmup_steps, t_total=num_train_steps + ) + + data_iterator = iter(train_dataloader) + skip_scheduler = False + for epoch in range(args.embedding_num_epochs): + self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) + + loss_func.zero_grad() + loss_func.train() + + for step in range(steps_per_epoch): + self.control = self.callback_handler.on_step_begin(args, self.state, self.control) + + try: + data = next(data_iterator) + except StopIteration: + data_iterator = iter(train_dataloader) + data = next(data_iterator) + + features, labels = data + labels = labels.to(model_body._target_device) + features = list(map(lambda batch: batch_to_device(batch, model_body._target_device), features)) + + if args.use_amp: + with autocast(): + loss_value = 
loss_func(features, labels) + + scale_before_step = scaler.get_scale() + scaler.scale(loss_value).backward() + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(loss_func.parameters(), max_grad_norm) + scaler.step(optimizer) + scaler.update() + + skip_scheduler = scaler.get_scale() != scale_before_step + else: + loss_value = loss_func(features, labels) + loss_value.backward() + torch.nn.utils.clip_grad_norm_(loss_func.parameters(), max_grad_norm) + optimizer.step() + + optimizer.zero_grad() + + if not skip_scheduler: + scheduler_obj.step() + + self.state.global_step += 1 + self.state.epoch = epoch + (step + 1) / steps_per_epoch + self.control = self.callback_handler.on_step_end(args, self.state, self.control) + + if self.control.should_log: + learning_rate = scheduler_obj.get_last_lr()[0] + metrics = {"embedding_loss": round(loss_value.item(), 4), "learning_rate": learning_rate} + self.control = self.log(args, metrics) + + if self.control.should_evaluate: + eval_loss = self._evaluate_with_loss(model_body, eval_dataloader, args, loss_func) + learning_rate = scheduler_obj.get_last_lr()[0] + metrics = {"eval_embedding_loss": round(eval_loss, 4), "learning_rate": learning_rate} + self.control = self.log(args, metrics) + + self.control = self.callback_handler.on_evaluate(args, self.state, self.control, metrics) + if self.state.best_metric is None or eval_loss < self.state.best_metric: + self.state.best_metric = eval_loss + + loss_func.zero_grad() + loss_func.train() + + if self.control.should_epoch_stop or self.control.should_training_stop: + break + + self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) + + if self.control.should_training_stop: + break + + # Ensure logging the speed metrics + num_train_samples = self.state.max_steps * args.embedding_batch_size # * args.gradient_accumulation_steps + metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps) + self.control.should_log 
= True + self.log(args, metrics) + + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + + def _evaluate_with_loss( + self, + model_body: SentenceTransformer, + eval_dataloader: DataLoader, + args: TrainingArguments, + loss_func: nn.Module, + ) -> float: + model_body.eval() + + if args.use_amp: + scaler = torch.cuda.amp.GradScaler() + + losses = [] + for data in tqdm(iter(eval_dataloader), leave=False, disable=not args.show_progress_bar): + features, labels = data + labels = labels.to(model_body._target_device) + features = list(map(lambda batch: batch_to_device(batch, model_body._target_device), features)) + + if args.use_amp: + with autocast(): + loss_value = loss_func(features, labels) + + losses.append(scaler.scale(loss_value).item()) + else: + losses.append(loss_func(features, labels).item()) + + model_body.train() + return sum(losses) / len(losses) + def train_classifier( self, x_train: List[str], y_train: Union[List[int], List[List[int]]], args: Optional[TrainingArguments] = None ) -> None: diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 2a1aeb3b..9a6d9ee8 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -1,17 +1,17 @@ from __future__ import annotations import inspect +import json from copy import copy from dataclasses import dataclass, field, fields -import json from typing import Any, Callable, Dict, Optional, Tuple, Union -from sentence_transformers import losses import torch +from sentence_transformers import losses +from transformers import IntervalStrategy from transformers.integrations import get_available_reporting_integrations from transformers.training_args import default_logdir from transformers.utils import is_torch_available -from transformers import IntervalStrategy @dataclass @@ -165,7 +165,7 @@ class TrainingArguments: run_name: Optional[str] = None logging_dir: Optional[str] = None logging_strategy: str = "steps" - logging_first_step: bool = False + 
logging_first_step: bool = True logging_steps: int = 5 evaluation_strategy: str = "steps" @@ -177,8 +177,8 @@ class TrainingArguments: save_total_limit: Optional[int] = None load_best_model_at_end: bool = True - metric_for_best_model: str = field(default="embedding_loss", repr=False) - greater_is_better: bool = field(default=False, repr=False) + metric_for_best_model: str = field(default="embedding_loss", repr=False, init=False) + greater_is_better: bool = field(default=False, repr=False, init=False) def __post_init__(self) -> None: # Set `self.embedding_batch_size` and `self.classifier_batch_size` using values from `self.batch_size` From 826eb538b919b4a4a747eede65979ac6701e1c42 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 28 Jul 2023 16:33:46 +0200 Subject: [PATCH 37/77] Add checkpointing, support EarlyStoppingCallback --- src/setfit/trainer.py | 52 ++++++++++++++++++++++++++------- src/setfit/training_args.py | 57 ++++++++++++++++++++++++++++++++----- 2 files changed, 92 insertions(+), 17 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 53b0d763..ddedc2c3 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -1,4 +1,6 @@ import math +from pathlib import Path +import shutil import time import warnings from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union @@ -34,8 +36,6 @@ ) from transformers.utils.import_utils import is_in_notebook -from setfit.training_args import TrainingArguments - from . 
import logging from .integrations import default_hp_search_backend, is_optuna_available, run_hp_search_optuna from .losses import SupConLoss @@ -379,8 +379,8 @@ def train_embeddings( self, x_train: List[str], y_train: Union[List[int], List[List[int]]], - x_eval: List[str], - y_eval: Union[List[int], List[List[int]]], + x_eval: List[str] = None, + y_eval: Union[List[int], List[List[int]]] = None, args: Optional[TrainingArguments] = None, ) -> None: """ @@ -480,7 +480,7 @@ def _train_sentence_transformer( self, model_body: SentenceTransformer, train_dataloader: DataLoader, - eval_dataloader: DataLoader, + eval_dataloader: Optional[DataLoader], args: TrainingArguments, loss_func: nn.Module, warmup_steps: int = 10000, @@ -491,10 +491,9 @@ def _train_sentence_transformer( We sample only as many batches from each objective as there are in the smallest one to make sure of equal training with each dataset. """ - # TODO: Loading best model - # TODO: Saving/checkpointing # TODO: args.gradient_accumulation_steps # TODO: fp16/bf16, etc. 
+ # TODO: Safetensors # Hardcoded training arguments max_grad_norm = 1 @@ -590,19 +589,28 @@ def _train_sentence_transformer( metrics = {"embedding_loss": round(loss_value.item(), 4), "learning_rate": learning_rate} self.control = self.log(args, metrics) - if self.control.should_evaluate: + eval_loss = None + if self.control.should_evaluate and eval_dataloader: eval_loss = self._evaluate_with_loss(model_body, eval_dataloader, args, loss_func) learning_rate = scheduler_obj.get_last_lr()[0] metrics = {"eval_embedding_loss": round(eval_loss, 4), "learning_rate": learning_rate} self.control = self.log(args, metrics) self.control = self.callback_handler.on_evaluate(args, self.state, self.control, metrics) - if self.state.best_metric is None or eval_loss < self.state.best_metric: - self.state.best_metric = eval_loss loss_func.zero_grad() loss_func.train() + if self.control.should_save: + checkpoint_dir = self._checkpoint( + self.args.output_dir, args.save_total_limit, self.state.global_step + ) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) + + if eval_loss is not None and (self.state.best_metric is None or eval_loss < self.state.best_metric): + self.state.best_metric = eval_loss + self.state.best_model_checkpoint = checkpoint_dir + if self.control.should_epoch_stop or self.control.should_training_stop: break @@ -611,6 +619,12 @@ def _train_sentence_transformer( if self.control.should_training_stop: break + if self.args.load_best_model_at_end and self.state.best_model_checkpoint: + dir_name = Path(self.state.best_model_checkpoint).name + if dir_name.startswith("step_"): + logger.info(f"Loading best SentenceTransformer model from step {dir_name[5:]}.") + self.model.model_body = SentenceTransformer(self.state.best_model_checkpoint, device=model_body.device) + # Ensure logging the speed metrics num_train_samples = self.state.max_steps * args.embedding_batch_size # * args.gradient_accumulation_steps metrics = speed_metrics("train", 
start_time, num_samples=num_train_samples, num_steps=self.state.max_steps) @@ -648,6 +662,24 @@ def _evaluate_with_loss( model_body.train() return sum(losses) / len(losses) + def _checkpoint(self, checkpoint_path: str, checkpoint_save_total_limit: int, step: int) -> None: + # Delete old checkpoints + if checkpoint_save_total_limit is not None and checkpoint_save_total_limit > 0: + old_checkpoints = [] + for subdir in Path(checkpoint_path).glob("step_*"): + if subdir.name[5:].isdigit() and ( + self.state.best_model_checkpoint is None or subdir != Path(self.state.best_model_checkpoint) + ): + old_checkpoints.append({"step": int(subdir.name[5:]), "path": str(subdir)}) + + if len(old_checkpoints) > checkpoint_save_total_limit - 1: + old_checkpoints = sorted(old_checkpoints, key=lambda x: x["step"]) + shutil.rmtree(old_checkpoints[0]["path"]) + + checkpoint_file_path = str(Path(checkpoint_path) / f"step_{step}") + self.model.save_pretrained(checkpoint_file_path) + return checkpoint_file_path + def train_classifier( self, x_train: List[str], y_train: Union[List[int], List[List[int]]], args: Optional[TrainingArguments] = None ) -> None: diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 9a6d9ee8..3ba751cb 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -20,6 +20,8 @@ class TrainingArguments: TrainingArguments is the subset of the arguments which relate to the training loop itself. Parameters: + output_dir (`str`, defaults to `"checkpoints"`): + The output directory where the model predictions and checkpoints will be written. batch_size (`Union[int, Tuple[int, int]]`, defaults to `(16, 2)`): Set the batch sizes for the embedding and classifier training phases respectively, or set both if an integer is provided. @@ -116,12 +118,22 @@ class TrainingArguments: - `"steps"`: Save is done every `save_steps`. 
save_steps (`int`, *optional*, defaults to 500): Number of updates steps before two checkpoint saves if `save_strategy="steps"`. - save_total_limit (`int`, *optional*): + save_total_limit (`int`, *optional*, defaults to `1`): If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in - `output_dir`. + `output_dir`. Note, the best model is always preserved if the `evaluation_strategy` is not `"no"`. + load_best_model_at_end (`bool`, *optional*, defaults to `False`): + Whether or not to load the best model found during training at the end of training. + + + When set to `True`, the parameters `save_strategy` needs to be the same as `evaluation_strategy`, and in + the case it is "steps", `save_steps` must be a round multiple of `eval_steps`. + + """ + output_dir: str = "checkpoints" + # batch_size is only used to conveniently set `embedding_batch_size` and `classifier_batch_size` # which are used in practice batch_size: Union[int, Tuple[int, int]] = field(default=(16, 2), repr=False) @@ -174,11 +186,11 @@ class TrainingArguments: save_strategy: str = "steps" save_steps: int = 500 - save_total_limit: Optional[int] = None + save_total_limit: Optional[int] = 1 - load_best_model_at_end: bool = True - metric_for_best_model: str = field(default="embedding_loss", repr=False, init=False) - greater_is_better: bool = field(default=False, repr=False, init=False) + load_best_model_at_end: bool = False + metric_for_best_model: str = field(default="embedding_loss", repr=False) + greater_is_better: bool = field(default=False, repr=False) def __post_init__(self) -> None: # Set `self.embedding_batch_size` and `self.classifier_batch_size` using values from `self.batch_size` @@ -211,8 +223,12 @@ def __post_init__(self) -> None: f"warmup_proportion must be greater than or equal to 0.0 and less than or equal to 1.0! 
But it was: {self.warmup_proportion}" ) - if self.report_to in (None, "all"): + if self.report_to in (None, "all", ["all"]): self.report_to = get_available_reporting_integrations() + elif self.report_to in ("none", ["none"]): + self.report_to = [] + elif not isinstance(self.report_to, list): + self.report_to = [self.report_to] if self.logging_dir is None: self.logging_dir = default_logdir() @@ -230,6 +246,33 @@ def __post_init__(self) -> None: " `logging_steps`" ) + # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible. + if self.load_best_model_at_end: + if self.evaluation_strategy != self.save_strategy: + raise ValueError( + "`load_best_model_at_end` requires the save and eval strategy to match, but found\n- Evaluation " + f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}" + ) + if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: + if self.eval_steps < 1 or self.save_steps < 1: + if not (self.eval_steps < 1 and self.save_steps < 1): + raise ValueError( + "`load_best_model_at_end` requires the saving steps to be a multiple of the evaluation " + "steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps" + f"{self.save_steps} and eval_steps {self.eval_steps}." + ) + # Work around floating point precision issues + LARGE_MULTIPLIER = 1_000_000 + if (self.save_steps * LARGE_MULTIPLIER) % (self.eval_steps * LARGE_MULTIPLIER) != 0: + raise ValueError( + "`load_best_model_at_end` requires the saving steps to be a multiple of the evaluation " + f"steps, but found {self.save_steps}, which is not a multiple of {self.eval_steps}." + ) + raise ValueError( + "`load_best_model_at_end` requires the saving steps to be a round multiple of the evaluation " + f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}." 
+ ) + # logging_steps must be non-zero for logging_strategy that is other than 'no' if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0: raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps") From 1930973c6a862c2914f4162254e4ad7d5ce460c5 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Sat, 29 Jul 2023 16:43:29 +0200 Subject: [PATCH 38/77] Run formatting --- tests/test_deprecated_trainer.py | 6 ++---- tests/test_trainer.py | 8 ++------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/tests/test_deprecated_trainer.py b/tests/test_deprecated_trainer.py index 82a2eea8..8e1ce1d5 100644 --- a/tests/test_deprecated_trainer.py +++ b/tests/test_deprecated_trainer.py @@ -5,9 +5,9 @@ import evaluate import pytest +import torch from datasets import Dataset, load_dataset from sentence_transformers import losses -import torch from transformers.testing_utils import require_optuna from transformers.utils.hp_naming import TrialShortNamer @@ -511,9 +511,7 @@ def test_trainer_evaluate_multilabel_f1(): def test_trainer_evaluate_on_cpu() -> None: # This test used to fail if CUDA was available - dataset = Dataset.from_dict( - {"text": ["positive sentence", "negative sentence"], "label": [1, 0]} - ) + dataset = Dataset.from_dict({"text": ["positive sentence", "negative sentence"], "label": [1, 0]}) model = SetFitModel.from_pretrained( "sentence-transformers/paraphrase-albert-small-v2", use_differentiable_head=True ) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 4cde93e2..792888f8 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -74,9 +74,7 @@ def test_trainer_works_with_alternate_dataset_for_evaluate(self): alternate_dataset = Dataset.from_dict( {"text": ["x", "y", "z"], "label": [0, 1, 2], "extra_column": ["d", "e", "f"]} ) - trainer = Trainer( - model=self.model, args=self.args, train_dataset=dataset, eval_dataset=dataset - ) + trainer = Trainer(model=self.model, 
args=self.args, train_dataset=dataset, eval_dataset=dataset) trainer.train() metrics = trainer.evaluate(alternate_dataset) self.assertNotEqual(metrics["accuracy"], 1.0) @@ -470,9 +468,7 @@ def test_trainer_evaluate_multilabel_f1(): def test_trainer_evaluate_on_cpu() -> None: # This test used to fail if CUDA was available - dataset = Dataset.from_dict( - {"text": ["positive sentence", "negative sentence"], "label": [1, 0]} - ) + dataset = Dataset.from_dict({"text": ["positive sentence", "negative sentence"], "label": [1, 0]}) model = SetFitModel.from_pretrained( "sentence-transformers/paraphrase-albert-small-v2", use_differentiable_head=True ) From a87cdc0395186b9f39972b66269c92e392bc87f0 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Sat, 29 Jul 2023 22:21:30 +0200 Subject: [PATCH 39/77] Add additional trainer tests --- src/setfit/trainer.py | 2 +- tests/conftest.py | 8 +++++ tests/test_trainer.py | 79 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 tests/conftest.py diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index bf9d78fb..fadf5a7b 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -801,7 +801,7 @@ def hyperparameter_search( if backend is None: backend = default_hp_search_backend() if backend is None: - raise RuntimeError("optuna should be installed. To install optuna run `pip install optuna`. ") + raise RuntimeError("optuna should be installed. To install optuna run `pip install optuna`.") backend = HPSearchBackend(backend) if backend == HPSearchBackend.OPTUNA and not is_optuna_available(): raise RuntimeError("You picked the optuna backend, but it is not installed. 
Use `pip install optuna`.") diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..acf5b825 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,8 @@ +import pytest + +from setfit import SetFitModel + + +@pytest.fixture() +def model() -> SetFitModel: + return SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 792888f8..e5e0fa08 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -1,3 +1,4 @@ +import os import pathlib import re import tempfile @@ -8,6 +9,7 @@ import torch from datasets import Dataset, load_dataset from sentence_transformers import losses +from transformers import TrainerCallback from transformers.testing_utils import require_optuna from transformers.utils.hp_naming import TrialShortNamer @@ -428,11 +430,10 @@ def test_trainer_works_with_non_default_loss_class(loss_class): # no asserts here because this is a regression test - we only test if an exception is raised -def test_trainer_evaluate_with_strings(): +def test_trainer_evaluate_with_strings(model: SetFitModel): dataset = Dataset.from_dict( {"text": ["positive sentence", "negative sentence"], "label": ["positive", "negative"]} ) - model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") trainer = Trainer( model=model, args=TrainingArguments(num_iterations=1), @@ -487,3 +488,77 @@ def compute_metric(y_pred, y_test) -> None: ) trainer.train() trainer.evaluate() + + +def test_no_model_no_model_init(): + with pytest.raises(RuntimeError, match="`Trainer` requires either a `model` or `model_init` argument."): + Trainer() + + +def test_model_and_model_init(model: SetFitModel): + def model_init() -> SetFitModel: + return model + + with pytest.raises(RuntimeError, match="`Trainer` requires either a `model` or `model_init` argument."): + Trainer(model=model, model_init=model_init) + + +def test_trainer_callbacks(model: SetFitModel): + 
trainer = Trainer(model=model) + assert len(trainer.callback_handler.callbacks) == 2 + + class TestCallback(TrainerCallback): + pass + + callback = TestCallback() + trainer.add_callback(callback) + assert len(trainer.callback_handler.callbacks) == 3 + assert trainer.callback_handler.callbacks[-1] == callback + + assert trainer.pop_callback(callback) == callback + trainer.add_callback(callback) + assert trainer.callback_handler.callbacks[-1] == callback + trainer.remove_callback(callback) + assert callback not in trainer.callback_handler.callbacks + + +def test_trainer_warn_freeze(model: SetFitModel): + trainer = Trainer(model) + with pytest.warns( + DeprecationWarning, + match="Trainer.freeze` is deprecated and will be removed in v2.0.0 of SetFit. " + "Please use `SetFitModel.freeze` directly instead.", + ): + trainer.freeze() + + +def test_train_with_kwargs(model: SetFitModel): + train_dataset = Dataset.from_dict({"text": ["positive sentence", "negative sentence"], "label": [1, 0]}) + trainer = Trainer(model, train_dataset=train_dataset) + with pytest.warns(DeprecationWarning, match="`Trainer.train` does not accept keyword arguments anymore."): + trainer.train(num_epochs=5) + + +def test_train_no_dataset(model: SetFitModel): + trainer = Trainer(model) + with pytest.raises(ValueError, match="Training requires a `train_dataset` given to the `Trainer` initialization."): + trainer.train() + + +def test_train_amp_save(model: SetFitModel, tmp_path): + args = TrainingArguments(output_dir=tmp_path, use_amp=True, save_steps=5) + dataset = Dataset.from_dict({"text": ["a", "b", "c"], "label": [0, 1, 2]}) + trainer = Trainer(model, args=args, train_dataset=dataset, eval_dataset=dataset) + trainer.train() + assert trainer.evaluate() == {"accuracy": 1.0} + assert os.listdir(tmp_path) == ["step_5"] + + +def test_train_load_best(model: SetFitModel, tmp_path, caplog): + args = TrainingArguments(output_dir=tmp_path, save_steps=5, eval_steps=5, load_best_model_at_end=True) + dataset 
= Dataset.from_dict({"text": ["a", "b", "c"], "label": [0, 1, 2]}) + trainer = Trainer(model, args=args, train_dataset=dataset, eval_dataset=dataset) + with caplog.at_level(logging.INFO): + trainer.train() + + assert any("Load pretrained SentenceTransformer" in text for _, _, text in caplog.record_tuples) From d418759dc17618d2c83f6938c2b790f89b343124 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Sat, 29 Jul 2023 22:35:01 +0200 Subject: [PATCH 40/77] Use isinstance, required by flake8 release from 1hr ago --- src/setfit/exporters/onnx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/setfit/exporters/onnx.py b/src/setfit/exporters/onnx.py index 51e4b41f..cd05c464 100644 --- a/src/setfit/exporters/onnx.py +++ b/src/setfit/exporters/onnx.py @@ -153,7 +153,7 @@ def export_sklearn_head_to_onnx(model_head: LogisticRegression, opset: int) -> o # If the datatype of the model is double we need to cast the outputs # from the setfit model to doubles for compatibility inside of ONNX. 
- if type(dtype) == onnxconverter_common.data_types.DoubleTensorType: + if isinstance(dtype, onnxconverter_common.data_types.DoubleTensorType): sklearn_model = Pipeline([("castdouble", CastTransformer(dtype=np.double)), ("head", model_head)]) else: sklearn_model = model_head From 08892f6753a82b95c165b2a7ca7ee8eb1ff20438 Mon Sep 17 00:00:00 2001 From: danstan5 Date: Thu, 14 Sep 2023 12:04:56 +0100 Subject: [PATCH 41/77] sampler for refactor WIP --- src/setfit/modeling.py | 74 ---- src/setfit/sampler.py | 144 ++++++++ src/setfit/trainer.py | 25 +- src/setfit/trainer_distillation.py | 2 +- src/setfit/trainer_unique_pairs.py | 531 +++++++++++++++++++++++++++++ 5 files changed, 687 insertions(+), 89 deletions(-) create mode 100644 src/setfit/sampler.py create mode 100644 src/setfit/trainer_unique_pairs.py diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index 17740a3e..835968f4 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -673,77 +673,3 @@ def _from_pretrained( multi_target_strategy=multi_target_strategy, normalize_embeddings=normalize_embeddings, ) - - -def sentence_pairs_generation(sentences, labels, pairs): - # Initialize two empty lists to hold the (sentence, sentence) pairs and - # labels to indicate if a pair is positive or negative - - num_classes = np.unique(labels) - label_to_idx = {x: i for i, x in enumerate(num_classes)} - positive_idxs = [np.where(labels == i)[0] for i in num_classes] - negative_idxs = [np.where(labels != i)[0] for i in num_classes] - - for first_idx in range(len(sentences)): - current_sentence = sentences[first_idx] - label = labels[first_idx] - second_idx = np.random.choice(positive_idxs[label_to_idx[label]]) - positive_sentence = sentences[second_idx] - # Prepare a positive pair and update the sentences and labels - # lists, respectively - pairs.append(InputExample(texts=[current_sentence, positive_sentence], label=1.0)) - - third_idx = np.random.choice(negative_idxs[label_to_idx[label]]) - 
negative_sentence = sentences[third_idx] - # Prepare a negative pair of sentences and update our lists - pairs.append(InputExample(texts=[current_sentence, negative_sentence], label=0.0)) - # Return a 2-tuple of our sentence pairs and labels - return pairs - - -def sentence_pairs_generation_multilabel(sentences, labels, pairs): - # Initialize two empty lists to hold the (sentence, sentence) pairs and - # labels to indicate if a pair is positive or negative - for first_idx in range(len(sentences)): - current_sentence = sentences[first_idx] - sample_labels = np.where(labels[first_idx, :] == 1)[0] - if len(np.where(labels.dot(labels[first_idx, :].T) == 0)[0]) == 0: - continue - else: - for _label in sample_labels: - second_idx = np.random.choice(np.where(labels[:, _label] == 1)[0]) - positive_sentence = sentences[second_idx] - # Prepare a positive pair and update the sentences and labels - # lists, respectively - pairs.append(InputExample(texts=[current_sentence, positive_sentence], label=1.0)) - - # Search for sample that don't have a label in common with current - # sentence - negative_idx = np.where(labels.dot(labels[first_idx, :].T) == 0)[0] - negative_sentence = sentences[np.random.choice(negative_idx)] - # Prepare a negative pair of sentences and update our lists - pairs.append(InputExample(texts=[current_sentence, negative_sentence], label=0.0)) - # Return a 2-tuple of our sentence pairs and labels - return pairs - - -def sentence_pairs_generation_cos_sim(sentences, pairs, cos_sim_matrix): - # initialize two empty lists to hold the (sentence, sentence) pairs and - # labels to indicate if a pair is positive or negative - - idx = list(range(len(sentences))) - - for first_idx in range(len(sentences)): - current_sentence = sentences[first_idx] - second_idx = int(np.random.choice([x for x in idx if x != first_idx])) - - cos_sim = float(cos_sim_matrix[first_idx][second_idx]) - paired_sentence = sentences[second_idx] - 
pairs.append(InputExample(texts=[current_sentence, paired_sentence], label=cos_sim)) - - third_idx = np.random.choice([x for x in idx if x != first_idx]) - cos_sim = float(cos_sim_matrix[first_idx][third_idx]) - paired_sentence = sentences[third_idx] - pairs.append(InputExample(texts=[current_sentence, paired_sentence], label=cos_sim)) - - return pairs diff --git a/src/setfit/sampler.py b/src/setfit/sampler.py new file mode 100644 index 00000000..f4ae97ce --- /dev/null +++ b/src/setfit/sampler.py @@ -0,0 +1,144 @@ +from typing import Generator, Iterable, List, Optional + +import numpy as np +from torch.utils.data import IterableDataset + +from sentence_transformers import InputExample + +from . import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def sentence_pairs_generation_cos_sim(sentences, pairs, cos_sim_matrix): + # initialize two empty lists to hold the (sentence, sentence) pairs and + # labels to indicate if a pair is positive or negative + + idx = list(range(len(sentences))) + + for first_idx in range(len(sentences)): + current_sentence = sentences[first_idx] + second_idx = int(np.random.choice([x for x in idx if x != first_idx])) + + cos_sim = float(cos_sim_matrix[first_idx][second_idx]) + paired_sentence = sentences[second_idx] + pairs.append(InputExample(texts=[current_sentence, paired_sentence], label=cos_sim)) + + third_idx = np.random.choice([x for x in idx if x != first_idx]) + cos_sim = float(cos_sim_matrix[first_idx][third_idx]) + paired_sentence = sentences[third_idx] + pairs.append(InputExample(texts=[current_sentence, paired_sentence], label=cos_sim)) + + return pairs + + +def shuffle_combinations(iterable: Iterable, replacement: bool = True) -> Generator: + """Generates shuffled pair combinations for any iterable data provided. 
+ + Args: + iterable: data to generate pair combinations from + replacement: enable to include combinations of same samples, + equivalent to itertools.combinations_with_replacement + + Returns: + Generator of shuffled pairs as a tuple + """ + n = len(iterable) + k = 1 if not replacement else 0 + idxs = np.stack(np.triu_indices(n, k), axis=-1) + for i in np.random.RandomState(seed=42).permutation(len(idxs)): + _idx, idx = idxs[i, :] + yield iterable[_idx], iterable[idx] + + +class ConstrastiveDataset(IterableDataset): + def __init__(self, + examples: InputExample, + multilabel: bool, + num_iterations: Optional[None] = None, + sampling_strategy: str = "oversampling", + ): + """Generates positive and negative text pairs for contrastive learning. + + Args: + examples (InputExample): text and labels in a text transformer dataclass + multilabel: set to process "multilabel" labels array + sampling_strategy: "unique", "oversampling", or "undersampling" + num_iterations: if provided explicitly sets the number of pairs to be generated + where n_pairs = n_iterations * n_sentences * 2 (for pos & neg pairs) + """ + super().__init__() + self.pos_index = 0 + self.neg_index = 0 + self.pos_pairs = [] + self.neg_pairs = [] + self.sentences = np.array([s.texts[0] for s in examples]) + self.labels = np.array([s.label for s in examples]) + self.sentence_labels = list(zip(self.sentences, self.labels)) + + if multilabel: + self.generate_multilabel_pairs() + else: + self.generate_pairs() + + if num_iterations is not None and num_iterations > 0: + self.len_pos_pairs = num_iterations * len(self.sentences) + self.len_neg_pairs = num_iterations * len(self.sentences) + + elif sampling_strategy == "unique": + self.len_pos_pairs = len(self.pos_pairs) + self.len_neg_pairs = len(self.neg_pairs) + + elif sampling_strategy == "undersampling": + self.len_pos_pairs = min(len(self.pos_pairs), len(self.neg_pairs)) + self.len_neg_pairs = min(len(self.pos_pairs), len(self.neg_pairs)) + + elif 
sampling_strategy == "oversampling": + self.len_pos_pairs = max(len(self.pos_pairs), len(self.neg_pairs)) + self.len_neg_pairs = max(len(self.pos_pairs), len(self.neg_pairs)) + + else: + raise ValueError("Invalid sampling strategy. Must be one of 'unique', 'oversampling', or 'undersampling'.") + + def generate_pairs(self) -> None: + for (_text, _label), (text, label) in shuffle_combinations(self.sentence_labels): + if _label == label: + self.pos_pairs.append(InputExample(texts=[_text, text], label=1.0)) + else: + self.neg_pairs.append(InputExample(texts=[_text, text], label=0.0)) + + def generate_multilabel_pairs(self) -> None: + for (_text, _label), (text, label) in shuffle_combinations(self.sentence_labels): + if any(np.logical_and(_label, label)): + # logical_and checks if labels are both set for each class + self.pos_pairs.append(InputExample(texts=[_text, text], label=1.0)) + else: + self.neg_pairs.append(InputExample(texts=[_text, text], label=0.0)) + + def get_positive_pairs(self) -> List[InputExample]: + pairs = [] + for _ in range(self.len_pos_pairs): + if self.pos_index >= len(self.pos_pairs): + self.pos_index = 0 + pairs.append(self.pos_pairs[self.pos_index]) + self.pos_index += 1 + return pairs + + def get_negative_pairs(self) -> List[InputExample]: + pairs = [] + for _ in range(self.len_neg_pairs): + if self.neg_index >= len(self.neg_pairs): + self.neg_index = 0 + pairs.append(self.neg_pairs[self.neg_index]) + self.neg_index += 1 + return pairs + + def __iter__(self): + for pos_pair, neg_pair in zip(self.get_positive_pairs(), self.get_negative_pairs()): + yield pos_pair + yield neg_pair + + def __len__(self): + return self.len_pos_pairs + self.len_neg_pairs diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index fadf5a7b..5d7c2a17 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -39,7 +39,7 @@ from . 
import logging from .integrations import default_hp_search_backend, is_optuna_available, run_hp_search_optuna from .losses import SupConLoss -from .modeling import sentence_pairs_generation, sentence_pairs_generation_multilabel +from .sampler import ConstrastiveDataset from .training_args import TrainingArguments from .utils import BestRun, default_hp_space_optuna @@ -417,7 +417,10 @@ def train_embeddings( ) def get_dataloader(self, x: List[str], y: Union[List[int], List[List[int]]], args: TrainingArguments): + # sentence-transformers adaptation + input_data = [InputExample(texts=[text], label=label) for text, label in zip(x, y)] + if args.loss in [ losses.BatchAllTripletLoss, losses.BatchHardTripletLoss, @@ -425,9 +428,7 @@ def get_dataloader(self, x: List[str], y: Union[List[int], List[List[int]]], arg losses.BatchHardSoftMarginTripletLoss, SupConLoss, ]: - examples = [InputExample(texts=[text], label=label) for text, label in zip(x, y)] - data_sampler = SentenceLabelDataset(examples, samples_per_label=args.samples_per_label) - + data_sampler = SentenceLabelDataset(input_data, samples_per_label=args.samples_per_label) batch_size = min(args.embedding_batch_size, len(data_sampler)) dataloader = DataLoader(data_sampler, batch_size=batch_size, drop_last=True) @@ -445,17 +446,13 @@ def get_dataloader(self, x: List[str], y: Union[List[int], List[List[int]]], arg margin=args.margin, ) else: - examples = [] - - for _ in trange(args.num_iterations, desc="Generating Training Pairs", disable=not args.show_progress_bar): - if self.model.multi_target_strategy is not None: - examples = sentence_pairs_generation_multilabel(np.array(x), np.array(y), examples) - else: - examples = sentence_pairs_generation(np.array(x), np.array(y), examples) - - batch_size = args.embedding_batch_size - dataloader = DataLoader(examples, shuffle=True, batch_size=batch_size) + data_sampler = ConstrastiveDataset( + input_data, self.model.multi_target_strategy, args.num_iterations + ) # sets default 
sampling_strategy="oversampling" + batch_size = min(args.embedding_batch_size, len(data_sampler)) + dataloader = DataLoader(data_sampler, batch_size=batch_size, drop_last=False) # shuffle=True can be dropped in for 'randomising' loss = args.loss(self.model.model_body) + return dataloader, loss, batch_size def log(self, args: TrainingArguments, logs: Dict[str, float]) -> None: diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index 5a27f585..faebe43c 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -9,7 +9,7 @@ from transformers.trainer_utils import set_seed from . import logging -from .modeling import sentence_pairs_generation_cos_sim +from .sampler import sentence_pairs_generation_cos_sim from .trainer import Trainer from .training_args import TrainingArguments diff --git a/src/setfit/trainer_unique_pairs.py b/src/setfit/trainer_unique_pairs.py new file mode 100644 index 00000000..461b880d --- /dev/null +++ b/src/setfit/trainer_unique_pairs.py @@ -0,0 +1,531 @@ +import math +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union + +import evaluate +from sentence_transformers import InputExample, losses +from sentence_transformers.datasets import SentenceLabelDataset +from sentence_transformers.losses.BatchHardTripletLoss import BatchHardTripletLossDistanceFunction +from torch.utils.data import DataLoader +from transformers.trainer_utils import HPSearchBackend, default_compute_objective, number_of_arguments, set_seed + +from . 
import logging +from .integrations import default_hp_search_backend, is_optuna_available, run_hp_search_optuna +from .modeling import SupConLoss +from .sampler import OVERSAMPLE, ConstrastiveDataset +from .utils import BestRun, default_hp_space_optuna + + +if TYPE_CHECKING: + import optuna + from datasets import Dataset + + from .modeling import SetFitModel + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +class SetFitTrainer: + """Trainer to train a SetFit model. + + Args: + model (`SetFitModel`, *optional*): + The model to train. If not provided, a `model_init` must be passed. + train_dataset (`Dataset`): + The training dataset. + eval_dataset (`Dataset`, *optional*): + The evaluation dataset. + model_init (`Callable[[], SetFitModel]`, *optional*): + A function that instantiates the model to be used. If provided, each call to [`~SetFitTrainer.train`] will start + from a new instance of the model as given by this function when a `trial` is passed. + metric (`str` or `Callable`, *optional*, defaults to `"accuracy"`): + The metric to use for evaluation. If a string is provided, we treat it as the metric name and load it with default settings. + If a callable is provided, it must take two arguments (`y_pred`, `y_test`). + loss_class (`nn.Module`, *optional*, defaults to `CosineSimilarityLoss`): + The loss function to use for contrastive training. + num_iterations (`int`, *optional*, defaults to `20`): + The number of iterations to generate sentence pairs for. + This argument is ignored if triplet loss is used. + It is only used in conjunction with `CosineSimilarityLoss`. + num_epochs (`int`, *optional*, defaults to `1`): + The number of epochs to train the Sentence Transformer body for. + learning_rate (`float`, *optional*, defaults to `2e-5`): + The learning rate to use for contrastive training. + batch_size (`int`, *optional*, defaults to `16`): + The batch size to use for contrastive training. 
+ seed (`int`, *optional*, defaults to 42): + Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the + [`~SetTrainer.model_init`] function to instantiate the model if it has some randomly initialized parameters. + column_mapping (`Dict[str, str]`, *optional*): + A mapping from the column names in the dataset to the column names expected by the model. The expected format is a dictionary with the following format: {"text_column_name": "text", "label_column_name: "label"}. + use_amp (`bool`, *optional*, defaults to `False`): + Use Automatic Mixed Precision (AMP). Only for Pytorch >= 1.6.0 + warmup_proportion (`float`, *optional*, defaults to `0.1`): + Proportion of the warmup in the total training steps. + Must be greater than or equal to 0.0 and less than or equal to 1.0. + distance_metric (`Callable`, defaults to `BatchHardTripletLossDistanceFunction.cosine_distance`): + Function that returns a distance between two embeddings. + It is set for the triplet loss and + is ignored for `CosineSimilarityLoss` and `SupConLoss`. + margin (`float`, defaults to `0.25`): Margin for the triplet loss. + Negative samples should be at least margin further apart from the anchor than the positive. + This is ignored for `CosineSimilarityLoss`, `BatchHardSoftMarginTripletLoss` and `SupConLoss`. + samples_per_label (`int`, defaults to `2`): Number of consecutive, random and unique samples drawn per label. + This is only relevant for triplet loss and ignored for `CosineSimilarityLoss`. + Batch size should be a multiple of samples_per_label. 
+ """ + + def __init__( + self, + model: Optional["SetFitModel"] = None, + train_dataset: Optional["Dataset"] = None, + eval_dataset: Optional["Dataset"] = None, + model_init: Optional[Callable[[], "SetFitModel"]] = None, + metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy", + loss_class: Optional[Any] = None, + num_iterations: int = 20, + num_epochs: int = 1, + learning_rate: float = 2e-5, + batch_size: int = 16, + seed: int = 42, + column_mapping: Optional[Dict[str, str]] = None, + use_amp: bool = False, + warmup_proportion: float = 0.1, + distance_metric: Callable = BatchHardTripletLossDistanceFunction.cosine_distance, + margin: float = 0.25, + samples_per_label: int = 2, + sampling_strategy: int = OVERSAMPLE, + ): + if (warmup_proportion < 0.0) or (warmup_proportion > 1.0): + raise ValueError( + f"warmup_proportion must be greater than or equal to 0.0 and less than or equal to 1.0! But it was: {warmup_proportion}" + ) + + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + self.model_init = model_init + self.metric = metric + self.loss_class = loss_class + self.num_iterations = num_iterations + self.num_epochs = num_epochs + self.learning_rate = learning_rate + self.batch_size = batch_size + self.seed = seed + self.column_mapping = column_mapping + self.use_amp = use_amp + self.warmup_proportion = warmup_proportion + self.distance_metric = distance_metric + self.margin = margin + self.samples_per_label = samples_per_label + self.sampling_strategy = sampling_strategy + + if model is None: + if model_init is not None: + model = self.call_model_init() + else: + raise RuntimeError("`SetFitTrainer` requires either a `model` or `model_init` argument") + else: + if model_init is not None: + raise RuntimeError("`SetFitTrainer` requires either a `model` or `model_init` argument, but not both") + + self.model = model + self.hp_search_backend = None + self._freeze = True # If True, will train the body only; otherwise, 
train the body and head + + def _validate_column_mapping(self, dataset: "Dataset") -> None: + """ + Validates the provided column mapping against the dataset. + """ + required_columns = {"text", "label"} + column_names = set(dataset.column_names) + if self.column_mapping is None and not required_columns.issubset(column_names): + raise ValueError( + f"A column mapping must be provided when the dataset does not contain the following columns: {required_columns}" + ) + if self.column_mapping is not None: + missing_columns = required_columns.difference(self.column_mapping.values()) + if missing_columns: + raise ValueError( + f"The following columns are missing from the column mapping: {missing_columns}. Please provide a mapping for all required columns." + ) + if not set(self.column_mapping.keys()).issubset(column_names): + raise ValueError( + f"The following columns are missing from the dataset: {set(self.column_mapping.keys()).difference(column_names)}. Please provide a mapping for all required columns." + ) + + def _apply_column_mapping(self, dataset: "Dataset", column_mapping: Dict[str, str]) -> "Dataset": + """ + Applies the provided column mapping to the dataset, renaming columns accordingly. + Extra features not in the column mapping are prefixed with `"feat_"`. 
+ """ + dataset = dataset.rename_columns( + { + **column_mapping, + **{col: f"feat_{col}" for col in dataset.column_names if col not in column_mapping}, + } + ) + dset_format = dataset.format + dataset = dataset.with_format( + type=dset_format["type"], + columns=dataset.column_names, + output_all_columns=dset_format["output_all_columns"], + **dset_format["format_kwargs"], + ) + return dataset + + def apply_hyperparameters(self, params: Dict[str, Any], final_model: bool = False): + """Applies a dictionary of hyperparameters to both the trainer and the model + + Args: + params (`Dict[str, Any]`): The parameters, usually from `BestRun.hyperparameters` + final_model (`bool`, *optional*, defaults to `False`): If `True`, replace the `model_init()` function with a fixed model based on the parameters. + """ + for key, value in params.items(): + if hasattr(self, key): + old_attr = getattr(self, key, None) + # Casting value to the proper type + if old_attr is not None: + value = type(old_attr)(value) + setattr(self, key, value) + elif number_of_arguments(self.model_init) == 0: # we do not warn if model_init could be using it + logger.warning( + f"Trying to set {key!r} in the hyperparameter search but there is no corresponding field in " + "`SetFitTrainer`, and `model_init` does not take any arguments." 
+ ) + + self.model = self.model_init(params) + if final_model: + self.model_init = None + + def _hp_search_setup(self, trial: Union["optuna.Trial", Dict[str, Any]]): + """HP search setup code""" + + # Heavily inspired by transformers.Trainer._hp_search_setup + if self.hp_search_backend is None or trial is None: + return + + if isinstance(trial, Dict): # For passing a Dict to train() -- mostly unused for now + params = trial + elif self.hp_search_backend == HPSearchBackend.OPTUNA: + params = self.hp_space(trial) + else: + raise ValueError("Invalid trial parameter") + + logger.info(f"Trial: {params}") + self.apply_hyperparameters(params, final_model=False) + + def call_model_init(self, params: Optional[Dict[str, Any]] = None): + model_init_argcount = number_of_arguments(self.model_init) + if model_init_argcount == 0: + model = self.model_init() + elif model_init_argcount == 1: + model = self.model_init(params) + else: + raise RuntimeError("`model_init` should have 0 or 1 argument.") + + if model is None: + raise RuntimeError("`model_init` should not return None.") + + return model + + def freeze(self): + """ + Freeze SetFitModel's differentiable head. + Note: call this function only when using the differentiable head. + """ + if not self.model.has_differentiable_head: + raise ValueError("Please use the differentiable head in `SetFitModel` when calling this function.") + + self._freeze = True # Currently use self._freeze as a switch + self.model.freeze("head") + + def unfreeze(self, keep_body_frozen: bool = False): + """ + Unfreeze SetFitModel's differentiable head. + Note: call this function only when using the differentiable head. + + Args: + keep_body_frozen (`bool`, *optional*, defaults to `False`): + Whether to freeze the body when unfreeze the head. 
+ """ + if not self.model.has_differentiable_head: + raise ValueError("Please use the differentiable head in `SetFitModel` when calling this function.") + + self._freeze = False # Currently use self._freeze as a switch + self.model.unfreeze("head") + if keep_body_frozen: + self.model.freeze("body") + else: # ensure to unfreeze the body + self.model.unfreeze("body") + + def train( + self, + num_epochs: Optional[int] = None, + batch_size: Optional[int] = None, + learning_rate: Optional[float] = None, + body_learning_rate: Optional[float] = None, + l2_weight: Optional[float] = None, + max_length: Optional[int] = None, + trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, + show_progress_bar: bool = True, + ): + """ + Main training entry point. + + Args: + num_epochs (`int`, *optional*): + Temporary change the number of epochs to train the Sentence Transformer body/head for. + If ignore, will use the value given in initialization. + batch_size (`int`, *optional*): + Temporary change the batch size to use for contrastive training or logistic regression. + If ignore, will use the value given in initialization. + learning_rate (`float`, *optional*): + Temporary change the learning rate to use for contrastive training or SetFitModel's head in logistic regression. + If ignore, will use the value given in initialization. + body_learning_rate (`float`, *optional*): + Temporary change the learning rate to use for SetFitModel's body in logistic regression only. + If ignore, will be the same as `learning_rate`. + l2_weight (`float`, *optional*): + Temporary change the weight of L2 regularization for SetFitModel's differentiable head in logistic regression. + max_length (int, *optional*, defaults to `None`): + The maximum number of tokens for one data sample. Currently only for training the differentiable head. + If `None`, will use the maximum number of tokens the model body can accept. 
+ If `max_length` is greater than the maximum number of acceptable tokens the model body can accept, it will be set to the maximum number of acceptable tokens. + trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): + The trial run or the hyperparameter dictionary for hyperparameter search. + show_progress_bar (`bool`, *optional*, defaults to `True`): + Whether to show a bar that indicates training progress. + """ + set_seed(self.seed) # Seed must be set before instantiating the model when using model_init. + + if trial: # Trial and model initialization + self._hp_search_setup(trial) # sets trainer parameters and initializes model + + if self.train_dataset is None: + raise ValueError("Training requires a `train_dataset` given to the `SetFitTrainer` initialization.") + + self._validate_column_mapping(self.train_dataset) + train_dataset = self.train_dataset + if self.column_mapping is not None: + logger.info("Applying column mapping to training dataset") + train_dataset = self._apply_column_mapping(self.train_dataset, self.column_mapping) + + if self.loss_class is None: + logger.warning("No `loss_class` detected! 
Using `CosineSimilarityLoss` as the default.") + self.loss_class = losses.CosineSimilarityLoss + + multilabel = True if self.model.multi_target_strategy is not None else False + + num_epochs = num_epochs or self.num_epochs + batch_size = batch_size or self.batch_size + learning_rate = learning_rate or self.learning_rate + + # dataset generation + x_train = train_dataset["text"] + y_train = train_dataset["label"] + train_examples = [InputExample(texts=[text], label=label) for text, label in zip(x_train, y_train)] + + if not self.model.has_differentiable_head or self._freeze: + # sentence-transformers adaptation + if self.loss_class in [ + losses.BatchAllTripletLoss, + losses.BatchHardTripletLoss, + losses.BatchSemiHardTripletLoss, + losses.BatchHardSoftMarginTripletLoss, + SupConLoss, + ]: + train_data_sampler = SentenceLabelDataset(train_examples, samples_per_label=self.samples_per_label) + batch_size = min(batch_size, len(train_data_sampler)) + train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=True) + else: + train_data_sampler = ConstrastiveDataset(train_examples, multilabel, self.sampling_strategy) + train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=False) + + total_train_steps = len(train_dataloader) * num_epochs + logger.info("***** Running training *****") + logger.info(f" Num examples per epoch = {len(train_data_sampler)}") + logger.info(f" Num epochs = {num_epochs}") + logger.info(f" Total optimization steps = {total_train_steps}") + logger.info(f" Total train batch size = {batch_size}") + + # setup training loss + if self.loss_class in [ + losses.BatchAllTripletLoss, + losses.BatchHardTripletLoss, + losses.BatchSemiHardTripletLoss, + losses.BatchHardSoftMarginTripletLoss, + ]: + train_loss = self.loss_class( + model=self.model.model_body, + distance_metric=self.distance_metric, + margin=self.margin, + ) + elif self.loss_class is losses.BatchHardSoftMarginTripletLoss: + train_loss = 
self.loss_class( + model=self.model.model_body, + distance_metric=self.distance_metric, + ) + else: + train_loss = self.loss_class(model=self.model.model_body) + + warmup_steps = math.ceil(total_train_steps * self.warmup_proportion) + self.model.model_body.fit( + train_objectives=[(train_dataloader, train_loss)], + epochs=num_epochs, + optimizer_params={"lr": learning_rate}, + warmup_steps=warmup_steps, + show_progress_bar=show_progress_bar, + use_amp=self.use_amp, + ) + + if not self.model.has_differentiable_head or not self._freeze: + # Train the final classifier + self.model.fit( + x_train, + y_train, + num_epochs=num_epochs, + batch_size=batch_size, + learning_rate=learning_rate, + body_learning_rate=body_learning_rate, + l2_weight=l2_weight, + max_length=max_length, + show_progress_bar=True, + ) + + def evaluate(self): + """ + Computes the metrics for a given classifier. + + Returns: + `Dict[str, float]`: The evaluation metrics. + """ + + self._validate_column_mapping(self.eval_dataset) + eval_dataset = self.eval_dataset + + if self.column_mapping is not None: + logger.info("Applying column mapping to evaluation dataset") + eval_dataset = self._apply_column_mapping(self.eval_dataset, self.column_mapping) + + x_test = eval_dataset["text"] + y_test = eval_dataset["label"] + + logger.info("***** Running evaluation *****") + y_pred = self.model.predict(x_test) + + if isinstance(self.metric, str): + metric_config = "multilabel" if self.model.multi_target_strategy is not None else None + metric_fn = evaluate.load(self.metric, config_name=metric_config) + + return metric_fn.compute(predictions=y_pred, references=y_test) + + elif callable(self.metric): + return self.metric(y_pred, y_test) + + else: + raise ValueError("metric must be a string or a callable") + + def hyperparameter_search( + self, + hp_space: Optional[Callable[["optuna.Trial"], Dict[str, float]]] = None, + compute_objective: Optional[Callable[[Dict[str, float]], float]] = None, + n_trials: int = 10, + 
direction: str = "maximize", + backend: Optional[Union["str", HPSearchBackend]] = None, + hp_name: Optional[Callable[["optuna.Trial"], str]] = None, + **kwargs, + ) -> BestRun: + """ + Launch a hyperparameter search using `optuna`. The optimized quantity is determined + by `compute_objective`, which defaults to a function returning the evaluation loss when no metric is provided, + the sum of all metrics otherwise. + + + + To use this method, you need to have provided a `model_init` when initializing your [`SetFitTrainer`]: we need to + reinitialize the model at each new run. + + + + Args: + hp_space (`Callable[["optuna.Trial"], Dict[str, float]]`, *optional*): + A function that defines the hyperparameter search space. Will default to + [`~trainer_utils.default_hp_space_optuna`]. + compute_objective (`Callable[[Dict[str, float]], float]`, *optional*): + A function computing the objective to minimize or maximize from the metrics returned by the `evaluate` + method. Will default to [`~trainer_utils.default_compute_objective`] which uses the sum of metrics. + n_trials (`int`, *optional*, defaults to 100): + The number of trial runs to test. + direction (`str`, *optional*, defaults to `"maximize"`): + Whether to optimize greater or lower objects. Can be `"minimize"` or `"maximize"`, you should pick + `"minimize"` when optimizing the validation loss, `"maximize"` when optimizing one or several metrics. + backend (`str` or [`~training_utils.HPSearchBackend`], *optional*): + The backend to use for hyperparameter search. Only optuna is supported for now. + TODO: add support for ray and sigopt. + hp_name (`Callable[["optuna.Trial"], str]]`, *optional*): + A function that defines the trial/run name. Will default to None. + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to `optuna.create_study`. 
For more + information see: + + - the documentation of + [optuna.create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html) + + Returns: + [`trainer_utils.BestRun`]: All the information about the best run. + """ + if backend is None: + backend = default_hp_search_backend() + if backend is None: + raise RuntimeError("optuna should be installed. " "To install optuna run `pip install optuna`. ") + backend = HPSearchBackend(backend) + if backend == HPSearchBackend.OPTUNA and not is_optuna_available(): + raise RuntimeError("You picked the optuna backend, but it is not installed. Use `pip install optuna`.") + elif backend != HPSearchBackend.OPTUNA: + raise RuntimeError("Only optuna backend is supported for hyperparameter search.") + self.hp_search_backend = backend + if self.model_init is None: + raise RuntimeError( + "To use hyperparameter search, you need to pass your model through a model_init function." + ) + + self.hp_space = default_hp_space_optuna if hp_space is None else hp_space + self.hp_name = hp_name + self.compute_objective = default_compute_objective if compute_objective is None else compute_objective + + backend_dict = { + HPSearchBackend.OPTUNA: run_hp_search_optuna, + } + best_run = backend_dict[backend](self, n_trials, direction, **kwargs) + + self.hp_search_backend = None + return best_run + + def push_to_hub( + self, + repo_path_or_name: Optional[str] = None, + repo_url: Optional[str] = None, + commit_message: Optional[str] = "Add SetFit model", + organization: Optional[str] = None, + private: Optional[bool] = None, + api_endpoint: Optional[str] = None, + use_auth_token: Optional[Union[bool, str]] = None, + git_user: Optional[str] = None, + git_email: Optional[str] = None, + config: Optional[dict] = None, + skip_lfs_files: bool = False, + ): + + return self.model.push_to_hub( + repo_path_or_name, + repo_url, + commit_message, + organization, + private, + api_endpoint, + use_auth_token, + git_user, + 
git_email, + config, + skip_lfs_files, + ) From 173f0845baa6133491bfc89c33328a4dd8c2227c Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Tue, 17 Oct 2023 21:49:50 +0200 Subject: [PATCH 42/77] Run formatters --- src/setfit/modeling.py | 2 +- src/setfit/sampler.py | 18 +++++++++--------- src/setfit/trainer.py | 12 +++++------- src/setfit/trainer_unique_pairs.py | 3 +-- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index 466d47a3..0662d2d3 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -17,7 +17,7 @@ import requests import torch from huggingface_hub import PyTorchModelHubMixin, hf_hub_download -from sentence_transformers import InputExample, SentenceTransformer, models +from sentence_transformers import SentenceTransformer, models from sklearn.linear_model import LogisticRegression from sklearn.multiclass import OneVsRestClassifier from sklearn.multioutput import ClassifierChain, MultiOutputClassifier diff --git a/src/setfit/sampler.py b/src/setfit/sampler.py index f4ae97ce..38e4ea92 100644 --- a/src/setfit/sampler.py +++ b/src/setfit/sampler.py @@ -1,9 +1,8 @@ -from typing import Generator, Iterable, List, Optional +from typing import Generator, Iterable, Iterator, List, Optional import numpy as np -from torch.utils.data import IterableDataset - from sentence_transformers import InputExample +from torch.utils.data import IterableDataset from . import logging @@ -54,12 +53,13 @@ def shuffle_combinations(iterable: Iterable, replacement: bool = True) -> Genera class ConstrastiveDataset(IterableDataset): - def __init__(self, + def __init__( + self, examples: InputExample, multilabel: bool, num_iterations: Optional[None] = None, sampling_strategy: str = "oversampling", - ): + ) -> None: """Generates positive and negative text pairs for contrastive learning. 
Args: @@ -105,7 +105,7 @@ def __init__(self, def generate_pairs(self) -> None: for (_text, _label), (text, label) in shuffle_combinations(self.sentence_labels): if _label == label: - self.pos_pairs.append(InputExample(texts=[_text, text], label=1.0)) + self.pos_pairs.append(InputExample(texts=[_text, text], label=1.0)) else: self.neg_pairs.append(InputExample(texts=[_text, text], label=0.0)) @@ -113,7 +113,7 @@ def generate_multilabel_pairs(self) -> None: for (_text, _label), (text, label) in shuffle_combinations(self.sentence_labels): if any(np.logical_and(_label, label)): # logical_and checks if labels are both set for each class - self.pos_pairs.append(InputExample(texts=[_text, text], label=1.0)) + self.pos_pairs.append(InputExample(texts=[_text, text], label=1.0)) else: self.neg_pairs.append(InputExample(texts=[_text, text], label=0.0)) @@ -135,10 +135,10 @@ def get_negative_pairs(self) -> List[InputExample]: self.neg_index += 1 return pairs - def __iter__(self): + def __iter__(self) -> Iterator[InputExample]: for pos_pair, neg_pair in zip(self.get_positive_pairs(), self.get_negative_pairs()): yield pos_pair yield neg_pair - def __len__(self): + def __len__(self) -> int: return self.len_pos_pairs + self.len_neg_pairs diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 5d7c2a17..6d2e3130 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union import evaluate -import numpy as np import torch from datasets import Dataset, DatasetDict from sentence_transformers import InputExample, SentenceTransformer, losses @@ -16,7 +15,7 @@ from torch import nn from torch.cuda.amp import autocast from torch.utils.data import DataLoader -from tqdm.autonotebook import tqdm, trange +from tqdm.autonotebook import tqdm from transformers.integrations import get_reporting_integration_callbacks from transformers.trainer_callback import ( CallbackHandler, @@ -417,7 +416,6 @@ 
def train_embeddings( ) def get_dataloader(self, x: List[str], y: Union[List[int], List[List[int]]], args: TrainingArguments): - # sentence-transformers adaptation input_data = [InputExample(texts=[text], label=label) for text, label in zip(x, y)] @@ -446,11 +444,11 @@ def get_dataloader(self, x: List[str], y: Union[List[int], List[List[int]]], arg margin=args.margin, ) else: - data_sampler = ConstrastiveDataset( - input_data, self.model.multi_target_strategy, args.num_iterations - ) # sets default sampling_strategy="oversampling" + # sets default sampling_strategy="oversampling" + data_sampler = ConstrastiveDataset(input_data, self.model.multi_target_strategy, args.num_iterations) batch_size = min(args.embedding_batch_size, len(data_sampler)) - dataloader = DataLoader(data_sampler, batch_size=batch_size, drop_last=False) # shuffle=True can be dropped in for 'randomising' + # shuffle=True can be dropped in for 'randomising' + dataloader = DataLoader(data_sampler, batch_size=batch_size, drop_last=False) loss = args.loss(self.model.model_body) return dataloader, loss, batch_size diff --git a/src/setfit/trainer_unique_pairs.py b/src/setfit/trainer_unique_pairs.py index 461b880d..73004574 100644 --- a/src/setfit/trainer_unique_pairs.py +++ b/src/setfit/trainer_unique_pairs.py @@ -337,7 +337,7 @@ def train( train_data_sampler = SentenceLabelDataset(train_examples, samples_per_label=self.samples_per_label) batch_size = min(batch_size, len(train_data_sampler)) train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=True) - else: + else: train_data_sampler = ConstrastiveDataset(train_examples, multilabel, self.sampling_strategy) train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=False) @@ -515,7 +515,6 @@ def push_to_hub( config: Optional[dict] = None, skip_lfs_files: bool = False, ): - return self.model.push_to_hub( repo_path_or_name, repo_url, From c23959aec721c5d086404be29730181130bacfbb Mon Sep 17 00:00:00 2001 
From: Tom Aarsen Date: Tue, 17 Oct 2023 22:11:33 +0200 Subject: [PATCH 43/77] Remove tests from modeling.py The code here is moved to sampler.py, which will need its own tester file --- tests/test_modeling.py | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/tests/test_modeling.py b/tests/test_modeling.py index c31417d2..a5e279f6 100644 --- a/tests/test_modeling.py +++ b/tests/test_modeling.py @@ -10,42 +10,12 @@ from sklearn.multioutput import ClassifierChain, MultiOutputClassifier from setfit import SetFitHead, SetFitModel -from setfit.modeling import MODEL_HEAD_NAME, sentence_pairs_generation, sentence_pairs_generation_multilabel +from setfit.modeling import MODEL_HEAD_NAME torch_cuda_available = pytest.mark.skipif(not torch.cuda.is_available(), reason="PyTorch must be compiled with CUDA") -def test_sentence_pairs_generation(): - sentences = np.array(["sent 1", "sent 2", "sent 3"]) - labels = np.array(["label 1", "label 2", "label 3"]) - - pairs = [] - n_iterations = 2 - - for _ in range(n_iterations): - pairs = sentence_pairs_generation(sentences, labels, pairs) - - assert len(pairs) == 12 - assert pairs[0].texts == ["sent 1", "sent 1"] - assert pairs[0].label == 1.0 - - -def test_sentence_pairs_generation_multilabel(): - sentences = np.array(["sent 1", "sent 2", "sent 3"]) - labels = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]]) - - pairs = [] - n_iterations = 2 - - for _ in range(n_iterations): - pairs = sentence_pairs_generation_multilabel(sentences, labels, pairs) - - assert len(pairs) == 12 - assert pairs[0].texts == ["sent 1", "sent 1"] - assert pairs[0].label == 1.0 - - def test_setfit_model_body(): model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") From 0fa3870c00e178e750b4b5b322b7d009b7607c1e Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Tue, 17 Oct 2023 22:12:38 +0200 Subject: [PATCH 44/77] Add missing type hint --- src/setfit/trainer.py | 6 ++++-- 1 file 
changed, 4 insertions(+), 2 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index fadf5a7b..ca9f3973 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -3,7 +3,7 @@ import time import warnings from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import evaluate import numpy as np @@ -416,7 +416,9 @@ def train_embeddings( warmup_steps=warmup_steps, ) - def get_dataloader(self, x: List[str], y: Union[List[int], List[List[int]]], args: TrainingArguments): + def get_dataloader( + self, x: List[str], y: Union[List[int], List[List[int]]], args: TrainingArguments + ) -> Tuple[DataLoader, nn.Module, int]: # sentence-transformers adaptation if args.loss in [ losses.BatchAllTripletLoss, From 3969f3833d0d89543b9d83bd7ee6ed2b2794cce0 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Tue, 17 Oct 2023 22:15:15 +0200 Subject: [PATCH 45/77] Adjust test to still pass if W&B/Tensorboard are installed --- tests/test_trainer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index e5e0fa08..1a75fefb 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -505,14 +505,16 @@ def model_init() -> SetFitModel: def test_trainer_callbacks(model: SetFitModel): trainer = Trainer(model=model) - assert len(trainer.callback_handler.callbacks) == 2 + assert len(trainer.callback_handler.callbacks) >= 2 + callback_names = {callback.__class__.__name__ for callback in trainer.callback_handler.callbacks} + assert {"DefaultFlowCallback", "ProgressCallback"} <= callback_names class TestCallback(TrainerCallback): pass callback = TestCallback() trainer.add_callback(callback) - assert len(trainer.callback_handler.callbacks) == 3 + assert len(trainer.callback_handler.callbacks) == len(callback_names) + 1 assert trainer.callback_handler.callbacks[-1] == callback 
assert trainer.pop_callback(callback) == callback From 851f0bb1d7cb298bc06eb3bce4e2b5e2ec7e5058 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Tue, 17 Oct 2023 23:06:16 +0200 Subject: [PATCH 46/77] The log/eval/save steps should be saved on the state instead Since transformers v4.32.0, --- src/setfit/trainer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index ca9f3973..a4ec70f2 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -142,7 +142,6 @@ def __init__( self.state = TrainerState() self.control = TrainerControl() self.add_callback(DEFAULT_PROGRESS_CALLBACK if self.args.show_progress_bar else PrinterCallback) - self.control = self.callback_handler.on_init_end(args, self.state, self.control) def add_callback(self, callback): @@ -392,6 +391,10 @@ def train_embeddings( Temporarily change the training arguments for this training call. """ args = args or self.args or TrainingArguments() + # Since transformers v4.32.0, the log/eval/save steps should be saved on the state instead + self.state.logging_steps = args.logging_steps + self.state.eval_steps = args.eval_steps + self.state.save_steps = args.save_steps train_dataloader, loss_func, batch_size = self.get_dataloader(x_train, y_train, args=args) if x_eval is not None: From d37ee09ba5764ca6793521586f3de7267c6f55a2 Mon Sep 17 00:00:00 2001 From: danstan5 Date: Thu, 19 Oct 2023 12:37:46 +0100 Subject: [PATCH 47/77] sampler logic fix "unique" strategy --- src/setfit/sampler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/setfit/sampler.py b/src/setfit/sampler.py index f4ae97ce..24146274 100644 --- a/src/setfit/sampler.py +++ b/src/setfit/sampler.py @@ -1,3 +1,4 @@ +from itertools import zip_longest from typing import Generator, Iterable, List, Optional import numpy as np @@ -136,9 +137,11 @@ def get_negative_pairs(self) -> List[InputExample]: return pairs def __iter__(self): - for pos_pair, neg_pair 
in zip(self.get_positive_pairs(), self.get_negative_pairs()): - yield pos_pair - yield neg_pair + for pos_pair, neg_pair in zip_longest(self.get_positive_pairs(), self.get_negative_pairs()): + if pos_pair is not None: + yield pos_pair + if neg_pair is not None: + yield neg_pair def __len__(self): return self.len_pos_pairs + self.len_neg_pairs From 0ef88378bff9bcb5bda19ea03e1fb632ce70c367 Mon Sep 17 00:00:00 2001 From: danstan5 Date: Thu, 19 Oct 2023 12:38:37 +0100 Subject: [PATCH 48/77] add sampler tests (not complete) --- src/setfit/trainer_unique_pairs.py | 531 ----------------------------- tests/test_modeling.py | 30 -- tests/test_sampler.py | 50 +++ 3 files changed, 50 insertions(+), 561 deletions(-) delete mode 100644 src/setfit/trainer_unique_pairs.py create mode 100644 tests/test_sampler.py diff --git a/src/setfit/trainer_unique_pairs.py b/src/setfit/trainer_unique_pairs.py deleted file mode 100644 index 461b880d..00000000 --- a/src/setfit/trainer_unique_pairs.py +++ /dev/null @@ -1,531 +0,0 @@ -import math -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union - -import evaluate -from sentence_transformers import InputExample, losses -from sentence_transformers.datasets import SentenceLabelDataset -from sentence_transformers.losses.BatchHardTripletLoss import BatchHardTripletLossDistanceFunction -from torch.utils.data import DataLoader -from transformers.trainer_utils import HPSearchBackend, default_compute_objective, number_of_arguments, set_seed - -from . 
import logging -from .integrations import default_hp_search_backend, is_optuna_available, run_hp_search_optuna -from .modeling import SupConLoss -from .sampler import OVERSAMPLE, ConstrastiveDataset -from .utils import BestRun, default_hp_space_optuna - - -if TYPE_CHECKING: - import optuna - from datasets import Dataset - - from .modeling import SetFitModel - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -class SetFitTrainer: - """Trainer to train a SetFit model. - - Args: - model (`SetFitModel`, *optional*): - The model to train. If not provided, a `model_init` must be passed. - train_dataset (`Dataset`): - The training dataset. - eval_dataset (`Dataset`, *optional*): - The evaluation dataset. - model_init (`Callable[[], SetFitModel]`, *optional*): - A function that instantiates the model to be used. If provided, each call to [`~SetFitTrainer.train`] will start - from a new instance of the model as given by this function when a `trial` is passed. - metric (`str` or `Callable`, *optional*, defaults to `"accuracy"`): - The metric to use for evaluation. If a string is provided, we treat it as the metric name and load it with default settings. - If a callable is provided, it must take two arguments (`y_pred`, `y_test`). - loss_class (`nn.Module`, *optional*, defaults to `CosineSimilarityLoss`): - The loss function to use for contrastive training. - num_iterations (`int`, *optional*, defaults to `20`): - The number of iterations to generate sentence pairs for. - This argument is ignored if triplet loss is used. - It is only used in conjunction with `CosineSimilarityLoss`. - num_epochs (`int`, *optional*, defaults to `1`): - The number of epochs to train the Sentence Transformer body for. - learning_rate (`float`, *optional*, defaults to `2e-5`): - The learning rate to use for contrastive training. - batch_size (`int`, *optional*, defaults to `16`): - The batch size to use for contrastive training. 
- seed (`int`, *optional*, defaults to 42): - Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the - [`~SetTrainer.model_init`] function to instantiate the model if it has some randomly initialized parameters. - column_mapping (`Dict[str, str]`, *optional*): - A mapping from the column names in the dataset to the column names expected by the model. The expected format is a dictionary with the following format: {"text_column_name": "text", "label_column_name: "label"}. - use_amp (`bool`, *optional*, defaults to `False`): - Use Automatic Mixed Precision (AMP). Only for Pytorch >= 1.6.0 - warmup_proportion (`float`, *optional*, defaults to `0.1`): - Proportion of the warmup in the total training steps. - Must be greater than or equal to 0.0 and less than or equal to 1.0. - distance_metric (`Callable`, defaults to `BatchHardTripletLossDistanceFunction.cosine_distance`): - Function that returns a distance between two embeddings. - It is set for the triplet loss and - is ignored for `CosineSimilarityLoss` and `SupConLoss`. - margin (`float`, defaults to `0.25`): Margin for the triplet loss. - Negative samples should be at least margin further apart from the anchor than the positive. - This is ignored for `CosineSimilarityLoss`, `BatchHardSoftMarginTripletLoss` and `SupConLoss`. - samples_per_label (`int`, defaults to `2`): Number of consecutive, random and unique samples drawn per label. - This is only relevant for triplet loss and ignored for `CosineSimilarityLoss`. - Batch size should be a multiple of samples_per_label. 
- """ - - def __init__( - self, - model: Optional["SetFitModel"] = None, - train_dataset: Optional["Dataset"] = None, - eval_dataset: Optional["Dataset"] = None, - model_init: Optional[Callable[[], "SetFitModel"]] = None, - metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy", - loss_class: Optional[Any] = None, - num_iterations: int = 20, - num_epochs: int = 1, - learning_rate: float = 2e-5, - batch_size: int = 16, - seed: int = 42, - column_mapping: Optional[Dict[str, str]] = None, - use_amp: bool = False, - warmup_proportion: float = 0.1, - distance_metric: Callable = BatchHardTripletLossDistanceFunction.cosine_distance, - margin: float = 0.25, - samples_per_label: int = 2, - sampling_strategy: int = OVERSAMPLE, - ): - if (warmup_proportion < 0.0) or (warmup_proportion > 1.0): - raise ValueError( - f"warmup_proportion must be greater than or equal to 0.0 and less than or equal to 1.0! But it was: {warmup_proportion}" - ) - - self.train_dataset = train_dataset - self.eval_dataset = eval_dataset - self.model_init = model_init - self.metric = metric - self.loss_class = loss_class - self.num_iterations = num_iterations - self.num_epochs = num_epochs - self.learning_rate = learning_rate - self.batch_size = batch_size - self.seed = seed - self.column_mapping = column_mapping - self.use_amp = use_amp - self.warmup_proportion = warmup_proportion - self.distance_metric = distance_metric - self.margin = margin - self.samples_per_label = samples_per_label - self.sampling_strategy = sampling_strategy - - if model is None: - if model_init is not None: - model = self.call_model_init() - else: - raise RuntimeError("`SetFitTrainer` requires either a `model` or `model_init` argument") - else: - if model_init is not None: - raise RuntimeError("`SetFitTrainer` requires either a `model` or `model_init` argument, but not both") - - self.model = model - self.hp_search_backend = None - self._freeze = True # If True, will train the body only; otherwise, 
train the body and head - - def _validate_column_mapping(self, dataset: "Dataset") -> None: - """ - Validates the provided column mapping against the dataset. - """ - required_columns = {"text", "label"} - column_names = set(dataset.column_names) - if self.column_mapping is None and not required_columns.issubset(column_names): - raise ValueError( - f"A column mapping must be provided when the dataset does not contain the following columns: {required_columns}" - ) - if self.column_mapping is not None: - missing_columns = required_columns.difference(self.column_mapping.values()) - if missing_columns: - raise ValueError( - f"The following columns are missing from the column mapping: {missing_columns}. Please provide a mapping for all required columns." - ) - if not set(self.column_mapping.keys()).issubset(column_names): - raise ValueError( - f"The following columns are missing from the dataset: {set(self.column_mapping.keys()).difference(column_names)}. Please provide a mapping for all required columns." - ) - - def _apply_column_mapping(self, dataset: "Dataset", column_mapping: Dict[str, str]) -> "Dataset": - """ - Applies the provided column mapping to the dataset, renaming columns accordingly. - Extra features not in the column mapping are prefixed with `"feat_"`. 
- """ - dataset = dataset.rename_columns( - { - **column_mapping, - **{col: f"feat_{col}" for col in dataset.column_names if col not in column_mapping}, - } - ) - dset_format = dataset.format - dataset = dataset.with_format( - type=dset_format["type"], - columns=dataset.column_names, - output_all_columns=dset_format["output_all_columns"], - **dset_format["format_kwargs"], - ) - return dataset - - def apply_hyperparameters(self, params: Dict[str, Any], final_model: bool = False): - """Applies a dictionary of hyperparameters to both the trainer and the model - - Args: - params (`Dict[str, Any]`): The parameters, usually from `BestRun.hyperparameters` - final_model (`bool`, *optional*, defaults to `False`): If `True`, replace the `model_init()` function with a fixed model based on the parameters. - """ - for key, value in params.items(): - if hasattr(self, key): - old_attr = getattr(self, key, None) - # Casting value to the proper type - if old_attr is not None: - value = type(old_attr)(value) - setattr(self, key, value) - elif number_of_arguments(self.model_init) == 0: # we do not warn if model_init could be using it - logger.warning( - f"Trying to set {key!r} in the hyperparameter search but there is no corresponding field in " - "`SetFitTrainer`, and `model_init` does not take any arguments." 
- ) - - self.model = self.model_init(params) - if final_model: - self.model_init = None - - def _hp_search_setup(self, trial: Union["optuna.Trial", Dict[str, Any]]): - """HP search setup code""" - - # Heavily inspired by transformers.Trainer._hp_search_setup - if self.hp_search_backend is None or trial is None: - return - - if isinstance(trial, Dict): # For passing a Dict to train() -- mostly unused for now - params = trial - elif self.hp_search_backend == HPSearchBackend.OPTUNA: - params = self.hp_space(trial) - else: - raise ValueError("Invalid trial parameter") - - logger.info(f"Trial: {params}") - self.apply_hyperparameters(params, final_model=False) - - def call_model_init(self, params: Optional[Dict[str, Any]] = None): - model_init_argcount = number_of_arguments(self.model_init) - if model_init_argcount == 0: - model = self.model_init() - elif model_init_argcount == 1: - model = self.model_init(params) - else: - raise RuntimeError("`model_init` should have 0 or 1 argument.") - - if model is None: - raise RuntimeError("`model_init` should not return None.") - - return model - - def freeze(self): - """ - Freeze SetFitModel's differentiable head. - Note: call this function only when using the differentiable head. - """ - if not self.model.has_differentiable_head: - raise ValueError("Please use the differentiable head in `SetFitModel` when calling this function.") - - self._freeze = True # Currently use self._freeze as a switch - self.model.freeze("head") - - def unfreeze(self, keep_body_frozen: bool = False): - """ - Unfreeze SetFitModel's differentiable head. - Note: call this function only when using the differentiable head. - - Args: - keep_body_frozen (`bool`, *optional*, defaults to `False`): - Whether to freeze the body when unfreeze the head. 
- """ - if not self.model.has_differentiable_head: - raise ValueError("Please use the differentiable head in `SetFitModel` when calling this function.") - - self._freeze = False # Currently use self._freeze as a switch - self.model.unfreeze("head") - if keep_body_frozen: - self.model.freeze("body") - else: # ensure to unfreeze the body - self.model.unfreeze("body") - - def train( - self, - num_epochs: Optional[int] = None, - batch_size: Optional[int] = None, - learning_rate: Optional[float] = None, - body_learning_rate: Optional[float] = None, - l2_weight: Optional[float] = None, - max_length: Optional[int] = None, - trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, - show_progress_bar: bool = True, - ): - """ - Main training entry point. - - Args: - num_epochs (`int`, *optional*): - Temporary change the number of epochs to train the Sentence Transformer body/head for. - If ignore, will use the value given in initialization. - batch_size (`int`, *optional*): - Temporary change the batch size to use for contrastive training or logistic regression. - If ignore, will use the value given in initialization. - learning_rate (`float`, *optional*): - Temporary change the learning rate to use for contrastive training or SetFitModel's head in logistic regression. - If ignore, will use the value given in initialization. - body_learning_rate (`float`, *optional*): - Temporary change the learning rate to use for SetFitModel's body in logistic regression only. - If ignore, will be the same as `learning_rate`. - l2_weight (`float`, *optional*): - Temporary change the weight of L2 regularization for SetFitModel's differentiable head in logistic regression. - max_length (int, *optional*, defaults to `None`): - The maximum number of tokens for one data sample. Currently only for training the differentiable head. - If `None`, will use the maximum number of tokens the model body can accept. 
- If `max_length` is greater than the maximum number of acceptable tokens the model body can accept, it will be set to the maximum number of acceptable tokens. - trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): - The trial run or the hyperparameter dictionary for hyperparameter search. - show_progress_bar (`bool`, *optional*, defaults to `True`): - Whether to show a bar that indicates training progress. - """ - set_seed(self.seed) # Seed must be set before instantiating the model when using model_init. - - if trial: # Trial and model initialization - self._hp_search_setup(trial) # sets trainer parameters and initializes model - - if self.train_dataset is None: - raise ValueError("Training requires a `train_dataset` given to the `SetFitTrainer` initialization.") - - self._validate_column_mapping(self.train_dataset) - train_dataset = self.train_dataset - if self.column_mapping is not None: - logger.info("Applying column mapping to training dataset") - train_dataset = self._apply_column_mapping(self.train_dataset, self.column_mapping) - - if self.loss_class is None: - logger.warning("No `loss_class` detected! 
Using `CosineSimilarityLoss` as the default.") - self.loss_class = losses.CosineSimilarityLoss - - multilabel = True if self.model.multi_target_strategy is not None else False - - num_epochs = num_epochs or self.num_epochs - batch_size = batch_size or self.batch_size - learning_rate = learning_rate or self.learning_rate - - # dataset generation - x_train = train_dataset["text"] - y_train = train_dataset["label"] - train_examples = [InputExample(texts=[text], label=label) for text, label in zip(x_train, y_train)] - - if not self.model.has_differentiable_head or self._freeze: - # sentence-transformers adaptation - if self.loss_class in [ - losses.BatchAllTripletLoss, - losses.BatchHardTripletLoss, - losses.BatchSemiHardTripletLoss, - losses.BatchHardSoftMarginTripletLoss, - SupConLoss, - ]: - train_data_sampler = SentenceLabelDataset(train_examples, samples_per_label=self.samples_per_label) - batch_size = min(batch_size, len(train_data_sampler)) - train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=True) - else: - train_data_sampler = ConstrastiveDataset(train_examples, multilabel, self.sampling_strategy) - train_dataloader = DataLoader(train_data_sampler, batch_size=batch_size, drop_last=False) - - total_train_steps = len(train_dataloader) * num_epochs - logger.info("***** Running training *****") - logger.info(f" Num examples per epoch = {len(train_data_sampler)}") - logger.info(f" Num epochs = {num_epochs}") - logger.info(f" Total optimization steps = {total_train_steps}") - logger.info(f" Total train batch size = {batch_size}") - - # setup training loss - if self.loss_class in [ - losses.BatchAllTripletLoss, - losses.BatchHardTripletLoss, - losses.BatchSemiHardTripletLoss, - losses.BatchHardSoftMarginTripletLoss, - ]: - train_loss = self.loss_class( - model=self.model.model_body, - distance_metric=self.distance_metric, - margin=self.margin, - ) - elif self.loss_class is losses.BatchHardSoftMarginTripletLoss: - train_loss = 
self.loss_class( - model=self.model.model_body, - distance_metric=self.distance_metric, - ) - else: - train_loss = self.loss_class(model=self.model.model_body) - - warmup_steps = math.ceil(total_train_steps * self.warmup_proportion) - self.model.model_body.fit( - train_objectives=[(train_dataloader, train_loss)], - epochs=num_epochs, - optimizer_params={"lr": learning_rate}, - warmup_steps=warmup_steps, - show_progress_bar=show_progress_bar, - use_amp=self.use_amp, - ) - - if not self.model.has_differentiable_head or not self._freeze: - # Train the final classifier - self.model.fit( - x_train, - y_train, - num_epochs=num_epochs, - batch_size=batch_size, - learning_rate=learning_rate, - body_learning_rate=body_learning_rate, - l2_weight=l2_weight, - max_length=max_length, - show_progress_bar=True, - ) - - def evaluate(self): - """ - Computes the metrics for a given classifier. - - Returns: - `Dict[str, float]`: The evaluation metrics. - """ - - self._validate_column_mapping(self.eval_dataset) - eval_dataset = self.eval_dataset - - if self.column_mapping is not None: - logger.info("Applying column mapping to evaluation dataset") - eval_dataset = self._apply_column_mapping(self.eval_dataset, self.column_mapping) - - x_test = eval_dataset["text"] - y_test = eval_dataset["label"] - - logger.info("***** Running evaluation *****") - y_pred = self.model.predict(x_test) - - if isinstance(self.metric, str): - metric_config = "multilabel" if self.model.multi_target_strategy is not None else None - metric_fn = evaluate.load(self.metric, config_name=metric_config) - - return metric_fn.compute(predictions=y_pred, references=y_test) - - elif callable(self.metric): - return self.metric(y_pred, y_test) - - else: - raise ValueError("metric must be a string or a callable") - - def hyperparameter_search( - self, - hp_space: Optional[Callable[["optuna.Trial"], Dict[str, float]]] = None, - compute_objective: Optional[Callable[[Dict[str, float]], float]] = None, - n_trials: int = 10, - 
direction: str = "maximize", - backend: Optional[Union["str", HPSearchBackend]] = None, - hp_name: Optional[Callable[["optuna.Trial"], str]] = None, - **kwargs, - ) -> BestRun: - """ - Launch a hyperparameter search using `optuna`. The optimized quantity is determined - by `compute_objective`, which defaults to a function returning the evaluation loss when no metric is provided, - the sum of all metrics otherwise. - - - - To use this method, you need to have provided a `model_init` when initializing your [`SetFitTrainer`]: we need to - reinitialize the model at each new run. - - - - Args: - hp_space (`Callable[["optuna.Trial"], Dict[str, float]]`, *optional*): - A function that defines the hyperparameter search space. Will default to - [`~trainer_utils.default_hp_space_optuna`]. - compute_objective (`Callable[[Dict[str, float]], float]`, *optional*): - A function computing the objective to minimize or maximize from the metrics returned by the `evaluate` - method. Will default to [`~trainer_utils.default_compute_objective`] which uses the sum of metrics. - n_trials (`int`, *optional*, defaults to 100): - The number of trial runs to test. - direction (`str`, *optional*, defaults to `"maximize"`): - Whether to optimize greater or lower objects. Can be `"minimize"` or `"maximize"`, you should pick - `"minimize"` when optimizing the validation loss, `"maximize"` when optimizing one or several metrics. - backend (`str` or [`~training_utils.HPSearchBackend`], *optional*): - The backend to use for hyperparameter search. Only optuna is supported for now. - TODO: add support for ray and sigopt. - hp_name (`Callable[["optuna.Trial"], str]]`, *optional*): - A function that defines the trial/run name. Will default to None. - kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments passed along to `optuna.create_study`. 
For more - information see: - - - the documentation of - [optuna.create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html) - - Returns: - [`trainer_utils.BestRun`]: All the information about the best run. - """ - if backend is None: - backend = default_hp_search_backend() - if backend is None: - raise RuntimeError("optuna should be installed. " "To install optuna run `pip install optuna`. ") - backend = HPSearchBackend(backend) - if backend == HPSearchBackend.OPTUNA and not is_optuna_available(): - raise RuntimeError("You picked the optuna backend, but it is not installed. Use `pip install optuna`.") - elif backend != HPSearchBackend.OPTUNA: - raise RuntimeError("Only optuna backend is supported for hyperparameter search.") - self.hp_search_backend = backend - if self.model_init is None: - raise RuntimeError( - "To use hyperparameter search, you need to pass your model through a model_init function." - ) - - self.hp_space = default_hp_space_optuna if hp_space is None else hp_space - self.hp_name = hp_name - self.compute_objective = default_compute_objective if compute_objective is None else compute_objective - - backend_dict = { - HPSearchBackend.OPTUNA: run_hp_search_optuna, - } - best_run = backend_dict[backend](self, n_trials, direction, **kwargs) - - self.hp_search_backend = None - return best_run - - def push_to_hub( - self, - repo_path_or_name: Optional[str] = None, - repo_url: Optional[str] = None, - commit_message: Optional[str] = "Add SetFit model", - organization: Optional[str] = None, - private: Optional[bool] = None, - api_endpoint: Optional[str] = None, - use_auth_token: Optional[Union[bool, str]] = None, - git_user: Optional[str] = None, - git_email: Optional[str] = None, - config: Optional[dict] = None, - skip_lfs_files: bool = False, - ): - - return self.model.push_to_hub( - repo_path_or_name, - repo_url, - commit_message, - organization, - private, - api_endpoint, - use_auth_token, - git_user, - 
git_email, - config, - skip_lfs_files, - ) diff --git a/tests/test_modeling.py b/tests/test_modeling.py index c31417d2..844ec6e6 100644 --- a/tests/test_modeling.py +++ b/tests/test_modeling.py @@ -16,36 +16,6 @@ torch_cuda_available = pytest.mark.skipif(not torch.cuda.is_available(), reason="PyTorch must be compiled with CUDA") -def test_sentence_pairs_generation(): - sentences = np.array(["sent 1", "sent 2", "sent 3"]) - labels = np.array(["label 1", "label 2", "label 3"]) - - pairs = [] - n_iterations = 2 - - for _ in range(n_iterations): - pairs = sentence_pairs_generation(sentences, labels, pairs) - - assert len(pairs) == 12 - assert pairs[0].texts == ["sent 1", "sent 1"] - assert pairs[0].label == 1.0 - - -def test_sentence_pairs_generation_multilabel(): - sentences = np.array(["sent 1", "sent 2", "sent 3"]) - labels = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]]) - - pairs = [] - n_iterations = 2 - - for _ in range(n_iterations): - pairs = sentence_pairs_generation_multilabel(sentences, labels, pairs) - - assert len(pairs) == 12 - assert pairs[0].texts == ["sent 1", "sent 1"] - assert pairs[0].label == 1.0 - - def test_setfit_model_body(): model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") diff --git a/tests/test_sampler.py b/tests/test_sampler.py new file mode 100644 index 00000000..67e1bf07 --- /dev/null +++ b/tests/test_sampler.py @@ -0,0 +1,50 @@ +import pytest +import numpy as np + +from sentence_transformers import InputExample + +from setfit.sampler import ConstrastiveDataset + + +@pytest.mark.parametrize("sampling_strategy, expected_pos_pairs, expected_neg_pairs", [ + ("unique", 4, 2), + ("undersampling", 2, 2), + ("oversampling", 4, 4) +]) +def test_sentence_pairs_generation(sampling_strategy: str, expected_pos_pairs: int, expected_neg_pairs: int): + sentences = np.array(["sent 1", "sent 2", "sent 3"]) + labels = np.array(["label 1", "label 1", "label 2"]) + + data = [InputExample(texts=[text], 
label=label) for text, label in zip(sentences, labels)] + multilabel = False + + data_sampler = ConstrastiveDataset(data, multilabel, sampling_strategy=sampling_strategy) + + assert data_sampler.len_pos_pairs == expected_pos_pairs + assert data_sampler.len_neg_pairs == expected_neg_pairs + + pairs = [i for i in data_sampler] + + assert len(pairs) == expected_pos_pairs + expected_neg_pairs + assert pairs[0].texts == ["sent 1", "sent 1"] + assert pairs[0].label == 1.0 + + +@pytest.mark.parametrize("sampling_strategy, expected_pos_pairs, expected_neg_pairs", [ + ("unique", 6, 4), + ("undersampling", 4, 4), + ("oversampling", 6, 6) +]) +def test_sentence_pairs_generation_multilabel(sampling_strategy: str, expected_pos_pairs: int, expected_neg_pairs: int): + sentences = np.array(["sent 1", "sent 2", "sent 3", "sent 4"]) + labels = np.array([[1, 0, 0, 1], [0, 1, 1, 0], [0, 0, 1, 0], [0, 0, 0, 1]]) + + data = [InputExample(texts=[text], label=label) for text, label in zip(sentences, labels)] + multilabel = True + + data_sampler = ConstrastiveDataset(data, multilabel, sampling_strategy=sampling_strategy) + assert data_sampler.len_pos_pairs == expected_pos_pairs + assert data_sampler.len_neg_pairs == expected_neg_pairs + + pairs = [i for i in data_sampler] + assert len(pairs) == expected_pos_pairs + expected_neg_pairs From 131aa267b2570296b4c29e6d92d614eebb6ef03b Mon Sep 17 00:00:00 2001 From: danstan5 Date: Thu, 19 Oct 2023 12:39:03 +0100 Subject: [PATCH 49/77] add sampling_strategy into TrainingArguments --- src/setfit/trainer.py | 8 +++++--- src/setfit/training_args.py | 22 +++++++++++++++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 5d7c2a17..a4c841b4 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -447,10 +447,12 @@ def get_dataloader(self, x: List[str], y: Union[List[int], List[List[int]]], arg ) else: data_sampler = ConstrastiveDataset( - input_data, 
self.model.multi_target_strategy, args.num_iterations - ) # sets default sampling_strategy="oversampling" + input_data, self.model.multi_target_strategy, args.num_iterations, args.sampling_strategy + ) + # shuffle_sampler = True can be dropped in for further 'randomising' + shuffle_sampler = True if args.sampling_strategy == "unique" else False batch_size = min(args.embedding_batch_size, len(data_sampler)) - dataloader = DataLoader(data_sampler, batch_size=batch_size, drop_last=False) # shuffle=True can be dropped in for 'randomising' + dataloader = DataLoader(data_sampler, batch_size=batch_size, shuffle=shuffle_sampler, drop_last=False) loss = args.loss(self.model.model_body) return dataloader, loss, batch_size diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 3ba751cb..53f04818 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -30,10 +30,25 @@ class TrainingArguments: Set the number of epochs the embedding and classifier training phases respectively, or set both if an integer is provided. Note that the number of epochs for the classifier is only used with a differentiable PyTorch head. - num_iterations (`int`, defaults to `20`): - The number of iterations to generate sentence pairs for. + num_iterations (`int`, *optional*): + If not set the `sampling_strategy` will determine the number of sentence pairs to generate. 
+ This argument sets the number of iterations to generate sentence pairs for + and provides compatability with Setfit Date: Thu, 19 Oct 2023 12:54:13 +0100 Subject: [PATCH 50/77] num_iterations removed from TrainingArguments --- src/setfit/trainer.py | 11 +++++++++-- src/setfit/training_args.py | 11 ++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index fd867f21..e744e0fe 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -95,6 +95,11 @@ class Trainer: A mapping from the column names in the dataset to the column names expected by the model. The expected format is a dictionary with the following format: `{"text_column_name": "text", "label_column_name: "label"}`. + num_iterations (`int`, *optional*): + This argument sets the number of iterations to generate sentence pairs for + and provides compatability with Setfit None: self.args = args or TrainingArguments() self.train_dataset = train_dataset @@ -118,6 +124,7 @@ def __init__( self.metric = metric self.metric_kwargs = metric_kwargs self.column_mapping = column_mapping + self.num_iterations = num_iterations if model is None: if model_init is not None: @@ -450,7 +457,7 @@ def get_dataloader( ) else: data_sampler = ConstrastiveDataset( - input_data, self.model.multi_target_strategy, args.num_iterations, args.sampling_strategy + input_data, self.model.multi_target_strategy, self.num_iterations, args.sampling_strategy ) # shuffle_sampler = True can be dropped in for further 'randomising' shuffle_sampler = True if args.sampling_strategy == "unique" else False @@ -905,7 +912,6 @@ def __init__( stacklevel=2, ) args = TrainingArguments( - num_iterations=num_iterations, num_epochs=num_epochs, body_learning_rate=learning_rate, head_learning_rate=learning_rate, @@ -927,4 +933,5 @@ def __init__( metric=metric, metric_kwargs=metric_kwargs, column_mapping=column_mapping, + num_iterations=num_iterations ) diff --git 
a/src/setfit/training_args.py b/src/setfit/training_args.py index 53f04818..71526c87 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -30,12 +30,6 @@ class TrainingArguments: Set the number of epochs the embedding and classifier training phases respectively, or set both if an integer is provided. Note that the number of epochs for the classifier is only used with a differentiable PyTorch head. - num_iterations (`int`, *optional*): - If not set the `sampling_strategy` will determine the number of sentence pairs to generate. - This argument sets the number of iterations to generate sentence pairs for - and provides compatability with Setfit Date: Fri, 20 Oct 2023 14:27:32 +0100 Subject: [PATCH 51/77] run_fewshot compatible with Date: Wed, 25 Oct 2023 13:44:21 +0200 Subject: [PATCH 52/77] Run make style --- src/setfit/trainer.py | 4 ++-- src/setfit/training_args.py | 6 +++--- tests/test_sampler.py | 29 ++++++++++++++--------------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index e744e0fe..cdda7adc 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -462,7 +462,7 @@ def get_dataloader( # shuffle_sampler = True can be dropped in for further 'randomising' shuffle_sampler = True if args.sampling_strategy == "unique" else False batch_size = min(args.embedding_batch_size, len(data_sampler)) - dataloader = DataLoader(data_sampler, batch_size=batch_size, shuffle=shuffle_sampler, drop_last=False) + dataloader = DataLoader(data_sampler, batch_size=batch_size, shuffle=shuffle_sampler, drop_last=False) loss = args.loss(self.model.model_body) return dataloader, loss, batch_size @@ -933,5 +933,5 @@ def __init__( metric=metric, metric_kwargs=metric_kwargs, column_mapping=column_mapping, - num_iterations=num_iterations + num_iterations=num_iterations, ) diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 71526c87..85edd757 100644 --- 
a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -33,15 +33,15 @@ class TrainingArguments: sampling_strategy (`str`, defaults to `"oversampling"`): The sampling strategy of how to draw pairs in training. Possible values are: - - `"oversampling"`: Draws even number of positive/ negative sentence pairs until every + - `"oversampling"`: Draws even number of positive/ negative sentence pairs until every sentence pair has been drawn. - `"undersampling"`: Draws the minimum number of positive/ negative sentence pairs until every sentence pair in the minority class has been drawn. - - `"unique"`: Draws every sentence pair combination (likely resulting in unbalanced + - `"unique"`: Draws every sentence pair combination (likely resulting in unbalanced number of positive/ negative sentence pairs). The default is set to `"oversampling"` ensuring all sentence pairs are drawn at least once. - Alternatively setting the num_iterations in the SetFitTrainer class will override this + Alternatively setting the num_iterations in the SetFitTrainer class will override this argument and determine the number of generated sentence pairs. 
body_learning_rate (`Union[float, Tuple[float, float]]`, defaults to `(2e-5, 1e-5)`): Set the learning rate for the `SentenceTransformer` body for the embedding and classifier diff --git a/tests/test_sampler.py b/tests/test_sampler.py index 67e1bf07..c3207592 100644 --- a/tests/test_sampler.py +++ b/tests/test_sampler.py @@ -1,16 +1,14 @@ -import pytest import numpy as np - +import pytest from sentence_transformers import InputExample from setfit.sampler import ConstrastiveDataset -@pytest.mark.parametrize("sampling_strategy, expected_pos_pairs, expected_neg_pairs", [ - ("unique", 4, 2), - ("undersampling", 2, 2), - ("oversampling", 4, 4) -]) +@pytest.mark.parametrize( + "sampling_strategy, expected_pos_pairs, expected_neg_pairs", + [("unique", 4, 2), ("undersampling", 2, 2), ("oversampling", 4, 4)], +) def test_sentence_pairs_generation(sampling_strategy: str, expected_pos_pairs: int, expected_neg_pairs: int): sentences = np.array(["sent 1", "sent 2", "sent 3"]) labels = np.array(["label 1", "label 1", "label 2"]) @@ -22,7 +20,7 @@ def test_sentence_pairs_generation(sampling_strategy: str, expected_pos_pairs: i assert data_sampler.len_pos_pairs == expected_pos_pairs assert data_sampler.len_neg_pairs == expected_neg_pairs - + pairs = [i for i in data_sampler] assert len(pairs) == expected_pos_pairs + expected_neg_pairs @@ -30,12 +28,13 @@ def test_sentence_pairs_generation(sampling_strategy: str, expected_pos_pairs: i assert pairs[0].label == 1.0 -@pytest.mark.parametrize("sampling_strategy, expected_pos_pairs, expected_neg_pairs", [ - ("unique", 6, 4), - ("undersampling", 4, 4), - ("oversampling", 6, 6) -]) -def test_sentence_pairs_generation_multilabel(sampling_strategy: str, expected_pos_pairs: int, expected_neg_pairs: int): +@pytest.mark.parametrize( + "sampling_strategy, expected_pos_pairs, expected_neg_pairs", + [("unique", 6, 4), ("undersampling", 4, 4), ("oversampling", 6, 6)], +) +def test_sentence_pairs_generation_multilabel( + sampling_strategy: str, 
expected_pos_pairs: int, expected_neg_pairs: int +): sentences = np.array(["sent 1", "sent 2", "sent 3", "sent 4"]) labels = np.array([[1, 0, 0, 1], [0, 1, 1, 0], [0, 0, 1, 0], [0, 0, 0, 1]]) @@ -45,6 +44,6 @@ def test_sentence_pairs_generation_multilabel(sampling_strategy: str, expected_p data_sampler = ConstrastiveDataset(data, multilabel, sampling_strategy=sampling_strategy) assert data_sampler.len_pos_pairs == expected_pos_pairs assert data_sampler.len_neg_pairs == expected_neg_pairs - + pairs = [i for i in data_sampler] assert len(pairs) == expected_pos_pairs + expected_neg_pairs From 978daeed6965146caf03c2092e6325a746c8e5f3 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 25 Oct 2023 13:48:15 +0200 Subject: [PATCH 53/77] Use "no" as the default evaluation_strategy This matches the default from 'transformers' --- src/setfit/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 85edd757..a118a151 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -189,7 +189,7 @@ class TrainingArguments: logging_first_step: bool = True logging_steps: int = 5 - evaluation_strategy: str = "steps" + evaluation_strategy: str = "no" eval_steps: Optional[int] = None eval_delay: int = 0 From 2802a3f8f225734b0bdb9758472e14510405b0c1 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 25 Oct 2023 14:08:04 +0200 Subject: [PATCH 54/77] Move num_iterations back to TrainingArguments --- src/setfit/trainer.py | 11 ++--------- src/setfit/training_args.py | 13 ++++++++++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index cdda7adc..55e72fcf 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -95,11 +95,6 @@ class Trainer: A mapping from the column names in the dataset to the column names expected by the model. 
The expected format is a dictionary with the following format: `{"text_column_name": "text", "label_column_name: "label"}`. - num_iterations (`int`, *optional*): - This argument sets the number of iterations to generate sentence pairs for - and provides compatability with Setfit None: self.args = args or TrainingArguments() self.train_dataset = train_dataset @@ -124,7 +118,6 @@ def __init__( self.metric = metric self.metric_kwargs = metric_kwargs self.column_mapping = column_mapping - self.num_iterations = num_iterations if model is None: if model_init is not None: @@ -457,7 +450,7 @@ def get_dataloader( ) else: data_sampler = ConstrastiveDataset( - input_data, self.model.multi_target_strategy, self.num_iterations, args.sampling_strategy + input_data, self.model.multi_target_strategy, args.num_iterations, args.sampling_strategy ) # shuffle_sampler = True can be dropped in for further 'randomising' shuffle_sampler = True if args.sampling_strategy == "unique" else False @@ -912,6 +905,7 @@ def __init__( stacklevel=2, ) args = TrainingArguments( + num_iterations=num_iterations, num_epochs=num_epochs, body_learning_rate=learning_rate, head_learning_rate=learning_rate, @@ -933,5 +927,4 @@ def __init__( metric=metric, metric_kwargs=metric_kwargs, column_mapping=column_mapping, - num_iterations=num_iterations, ) diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index a118a151..6a13ef80 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -40,9 +40,15 @@ class TrainingArguments: - `"unique"`: Draws every sentence pair combination (likely resulting in unbalanced number of positive/ negative sentence pairs). - The default is set to `"oversampling"` ensuring all sentence pairs are drawn at least once. - Alternatively setting the num_iterations in the SetFitTrainer class will override this - argument and determine the number of generated sentence pairs. 
+ The default is set to `"oversampling"`, ensuring all sentence pairs are drawn at least once. + Alternatively setting `num_iterations` will override this argument and determine the number + of generated sentence pairs. + num_iterations (`int`, *optional*): + If not set the `sampling_strategy` will determine the number of sentence pairs to generate. + This argument sets the number of iterations to generate sentence pairs for + and provides compatability with Setfit Date: Wed, 25 Oct 2023 14:34:54 +0200 Subject: [PATCH 55/77] Fix broken trainer tests due to new default sampling --- tests/test_trainer.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 1a75fefb..2c699ea2 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -548,7 +548,7 @@ def test_train_no_dataset(model: SetFitModel): def test_train_amp_save(model: SetFitModel, tmp_path): - args = TrainingArguments(output_dir=tmp_path, use_amp=True, save_steps=5) + args = TrainingArguments(output_dir=tmp_path, use_amp=True, save_steps=5, num_epochs=5) dataset = Dataset.from_dict({"text": ["a", "b", "c"], "label": [0, 1, 2]}) trainer = Trainer(model, args=args, train_dataset=dataset, eval_dataset=dataset) trainer.train() @@ -557,7 +557,14 @@ def test_train_amp_save(model: SetFitModel, tmp_path): def test_train_load_best(model: SetFitModel, tmp_path, caplog): - args = TrainingArguments(output_dir=tmp_path, save_steps=5, eval_steps=5, load_best_model_at_end=True) + args = TrainingArguments( + output_dir=tmp_path, + save_steps=5, + eval_steps=5, + evaluation_strategy="steps", + load_best_model_at_end=True, + num_epochs=5, + ) dataset = Dataset.from_dict({"text": ["a", "b", "c"], "label": [0, 1, 2]}) trainer = Trainer(model, args=args, train_dataset=dataset, eval_dataset=dataset) with caplog.at_level(logging.INFO): From f8b7253348e9f76534b6df2843926ca758031bad Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 25 Oct 2023 
16:15:14 +0200 Subject: [PATCH 56/77] Use the Contrastive Dataset for Distillation --- src/setfit/sampler.py | 57 ++++++++------ src/setfit/trainer.py | 20 +++-- src/setfit/trainer_distillation.py | 122 ++++++----------------------- tests/test_sampler.py | 6 +- 4 files changed, 71 insertions(+), 134 deletions(-) diff --git a/src/setfit/sampler.py b/src/setfit/sampler.py index 54d72e46..1bea2e78 100644 --- a/src/setfit/sampler.py +++ b/src/setfit/sampler.py @@ -3,6 +3,7 @@ import numpy as np from sentence_transformers import InputExample +import torch from torch.utils.data import IterableDataset from . import logging @@ -12,28 +13,6 @@ logger = logging.get_logger(__name__) -def sentence_pairs_generation_cos_sim(sentences, pairs, cos_sim_matrix): - # initialize two empty lists to hold the (sentence, sentence) pairs and - # labels to indicate if a pair is positive or negative - - idx = list(range(len(sentences))) - - for first_idx in range(len(sentences)): - current_sentence = sentences[first_idx] - second_idx = int(np.random.choice([x for x in idx if x != first_idx])) - - cos_sim = float(cos_sim_matrix[first_idx][second_idx]) - paired_sentence = sentences[second_idx] - pairs.append(InputExample(texts=[current_sentence, paired_sentence], label=cos_sim)) - - third_idx = np.random.choice([x for x in idx if x != first_idx]) - cos_sim = float(cos_sim_matrix[first_idx][third_idx]) - paired_sentence = sentences[third_idx] - pairs.append(InputExample(texts=[current_sentence, paired_sentence], label=cos_sim)) - - return pairs - - def shuffle_combinations(iterable: Iterable, replacement: bool = True) -> Generator: """Generates shuffled pair combinations for any iterable data provided. 
@@ -53,10 +32,10 @@ def shuffle_combinations(iterable: Iterable, replacement: bool = True) -> Genera yield iterable[_idx], iterable[idx] -class ConstrastiveDataset(IterableDataset): +class ContrastiveDataset(IterableDataset): def __init__( self, - examples: InputExample, + examples: List[InputExample], multilabel: bool, num_iterations: Optional[None] = None, sampling_strategy: str = "oversampling", @@ -145,3 +124,33 @@ def __iter__(self): def __len__(self) -> int: return self.len_pos_pairs + self.len_neg_pairs + + +class ContrastiveDistillationDataset(ContrastiveDataset): + def __init__( + self, + examples: List[InputExample], + cos_sim_matrix: torch.Tensor, + num_iterations: Optional[None] = None, + sampling_strategy: str = "oversampling", + ) -> None: + self.cos_sim_matrix = cos_sim_matrix + super().__init__( + examples, + multilabel=False, + num_iterations=num_iterations, + sampling_strategy=sampling_strategy, + ) + # Internally we store all pairs in pos_pairs, regardless of sampling strategy. + # After all, without labels, there isn't much of a strategy. 
+ self.sentence_labels = list(enumerate(self.sentences)) + + self.len_neg_pairs = 0 + if num_iterations is not None and num_iterations > 0: + self.len_pos_pairs = num_iterations * len(self.sentences) + else: + self.len_pos_pairs = len(self.pos_pairs) + + def generate_pairs(self) -> None: + for (text_one, id_one), (text_two, id_two) in shuffle_combinations(self.sentence_labels): + self.pos_pairs.append(InputExample(texts=[text_one, text_two], label=self.cos_sim_matrix[id_one][id_two])) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 55e72fcf..606c9c37 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -3,7 +3,7 @@ import time import warnings from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import evaluate import torch @@ -38,7 +38,7 @@ from . import logging from .integrations import default_hp_search_backend, is_optuna_available, run_hp_search_optuna from .losses import SupConLoss -from .sampler import ConstrastiveDataset +from .sampler import ContrastiveDataset from .training_args import TrainingArguments from .utils import BestRun, default_hp_space_optuna @@ -367,17 +367,21 @@ def train( logger.info(f"Applying column mapping to {dataset_name} dataset") dataset = self._apply_column_mapping(dataset, self.column_mapping) - parameters.extend([dataset["text"], dataset["label"]]) + parameters.extend(self.dataset_to_parameters(dataset)) self.train_embeddings(*parameters, args=args) - self.train_classifier(*parameters[:2], args=args) + training_parameters = parameters[:len(parameters) // 2] if self.eval_dataset else parameters + self.train_classifier(*training_parameters, args=args) + + def dataset_to_parameters(self, dataset: Dataset) -> List[Iterable]: + return [dataset["text"], dataset["label"]] def train_embeddings( self, x_train: List[str], - y_train: Union[List[int], 
List[List[int]]], - x_eval: List[str] = None, - y_eval: Union[List[int], List[List[int]]] = None, + y_train: Optional[Union[List[int], List[List[int]]]] = None, + x_eval: Optional[List[str]] = None, + y_eval: Optional[Union[List[int], List[List[int]]]] = None, args: Optional[TrainingArguments] = None, ) -> None: """ @@ -449,7 +453,7 @@ def get_dataloader( margin=args.margin, ) else: - data_sampler = ConstrastiveDataset( + data_sampler = ContrastiveDataset( input_data, self.model.multi_target_strategy, args.num_iterations, args.sampling_strategy ) # shuffle_sampler = True can be dropped in for further 'randomising' diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index faebe43c..2bc2d629 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -1,23 +1,19 @@ -import math import warnings -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Optional, Tuple, Union -import numpy as np +from datasets import Dataset import torch -from sentence_transformers import losses, util +from sentence_transformers import losses, util, InputExample +from torch import nn from torch.utils.data import DataLoader -from transformers.trainer_utils import set_seed from . import logging -from .sampler import sentence_pairs_generation_cos_sim +from .sampler import ContrastiveDistillationDataset from .trainer import Trainer from .training_args import TrainingArguments if TYPE_CHECKING: - import optuna - from datasets import Dataset - from .modeling import SetFitModel logging.set_verbosity_info() @@ -78,99 +74,27 @@ def __init__( self.teacher_model = teacher_model self.student_model = self.model - def train( - self, - args: Optional[TrainingArguments] = None, - trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, - **kwargs, - ) -> None: - """ - Main training entry point. 
- - Args: - args (`TrainingArguments`, *optional*): - Temporarily change the training arguments for this training call. - trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): - The trial run or the hyperparameter dictionary for hyperparameter search. - """ - if len(kwargs): - warnings.warn( - f"`{self.__class__.__name__}.train` does not accept keyword arguments anymore. " - f"Please provide training arguments via a `TrainingArguments` instance to the `{self.__class__.__name__}` " - f"initialisation or the `{self.__class__.__name__}.train` method.", - DeprecationWarning, - stacklevel=2, - ) - - args = args or self.args or TrainingArguments() - - set_seed(args.seed) # Seed must be set before instantiating the model when using model_init. - - if trial: # Trial and model initialization - self._hp_search_setup(trial) # sets trainer parameters and initializes model - - if self.train_dataset is None: - raise ValueError( - f"Training requires a `train_dataset` given to the `{self.__class__.__name__}` initialization." - ) - - self._validate_column_mapping(self.train_dataset) - train_dataset = self.train_dataset - if self.column_mapping is not None: - logger.info("Applying column mapping to training dataset") - train_dataset = self._apply_column_mapping(self.train_dataset, self.column_mapping) + def dataset_to_parameters(self, dataset: Dataset) -> List[Iterable]: + return [dataset["text"]] - x_train: List[str] = train_dataset["text"] - - self.train_embeddings(x_train, args) - self.train_classifier(x_train, args) - - def train_embeddings( - self, - x_train: List[str], - args: Optional[TrainingArguments] = None, - ) -> None: - """ - Method to perform the embedding phase: finetuning the student its `SentenceTransformer` body. - - Args: - x_train (`List[str]`): A list of training sentences. - args (`TrainingArguments`, *optional*): - Temporarily change the training arguments for this training call. 
- """ - args = args or self.args or TrainingArguments() - - # **************** student training ********************* - x_train_embd_student = self.teacher_model.model_body.encode( - x_train, convert_to_tensor=self.teacher_model.has_differentiable_head + def get_dataloader( + self, x: List[str], y: Optional[Union[List[int], List[List[int]]]], args: TrainingArguments + ) -> Tuple[DataLoader, nn.Module, int]: + x_embd_student = self.teacher_model.model_body.encode( + x, convert_to_tensor=self.teacher_model.has_differentiable_head ) - cos_sim_matrix = util.cos_sim(x_train_embd_student, x_train_embd_student) - - train_examples = [] - for _ in range(args.num_iterations): - train_examples = sentence_pairs_generation_cos_sim(np.array(x_train), train_examples, cos_sim_matrix) - # **************** student training END ***************** - - batch_size = args.embedding_batch_size - train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size) - train_loss = args.loss(self.student_model.model_body) - - total_train_steps = len(train_dataloader) * args.embedding_num_epochs - logger.info("***** Running training *****") - logger.info(f" Num examples = {len(train_examples)}") - logger.info(f" Num epochs = {args.embedding_num_epochs}") - logger.info(f" Total optimization steps = {total_train_steps}") - logger.info(f" Total train batch size = {batch_size}") - - warmup_steps = math.ceil(total_train_steps * args.warmup_proportion) - self.student_model.model_body.fit( - train_objectives=[(train_dataloader, train_loss)], - epochs=args.embedding_num_epochs, - optimizer_params={"lr": args.body_embedding_learning_rate}, - warmup_steps=warmup_steps, - show_progress_bar=args.show_progress_bar, - use_amp=args.use_amp, + cos_sim_matrix = util.cos_sim(x_embd_student, x_embd_student) + + input_data = [InputExample(texts=[text]) for text in x] + data_sampler = ContrastiveDistillationDataset( + input_data, cos_sim_matrix, args.num_iterations, args.sampling_strategy ) + # 
shuffle_sampler = True can be dropped in for further 'randomising' + shuffle_sampler = True if args.sampling_strategy == "unique" else False + batch_size = min(args.embedding_batch_size, len(data_sampler)) + dataloader = DataLoader(data_sampler, batch_size=batch_size, shuffle=shuffle_sampler, drop_last=False) + loss = args.loss(self.model.model_body) + return dataloader, loss, batch_size def train_classifier(self, x_train: List[str], args: Optional[TrainingArguments] = None) -> None: """ diff --git a/tests/test_sampler.py b/tests/test_sampler.py index c3207592..d8d37712 100644 --- a/tests/test_sampler.py +++ b/tests/test_sampler.py @@ -2,7 +2,7 @@ import pytest from sentence_transformers import InputExample -from setfit.sampler import ConstrastiveDataset +from setfit.sampler import ContrastiveDataset @pytest.mark.parametrize( @@ -16,7 +16,7 @@ def test_sentence_pairs_generation(sampling_strategy: str, expected_pos_pairs: i data = [InputExample(texts=[text], label=label) for text, label in zip(sentences, labels)] multilabel = False - data_sampler = ConstrastiveDataset(data, multilabel, sampling_strategy=sampling_strategy) + data_sampler = ContrastiveDataset(data, multilabel, sampling_strategy=sampling_strategy) assert data_sampler.len_pos_pairs == expected_pos_pairs assert data_sampler.len_neg_pairs == expected_neg_pairs @@ -41,7 +41,7 @@ def test_sentence_pairs_generation_multilabel( data = [InputExample(texts=[text], label=label) for text, label in zip(sentences, labels)] multilabel = True - data_sampler = ConstrastiveDataset(data, multilabel, sampling_strategy=sampling_strategy) + data_sampler = ContrastiveDataset(data, multilabel, sampling_strategy=sampling_strategy) assert data_sampler.len_pos_pairs == expected_pos_pairs assert data_sampler.len_neg_pairs == expected_neg_pairs From 38e96070cddff9d6428bc506194722743a44f9f4 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 25 Oct 2023 16:15:22 +0200 Subject: [PATCH 57/77] Set the default logging steps at 50 --- 
src/setfit/training_args.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 6a13ef80..d0be79ec 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -109,7 +109,7 @@ class TrainingArguments: logging_first_step (`bool`, *optional*, defaults to `False`): Whether to log and evaluate the first `global_step` or not. - logging_steps (`int`, *optional*, defaults to 500): + logging_steps (`int`, *optional*, defaults to 50): Number of update steps between two logs if `logging_strategy="steps"`. evaluation_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): The evaluation strategy to adopt during training. Possible values are: @@ -194,7 +194,7 @@ class TrainingArguments: logging_dir: Optional[str] = None logging_strategy: str = "steps" logging_first_step: bool = True - logging_steps: int = 5 + logging_steps: int = 50 evaluation_strategy: str = "no" eval_steps: Optional[int] = None From 4ead15dc4caa71f1a2d4acea23ac744f5c360ad7 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 25 Oct 2023 17:45:52 +0200 Subject: [PATCH 58/77] Add max_steps argument to TrainingArguments --- src/setfit/trainer.py | 6 ++++-- src/setfit/training_args.py | 5 +++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 606c9c37..3bbc1789 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -506,8 +506,10 @@ def _train_sentence_transformer( self.state.epoch = 0 start_time = time.time() - # TODO: Add max_steps via args.max_steps here? 
- self.state.max_steps = len(train_dataloader) * args.embedding_num_epochs + if args.max_steps: + self.state.max_steps = args.max_steps + else: + self.state.max_steps = len(train_dataloader) * args.embedding_num_epochs self.control = self.callback_handler.on_train_begin(args, self.state, self.control) if args.use_amp: diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index d0be79ec..73fd95f8 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -30,6 +30,9 @@ class TrainingArguments: Set the number of epochs the embedding and classifier training phases respectively, or set both if an integer is provided. Note that the number of epochs for the classifier is only used with a differentiable PyTorch head. + max_steps (`int`, *optional*, defaults to `-1`): + If set to a positive number, the total number of training steps to perform. Overrides `num_epochs`. + The training may stop before reaching the set number of steps when all data is exhausted. sampling_strategy (`str`, defaults to `"oversampling"`): The sampling strategy of how to draw pairs in training. 
Possible values are: @@ -161,6 +164,8 @@ class TrainingArguments: embedding_num_epochs: int = None classifier_num_epochs: int = None + max_steps: int = -1 + sampling_strategy: str = "oversampling" num_iterations: Optional[int] = None From eb703363f15ac0636ae32d97e1c7bcc9ff8d1d5e Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Wed, 25 Oct 2023 17:52:08 +0200 Subject: [PATCH 59/77] Change max_steps conditional The old conditional was True with the default -1, not ideal --- src/setfit/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 3bbc1789..0abdd110 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -506,7 +506,7 @@ def _train_sentence_transformer( self.state.epoch = 0 start_time = time.time() - if args.max_steps: + if args.max_steps > 0: self.state.max_steps = args.max_steps else: self.state.max_steps = len(train_dataloader) * args.embedding_num_epochs From 5b39f062d1f3c4b684703af389c88806931b0681 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 11:57:03 +0100 Subject: [PATCH 60/77] Seeds are now correctly applied for reproducibility --- src/setfit/data.py | 2 +- src/setfit/trainer.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/setfit/data.py b/src/setfit/data.py index 7eb36224..2d9cd5f8 100644 --- a/src/setfit/data.py +++ b/src/setfit/data.py @@ -151,7 +151,7 @@ def sample_dataset(dataset: Dataset, label_column: str = "label", num_samples: i df = df.groupby(label_column) # sample num_samples, or at least as much as possible - df = df.apply(lambda x: x.sample(min(num_samples, len(x)))) + df = df.apply(lambda x: x.sample(min(num_samples, len(x)), random_state=seed)) df = df.reset_index(drop=True) all_samples = Dataset.from_pandas(df, features=dataset.features) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 0abdd110..9b1b2333 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -119,6 +119,9 @@ 
def __init__( self.metric_kwargs = metric_kwargs self.column_mapping = column_mapping + # Seed must be set before instantiating the model when using model_init. + set_seed(12) + if model is None: if model_init is not None: model = self.call_model_init() @@ -347,7 +350,8 @@ def train( args = args or self.args or TrainingArguments() - set_seed(args.seed) # Seed must be set before instantiating the model when using model_init. + # Seed must be set before instantiating the model when using model_init. + set_seed(args.seed) if trial: # Trial and model initialization self._hp_search_setup(trial) # sets trainer parameters and initializes model From 7c3feeda54b5be9ee0a3c1c1e7f8dcd2faf8691c Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 13:34:42 +0100 Subject: [PATCH 61/77] Don't scale gradients during evaluation --- src/setfit/trainer.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 9b1b2333..d8828f46 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -601,7 +601,7 @@ def _train_sentence_transformer( self.control = self.log(args, metrics) eval_loss = None - if self.control.should_evaluate and eval_dataloader: + if self.control.should_evaluate and eval_dataloader is not None: eval_loss = self._evaluate_with_loss(model_body, eval_dataloader, args, loss_func) learning_rate = scheduler_obj.get_last_lr()[0] metrics = {"eval_embedding_loss": round(eval_loss, 4), "learning_rate": learning_rate} @@ -654,12 +654,8 @@ def _evaluate_with_loss( loss_func: nn.Module, ) -> float: model_body.eval() - - if args.use_amp: - scaler = torch.cuda.amp.GradScaler() - losses = [] - for data in tqdm(iter(eval_dataloader), leave=False, disable=not args.show_progress_bar): + for data in tqdm(iter(eval_dataloader), total=len(eval_dataloader), leave=False, disable=not args.show_progress_bar): features, labels = data labels = labels.to(model_body._target_device) features = list(map(lambda 
batch: batch_to_device(batch, model_body._target_device), features)) @@ -668,7 +664,7 @@ def _evaluate_with_loss( with autocast(): loss_value = loss_func(features, labels) - losses.append(scaler.scale(loss_value).item()) + losses.append(loss_value.item()) else: losses.append(loss_func(features, labels).item()) From cdc8979e0c641bd2324662a081462f5085900738 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 13:34:59 +0100 Subject: [PATCH 62/77] Use evaluation_strategy="steps" if eval_steps is set --- src/setfit/training_args.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 73fd95f8..5336bf4c 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -13,6 +13,9 @@ from transformers.training_args import default_logdir from transformers.utils import is_torch_available +from . import logging + +logger = logging.get_logger(__name__) @dataclass class TrainingArguments: @@ -257,6 +260,10 @@ def __post_init__(self) -> None: self.logging_strategy = IntervalStrategy(self.logging_strategy) self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy) + if self.eval_steps is not None and self.evaluation_strategy == IntervalStrategy.NO: + logger.info("Using `evaluation_strategy=\"steps\"` as `eval_steps` is defined.") + self.evaluation_strategy = IntervalStrategy.STEPS + # eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero if self.evaluation_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0): if self.logging_steps > 0: From e0401674fde5532f2d0076774da6663527e5620f Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 13:35:42 +0100 Subject: [PATCH 63/77] Run formatting --- src/setfit/sampler.py | 2 +- src/setfit/trainer.py | 6 ++++-- src/setfit/trainer_distillation.py | 4 ++-- src/setfit/training_args.py | 4 +++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git 
a/src/setfit/sampler.py b/src/setfit/sampler.py index 1bea2e78..0eceba1e 100644 --- a/src/setfit/sampler.py +++ b/src/setfit/sampler.py @@ -2,8 +2,8 @@ from typing import Generator, Iterable, List, Optional import numpy as np -from sentence_transformers import InputExample import torch +from sentence_transformers import InputExample from torch.utils.data import IterableDataset from . import logging diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index d8828f46..848796b6 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -374,7 +374,7 @@ def train( parameters.extend(self.dataset_to_parameters(dataset)) self.train_embeddings(*parameters, args=args) - training_parameters = parameters[:len(parameters) // 2] if self.eval_dataset else parameters + training_parameters = parameters[: len(parameters) // 2] if self.eval_dataset else parameters self.train_classifier(*training_parameters, args=args) def dataset_to_parameters(self, dataset: Dataset) -> List[Iterable]: @@ -655,7 +655,9 @@ def _evaluate_with_loss( ) -> float: model_body.eval() losses = [] - for data in tqdm(iter(eval_dataloader), total=len(eval_dataloader), leave=False, disable=not args.show_progress_bar): + for data in tqdm( + iter(eval_dataloader), total=len(eval_dataloader), leave=False, disable=not args.show_progress_bar + ): features, labels = data labels = labels.to(model_body._target_device) features = list(map(lambda batch: batch_to_device(batch, model_body._target_device), features)) diff --git a/src/setfit/trainer_distillation.py b/src/setfit/trainer_distillation.py index 2bc2d629..da5ec4b8 100644 --- a/src/setfit/trainer_distillation.py +++ b/src/setfit/trainer_distillation.py @@ -1,9 +1,9 @@ import warnings from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Optional, Tuple, Union -from datasets import Dataset import torch -from sentence_transformers import losses, util, InputExample +from datasets import Dataset +from sentence_transformers import InputExample, 
losses, util from torch import nn from torch.utils.data import DataLoader diff --git a/src/setfit/training_args.py b/src/setfit/training_args.py index 5336bf4c..9ed24fb7 100644 --- a/src/setfit/training_args.py +++ b/src/setfit/training_args.py @@ -15,8 +15,10 @@ from . import logging + logger = logging.get_logger(__name__) + @dataclass class TrainingArguments: """ @@ -261,7 +263,7 @@ def __post_init__(self) -> None: self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy) if self.eval_steps is not None and self.evaluation_strategy == IntervalStrategy.NO: - logger.info("Using `evaluation_strategy=\"steps\"` as `eval_steps` is defined.") + logger.info('Using `evaluation_strategy="steps"` as `eval_steps` is defined.') self.evaluation_strategy = IntervalStrategy.STEPS # eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero From d2f24896dc8b95ce21686518047a99ff6bf5bdcd Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Thu, 9 Nov 2023 21:22:03 +0100 Subject: [PATCH 64/77] Implement SetFit for ABSA from Intel Labs (#6) * Initial version for SetFit ABSA * Create complete test suite for ABSA (100%,90%,96%) Only push_to_hub is not under test * Run formatting * Allow initializing models with different span_context * Remove resolved TODO * Raise error if args is the wrong type * Update column mapping, allow partial maps * Remove model_init from ABSA Trainer, not used * Split train into train_aspect and train_polarity And reformat * Prefix logs with aspect/polarity when training * Add ABSA-specific model cards * If spaCy doesn't agree with the start/end, just ignore those cases * If there are no aspects, just return * Elaborate on the required columns in the datasets * Add Absa to the WIP docs --- .github/workflows/tests.yml | 2 + docs/source/en/api/main.mdx | 4 + docs/source/en/api/trainer.mdx | 6 +- setup.py | 12 +- src/setfit/__init__.py | 1 + src/setfit/modeling.py | 41 ++-- 
src/setfit/span/__init__.py | 3 + src/setfit/span/aspect_extractor.py | 34 +++ src/setfit/span/model_card_template.md | 64 +++++ src/setfit/span/modeling.py | 292 +++++++++++++++++++++++ src/setfit/span/trainer.py | 316 +++++++++++++++++++++++++ src/setfit/trainer.py | 154 +++++++----- tests/conftest.py | 23 +- tests/span/test_modeling.py | 78 ++++++ tests/span/test_trainer.py | 75 ++++++ tests/test_trainer.py | 32 ++- 16 files changed, 1052 insertions(+), 85 deletions(-) create mode 100644 src/setfit/span/__init__.py create mode 100644 src/setfit/span/aspect_extractor.py create mode 100644 src/setfit/span/model_card_template.md create mode 100644 src/setfit/span/modeling.py create mode 100644 src/setfit/span/trainer.py create mode 100644 tests/span/test_modeling.py create mode 100644 tests/span/test_trainer.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index afdcf1ec..243c1306 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -40,6 +40,8 @@ jobs: run: | python -m pip install --no-cache-dir --upgrade pip python -m pip install --no-cache-dir ${{ matrix.requirements }} + python -m spacy download en_core_web_lg + python -m spacy download en_core_web_sm if: steps.restore-cache.outputs.cache-hit != 'true' - name: Install the checked-out setfit diff --git a/docs/source/en/api/main.mdx b/docs/source/en/api/main.mdx index ac2b77e4..a65b3db4 100644 --- a/docs/source/en/api/main.mdx +++ b/docs/source/en/api/main.mdx @@ -6,3 +6,7 @@ # SetFitHead [[autodoc]] SetFitHead + +# AbsaModel + +[[autodoc]] AbsaModel \ No newline at end of file diff --git a/docs/source/en/api/trainer.mdx b/docs/source/en/api/trainer.mdx index 4b605dc8..3e3d39d1 100644 --- a/docs/source/en/api/trainer.mdx +++ b/docs/source/en/api/trainer.mdx @@ -5,4 +5,8 @@ # DistillationTrainer -[[autodoc]] DistillationTrainer \ No newline at end of file +[[autodoc]] DistillationTrainer + +# AbsaTrainer + +[[autodoc]] AbsaTrainer \ No newline at end of file diff 
--git a/setup.py b/setup.py index dcd5a8ea..7079d145 100644 --- a/setup.py +++ b/setup.py @@ -10,11 +10,18 @@ MAINTAINER_EMAIL = "lewis@huggingface.co" INTEGRATIONS_REQUIRE = ["optuna"] -REQUIRED_PKGS = ["datasets>=2.3.0", "sentence-transformers>=2.2.1", "evaluate>=0.3.0"] +REQUIRED_PKGS = [ + "datasets>=2.3.0", + "sentence-transformers>=2.2.1", + "evaluate>=0.3.0", + "huggingface_hub>=0.11.0", + "scikit-learn", +] +ABSA_REQUIRE = ["spacy"] QUALITY_REQUIRE = ["black", "flake8", "isort", "tabulate"] ONNX_REQUIRE = ["onnxruntime", "onnx", "skl2onnx"] OPENVINO_REQUIRE = ["hummingbird-ml<0.4.9", "openvino==2022.3.0"] -TESTS_REQUIRE = ["pytest", "pytest-cov"] + ONNX_REQUIRE + OPENVINO_REQUIRE +TESTS_REQUIRE = ["pytest", "pytest-cov"] + ONNX_REQUIRE + OPENVINO_REQUIRE + ABSA_REQUIRE DOCS_REQUIRE = ["hf-doc-builder>=0.3.0"] EXTRAS_REQUIRE = { "optuna": INTEGRATIONS_REQUIRE, @@ -23,6 +30,7 @@ "onnx": ONNX_REQUIRE, "openvino": ONNX_REQUIRE + OPENVINO_REQUIRE, "docs": DOCS_REQUIRE, + "absa": ABSA_REQUIRE, } diff --git a/src/setfit/__init__.py b/src/setfit/__init__.py index c36d630d..f131eee0 100644 --- a/src/setfit/__init__.py +++ b/src/setfit/__init__.py @@ -4,6 +4,7 @@ from .data import get_templated_dataset, sample_dataset from .modeling import SetFitHead, SetFitModel +from .span import AbsaModel, AbsaTrainer, AspectExtractor, AspectModel, PolarityModel from .trainer import SetFitTrainer, Trainer from .trainer_distillation import DistillationSetFitTrainer, DistillationTrainer from .training_args import TrainingArguments diff --git a/src/setfit/modeling.py b/src/setfit/modeling.py index 0662d2d3..793b2c72 100644 --- a/src/setfit/modeling.py +++ b/src/setfit/modeling.py @@ -17,6 +17,7 @@ import requests import torch from huggingface_hub import PyTorchModelHubMixin, hf_hub_download +from huggingface_hub.utils import validate_hf_hub_args from sentence_transformers import SentenceTransformer, models from sklearn.linear_model import LogisticRegression from sklearn.multiclass 
import OneVsRestClassifier @@ -74,14 +75,14 @@ ```bibtex @article{{https://doi.org/10.48550/arxiv.2209.11055, -doi = {{10.48550/ARXIV.2209.11055}}, -url = {{https://arxiv.org/abs/2209.11055}}, -author = {{Tunstall, Lewis and Reimers, Nils and Jo, Unso Eun Seo and Bates, Luke and Korat, Daniel and Wasserblat, Moshe and Pereg, Oren}}, -keywords = {{Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences}}, -title = {{Efficient Few-Shot Learning Without Prompts}}, -publisher = {{arXiv}}, -year = {{2022}}, -copyright = {{Creative Commons Attribution 4.0 International}} + doi = {{10.48550/ARXIV.2209.11055}}, + url = {{https://arxiv.org/abs/2209.11055}}, + author = {{Tunstall, Lewis and Reimers, Nils and Jo, Unso Eun Seo and Bates, Luke and Korat, Daniel and Wasserblat, Moshe and Pereg, Oren}}, + keywords = {{Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences}}, + title = {{Efficient Few-Shot Learning Without Prompts}}, + publisher = {{arXiv}}, + year = {{2022}}, + copyright = {{Creative Commons Attribution 4.0 International}} }} ``` """ @@ -246,7 +247,6 @@ class SetFitModel(PyTorchModelHubMixin): model_body: Optional[SentenceTransformer] = (None,) model_head: Optional[Union[SetFitHead, LogisticRegression]] = None multi_target_strategy: Optional[str] = None - l2_weight: float = 1e-2 normalize_embeddings: bool = False @property @@ -372,7 +372,7 @@ def _prepare_optimizer( l2_weight: float, ) -> torch.optim.Optimizer: body_learning_rate = body_learning_rate or head_learning_rate - l2_weight = l2_weight or self.l2_weight + l2_weight = l2_weight or 1e-2 optimizer = torch.optim.AdamW( [ { @@ -519,6 +519,15 @@ def predict_proba( outputs = self.model_head.predict_proba(embeddings) return self._output_type_conversion(outputs, as_numpy=as_numpy) + @property + def device(self) -> torch.device: + """Get the Torch device that this model is on. 
+ + Returns: + torch.device: The device that the model is on. + """ + return self.model_body.device + def to(self, device: Union[str, torch.device]) -> "SetFitModel": """Move this SetFitModel to `device`, and then return `self`. This method does not copy. @@ -589,6 +598,7 @@ def _save_pretrained(self, save_directory: Union[Path, str]) -> None: joblib.dump(self.model_head, str(Path(save_directory) / MODEL_HEAD_NAME)) @classmethod + @validate_hf_hub_args def _from_pretrained( cls, model_id: str, @@ -598,13 +608,13 @@ def _from_pretrained( proxies: Optional[Dict] = None, resume_download: Optional[bool] = None, local_files_only: Optional[bool] = None, - use_auth_token: Optional[Union[bool, str]] = None, + token: Optional[Union[bool, str]] = None, multi_target_strategy: Optional[str] = None, use_differentiable_head: bool = False, normalize_embeddings: bool = False, **model_kwargs, ) -> "SetFitModel": - model_body = SentenceTransformer(model_id, cache_folder=cache_dir, use_auth_token=use_auth_token) + model_body = SentenceTransformer(model_id, cache_folder=cache_dir, use_auth_token=token) target_device = model_body._target_device model_body.to(target_device) # put `model_body` on the target device @@ -628,7 +638,7 @@ def _from_pretrained( force_download=force_download, proxies=proxies, resume_download=resume_download, - use_auth_token=use_auth_token, + token=token, local_files_only=local_files_only, ) except requests.exceptions.RequestException: @@ -641,7 +651,7 @@ def _from_pretrained( if model_head_file is not None: model_head = joblib.load(model_head_file) else: - head_params = model_kwargs.get("head_params", {}) + head_params = model_kwargs.pop("head_params", {}) if use_differentiable_head: if multi_target_strategy is None: use_multitarget = False @@ -677,9 +687,12 @@ def _from_pretrained( else: model_head = clf + # Remove the `transformers` config + model_kwargs.pop("config", None) return cls( model_body=model_body, model_head=model_head, 
multi_target_strategy=multi_target_strategy, normalize_embeddings=normalize_embeddings, + **model_kwargs, ) diff --git a/src/setfit/span/__init__.py b/src/setfit/span/__init__.py new file mode 100644 index 00000000..7fc6f9db --- /dev/null +++ b/src/setfit/span/__init__.py @@ -0,0 +1,3 @@ +from .aspect_extractor import AspectExtractor +from .modeling import AbsaModel, AspectModel, PolarityModel +from .trainer import AbsaTrainer diff --git a/src/setfit/span/aspect_extractor.py b/src/setfit/span/aspect_extractor.py new file mode 100644 index 00000000..096b9bb6 --- /dev/null +++ b/src/setfit/span/aspect_extractor.py @@ -0,0 +1,34 @@ +from typing import TYPE_CHECKING, List, Tuple + + +if TYPE_CHECKING: + from spacy.tokens import Doc + + +class AspectExtractor: + def __init__(self, spacy_model: str) -> None: + super().__init__() + import spacy + + self.nlp = spacy.load(spacy_model) + + def find_groups(self, aspect_mask: List[bool]): + start = None + for idx, flag in enumerate(aspect_mask): + if flag: + if start is None: + start = idx + else: + if start is not None: + yield slice(start, idx) + start = None + if start is not None: + yield slice(start, idx) + + def __call__(self, texts: List[str]) -> Tuple[List["Doc"], List[slice]]: + aspects_list = [] + docs = list(self.nlp.pipe(texts)) + for doc in docs: + aspect_mask = [token.pos_ in ("NOUN", "PROPN") for token in doc] + aspects_list.append(list(self.find_groups(aspect_mask))) + return docs, aspects_list diff --git a/src/setfit/span/model_card_template.md b/src/setfit/span/model_card_template.md new file mode 100644 index 00000000..31ec618f --- /dev/null +++ b/src/setfit/span/model_card_template.md @@ -0,0 +1,64 @@ +--- +license: apache-2.0 +tags: +- setfit +- sentence-transformers +- absa +- token-classification +pipeline_tag: token-classification +--- + +# {{ model_name | default("SetFit ABSA Model", true) }} + +This is a [SetFit ABSA model](https://github.com/huggingface/setfit) that can be used for Aspect Based 
Sentiment Analysis (ABSA). \ +In particular, this model is in charge of {{ "filtering aspect span candidates" if is_aspect else "classifying aspect polarities"}}. +It has been trained using SetFit, an efficient few-shot learning technique that involves: + +1. Fine-tuning a [Sentence Transformer](https://www.sbert.net) with contrastive learning. +2. Training a classification head with features from the fine-tuned Sentence Transformer. + +This model was trained within the context of a larger system for ABSA, which looks like so: + +1. Use a spaCy model to select possible aspect span candidates. +2. {{ "**" if is_aspect else "" }}Use {{ "this" if is_aspect else "a" }} SetFit model to filter these possible aspect span candidates.{{ "**" if is_aspect else "" }} +3. {{ "**" if not is_aspect else "" }}Use {{ "this" if not is_aspect else "a" }} SetFit model to classify the filtered aspect span candidates.{{ "**" if not is_aspect else "" }} + +## Usage + +To use this model for inference, first install the SetFit library: + +```bash +pip install setfit +``` + +You can then run inference as follows: + +```python +from setfit import AbsaModel + +# Download from Hub and run inference +model = AbsaModel.from_pretrained( + "{{ aspect_model }}", + "{{ polarity_model }}", +) +# Run inference +preds = model([ + "The best pizza outside of Italy and really tasty.", + "The food here is great but the service is terrible", +]) +``` + +## BibTeX entry and citation info + +```bibtex +@article{https://doi.org/10.48550/arxiv.2209.11055, + doi = {10.48550/ARXIV.2209.11055}, + url = {https://arxiv.org/abs/2209.11055}, + author = {Tunstall, Lewis and Reimers, Nils and Jo, Unso Eun Seo and Bates, Luke and Korat, Daniel and Wasserblat, Moshe and Pereg, Oren}, + keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {Efficient Few-Shot Learning Without Prompts}, + publisher = {arXiv}, + year = {2022}, + copyright = 
{Creative Commons Attribution 4.0 International} +} +``` \ No newline at end of file diff --git a/src/setfit/span/modeling.py b/src/setfit/span/modeling.py new file mode 100644 index 00000000..02b0b1dd --- /dev/null +++ b/src/setfit/span/modeling.py @@ -0,0 +1,292 @@ +import json +import os +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +import requests +import torch +from huggingface_hub import hf_hub_download +from huggingface_hub.utils import SoftTemporaryDirectory, validate_hf_hub_args +from jinja2 import Environment, FileSystemLoader + +from .. import logging +from ..modeling import SetFitModel +from .aspect_extractor import AspectExtractor + + +if TYPE_CHECKING: + from spacy.tokens import Doc + +logger = logging.get_logger(__name__) + +CONFIG_NAME = "config_span_setfit.json" + + +@dataclass +class SpanSetFitModel(SetFitModel): + span_context: int = 0 + + def prepend_aspects(self, docs: List["Doc"], aspects_list: List[List[slice]]) -> List[str]: + for doc, aspects in zip(docs, aspects_list): + for aspect_slice in aspects: + aspect = doc[max(aspect_slice.start - self.span_context, 0) : aspect_slice.stop + self.span_context] + # TODO: Investigate performance difference of different formats + yield aspect.text + ":" + doc.text + + def __call__(self, docs: List["Doc"], aspects_list: List[List[slice]]) -> List[bool]: + inputs_list = list(self.prepend_aspects(docs, aspects_list)) + preds = self.predict(inputs_list, as_numpy=True) + iter_preds = iter(preds) + return [[next(iter_preds) for _ in aspects] for aspects in aspects_list] + + @classmethod + @validate_hf_hub_args + def _from_pretrained( + cls, + model_id: str, + span_context: Optional[int] = None, + revision: Optional[str] = None, + cache_dir: Optional[str] = None, + force_download: Optional[bool] = None, + proxies: Optional[Dict] = None, + resume_download: Optional[bool] = None, + local_files_only: 
Optional[bool] = None, + token: Optional[Union[bool, str]] = None, + **model_kwargs, + ) -> "SpanSetFitModel": + config_file: Optional[str] = None + if os.path.isdir(model_id): + if CONFIG_NAME in os.listdir(model_id): + config_file = os.path.join(model_id, CONFIG_NAME) + else: + try: + config_file = hf_hub_download( + repo_id=model_id, + filename=CONFIG_NAME, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + token=token, + local_files_only=local_files_only, + ) + except requests.exceptions.RequestException: + pass + + if config_file is not None: + with open(config_file, "r", encoding="utf-8") as f: + config = json.load(f) + model_kwargs.update(config) + + if span_context is not None: + model_kwargs["span_context"] = span_context + + return super(SpanSetFitModel, cls)._from_pretrained( + model_id, + revision, + cache_dir, + force_download, + proxies, + resume_download, + local_files_only, + token, + **model_kwargs, + ) + + def _save_pretrained(self, save_directory: Union[Path, str]) -> None: + path = os.path.join(save_directory, CONFIG_NAME) + with open(path, "w") as f: + json.dump({"span_context": self.span_context}, f, indent=2) + + super()._save_pretrained(save_directory) + + def create_model_card(self, path: str, model_name: Optional[str] = None) -> None: + """Creates and saves a model card for a SetFit model. + + Args: + path (str): The path to save the model card to. + model_name (str, *optional*): The name of the model. Defaults to `SetFit Model`. + """ + if not os.path.exists(path): + os.makedirs(path) + + # If the model_path is a folder that exists locally, i.e. 
when create_model_card is called + # via push_to_hub, and the path is in a temporary folder, then we only take the last two + # directories + model_path = Path(model_name) + if model_path.exists() and Path(tempfile.gettempdir()) in model_path.resolve().parents: + model_name = "/".join(model_path.parts[-2:]) + + environment = Environment(loader=FileSystemLoader(Path(__file__).parent)) + template = environment.get_template("model_card_template.md") + is_aspect = isinstance(self, AspectModel) + aspect_model = "setfit-absa-aspect" + polarity_model = "setfit-absa-polarity" + if model_name is not None: + if is_aspect: + aspect_model = model_name + if model_name.endswith("-aspect"): + polarity_model = model_name[: -len("-aspect")] + "-polarity" + else: + polarity_model = model_name + if model_name.endswith("-polarity"): + aspect_model = model_name[: -len("-polarity")] + "-aspect" + + model_card_content = template.render( + model_name=model_name, is_aspect=is_aspect, aspect_model=aspect_model, polarity_model=polarity_model + ) + with open(os.path.join(path, "README.md"), "w", encoding="utf-8") as f: + f.write(model_card_content) + + +class AspectModel(SpanSetFitModel): + # TODO: Assumes binary SetFitModel with 0 == no aspect, 1 == aspect + def __call__(self, docs: List["Doc"], aspects_list: List[List[slice]]) -> List[bool]: + sentence_preds = super().__call__(docs, aspects_list) + return [ + [aspect for aspect, pred in zip(aspects, preds) if pred == 1] + for aspects, preds in zip(aspects_list, sentence_preds) + ] + + +@dataclass +class PolarityModel(SpanSetFitModel): + span_context: int = 3 + + +@dataclass +class AbsaModel: + aspect_extractor: AspectExtractor + aspect_model: AspectModel + polarity_model: PolarityModel + + def predict(self, inputs: Union[str, List[str]]) -> List[Dict[str, Any]]: + is_str = isinstance(inputs, str) + inputs_list = [inputs] if is_str else inputs + docs, aspects_list = self.aspect_extractor(inputs_list) + if sum(aspects_list, []) == []: + 
return aspects_list + + aspects_list = self.aspect_model(docs, aspects_list) + if sum(aspects_list, []) == []: + return aspects_list + + polarity_list = self.polarity_model(docs, aspects_list) + outputs = [] + for docs, aspects, polarities in zip(docs, aspects_list, polarity_list): + outputs.append( + [ + {"span": docs[aspect_slice].text, "polarity": polarity} + for aspect_slice, polarity in zip(aspects, polarities) + ] + ) + return outputs if not is_str else outputs[0] + + @property + def device(self) -> torch.device: + return self.aspect_model.device + + def to(self, device: Union[str, torch.device]) -> "AbsaModel": + self.aspect_model.to(device) + self.polarity_model.to(device) + + def __call__(self, inputs: Union[str, List[str]]) -> List[Dict[str, Any]]: + return self.predict(inputs) + + def save_pretrained( + self, + save_directory: Union[str, Path], + polarity_save_directory: Optional[Union[str, Path]] = None, + push_to_hub: bool = False, + **kwargs, + ) -> None: + if polarity_save_directory is None: + base_save_directory = Path(save_directory) + save_directory = base_save_directory.parent / (base_save_directory.name + "-aspect") + polarity_save_directory = base_save_directory.parent / (base_save_directory.name + "-polarity") + self.aspect_model.save_pretrained(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) + self.polarity_model.save_pretrained(save_directory=polarity_save_directory, push_to_hub=push_to_hub, **kwargs) + + @classmethod + def from_pretrained( + cls, + model_id: str, + polarity_model_id: Optional[str] = None, + spacy_model: Optional[str] = "en_core_web_lg", + span_contexts: Tuple[Optional[int], Optional[int]] = (None, None), + force_download: bool = False, + resume_download: bool = False, + proxies: Optional[Dict] = None, + token: Optional[Union[str, bool]] = None, + cache_dir: Optional[str] = None, + local_files_only: bool = False, + use_differentiable_head: bool = False, + normalize_embeddings: bool = False, + 
**model_kwargs, + ) -> "AbsaModel": + revision = None + if len(model_id.split("@")) == 2: + model_id, revision = model_id.split("@") + aspect_model = AspectModel.from_pretrained( + model_id, + span_context=span_contexts[0], + revision=revision, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + token=token, + cache_dir=cache_dir, + local_files_only=local_files_only, + use_differentiable_head=use_differentiable_head, + normalize_embeddings=normalize_embeddings, + **model_kwargs, + ) + if polarity_model_id: + model_id = polarity_model_id + revision = None + if len(model_id.split("@")) == 2: + model_id, revision = model_id.split("@") + polarity_model = PolarityModel.from_pretrained( + model_id, + span_context=span_contexts[1], + revision=revision, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + token=token, + cache_dir=cache_dir, + local_files_only=local_files_only, + use_differentiable_head=use_differentiable_head, + normalize_embeddings=normalize_embeddings, + **model_kwargs, + ) + + aspect_extractor = AspectExtractor(spacy_model=spacy_model) + + return cls(aspect_extractor, aspect_model, polarity_model) + + def push_to_hub(self, repo_id: str, polarity_repo_id: Optional[str] = None, **kwargs) -> None: + if "/" not in repo_id: + raise ValueError( + '`repo_id` must be a full repository ID, including organisation, e.g. "tomaarsen/setfit-absa-restaurant".' + ) + if polarity_repo_id is not None and "/" not in polarity_repo_id: + raise ValueError( + '`polarity_repo_id` must be a full repository ID, including organisation, e.g. "tomaarsen/setfit-absa-restaurant".' 
+ ) + commit_message = kwargs.pop("commit_message", "Add SetFit ABSA model") + + # Push the files to the repo in a single commit + with SoftTemporaryDirectory() as tmp_dir: + save_directory = Path(tmp_dir) / repo_id + polarity_save_directory = None if polarity_repo_id is None else Path(tmp_dir) / polarity_repo_id + self.save_pretrained( + save_directory=save_directory, + polarity_save_directory=polarity_save_directory, + push_to_hub=True, + commit_message=commit_message, + **kwargs, + ) diff --git a/src/setfit/span/trainer.py b/src/setfit/span/trainer.py new file mode 100644 index 00000000..477cddf8 --- /dev/null +++ b/src/setfit/span/trainer.py @@ -0,0 +1,316 @@ +from collections import defaultdict +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import optuna +from datasets import Dataset +from transformers.trainer_callback import TrainerCallback + +from setfit.span.modeling import AbsaModel, AspectModel, PolarityModel, SpanSetFitModel +from setfit.training_args import TrainingArguments + +from .. import logging +from ..trainer import ColumnMappingMixin, Trainer + + +logger = logging.get_logger(__name__) + + +class AbsaTrainer(ColumnMappingMixin): + """Trainer to train a SetFit ABSA model. + + Args: + model (`AbsaModel`): + The AbsaModel model to train. + args (`TrainingArguments`, *optional*): + The training arguments to use. If `polarity_args` is not defined, then `args` is used for both + the aspect and the polarity model. + polarity_args (`TrainingArguments`, *optional*): + The training arguments to use for the polarity model. If not defined, `args` is used for both + the aspect and the polarity model. + train_dataset (`Dataset`): + The training dataset. The dataset must have "text", "span", "label" and "ordinal" columns. + eval_dataset (`Dataset`, *optional*): + The evaluation dataset. The dataset must have "text", "span", "label" and "ordinal" columns. 
+ metric (`str` or `Callable`, *optional*, defaults to `"accuracy"`): + The metric to use for evaluation. If a string is provided, we treat it as the metric + name and load it with default settings. + If a callable is provided, it must take two arguments (`y_pred`, `y_test`). + metric_kwargs (`Dict[str, Any]`, *optional*): + Keyword arguments passed to the evaluation function if `metric` is an evaluation string like "f1". + For example useful for providing an averaging strategy for computing f1 in a multi-label setting. + callbacks: (`List[~transformers.TrainerCallback]`, *optional*): + A list of callbacks to customize the training loop. Will add those to the list of default callbacks + detailed in [here](https://huggingface.co/docs/transformers/main/en/main_classes/callback). + If you want to remove one of the default callbacks used, use the `Trainer.remove_callback()` method. + column_mapping (`Dict[str, str]`, *optional*): + A mapping from the column names in the dataset to the column names expected by the model. + The expected format is a dictionary with the following format: + `{"text_column_name": "text", "span_column_name": "span", "label_column_name": "label", "ordinal_column_name": "ordinal"}`. 
+ """ + + _REQUIRED_COLUMNS = {"text", "span", "label", "ordinal"} + + def __init__( + self, + model: AbsaModel, + args: Optional[TrainingArguments] = None, + polarity_args: Optional[TrainingArguments] = None, + train_dataset: Optional["Dataset"] = None, + eval_dataset: Optional["Dataset"] = None, + metric: Union[str, Callable[["Dataset", "Dataset"], Dict[str, float]]] = "accuracy", + metric_kwargs: Optional[Dict[str, Any]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + column_mapping: Optional[Dict[str, str]] = None, + ) -> None: + self.model = model + self.aspect_extractor = model.aspect_extractor + + if train_dataset is not None and column_mapping: + train_dataset = self._apply_column_mapping(train_dataset, column_mapping) + aspect_train_dataset, polarity_train_dataset = self.preprocess_dataset( + model.aspect_model, model.polarity_model, train_dataset + ) + if eval_dataset is not None and column_mapping: + eval_dataset = self._apply_column_mapping(eval_dataset, column_mapping) + aspect_eval_dataset, polarity_eval_dataset = self.preprocess_dataset( + model.aspect_model, model.polarity_model, eval_dataset + ) + + self.aspect_trainer = Trainer( + model.aspect_model, + args=args, + train_dataset=aspect_train_dataset, + eval_dataset=aspect_eval_dataset, + metric=metric, + metric_kwargs=metric_kwargs, + callbacks=callbacks, + ) + self.aspect_trainer._set_logs_mapper( + {"eval_embedding_loss": "eval_aspect_embedding_loss", "embedding_loss": "aspect_embedding_loss"} + ) + self.polarity_trainer = Trainer( + model.polarity_model, + args=polarity_args or args, + train_dataset=polarity_train_dataset, + eval_dataset=polarity_eval_dataset, + metric=metric, + metric_kwargs=metric_kwargs, + callbacks=callbacks, + ) + self.polarity_trainer._set_logs_mapper( + {"eval_embedding_loss": "eval_polarity_embedding_loss", "embedding_loss": "polarity_embedding_loss"} + ) + + def preprocess_dataset( + self, aspect_model: AspectModel, polarity_model: PolarityModel, 
dataset: Dataset + ) -> Dataset: + if dataset is None: + return dataset, dataset + + # Group by "text" + grouped_data = defaultdict(list) + for sample in dataset: + text = sample.pop("text") + grouped_data[text].append(sample) + + def index_ordinal(text: str, target: str, ordinal: int) -> Tuple[int, int]: + find_from = 0 + for _ in range(ordinal + 1): + start_idx = text.index(target, find_from) + find_from = start_idx + 1 + return start_idx, start_idx + len(target) + + docs, aspects_list = self.aspect_extractor(grouped_data.keys()) + intersected_aspect_list = [] + polarity_labels = [] + aspect_labels = [] + for doc, aspects, text in zip(docs, aspects_list, grouped_data): + gold_aspects = [] + gold_polarity_labels = [] + for annotation in grouped_data[text]: + try: + start, end = index_ordinal(text, annotation["span"], annotation["ordinal"]) + except ValueError: + logger.info( + f"The ordinal of {annotation['ordinal']} for span {annotation['span']!r} in {text!r} is too high. " + "Skipping this sample." + ) + continue + + gold_aspect_span = doc.char_span(start, end) + if gold_aspect_span is None: + continue + gold_aspects.append(slice(gold_aspect_span.start, gold_aspect_span.end)) + gold_polarity_labels.append(annotation["label"]) + + # The Aspect model uses all predicted aspects, with labels depending on whether + # the predicted aspects are indeed true/gold aspects. + aspect_labels.extend([aspect in gold_aspects for aspect in aspects]) + + # The Polarity model uses the intersection of pred and gold aspects, with labels for the gold label. 
+ intersected_aspects = [] + for gold_aspect, gold_label in zip(gold_aspects, gold_polarity_labels): + if gold_aspect in aspects: + intersected_aspects.append(gold_aspect) + polarity_labels.append(gold_label) + intersected_aspect_list.append(intersected_aspects) + + aspect_texts = list(aspect_model.prepend_aspects(docs, aspects_list)) + polarity_texts = list(polarity_model.prepend_aspects(docs, intersected_aspect_list)) + return Dataset.from_dict({"text": aspect_texts, "label": aspect_labels}), Dataset.from_dict( + {"text": polarity_texts, "label": polarity_labels} + ) + + def train( + self, + args: Optional[TrainingArguments] = None, + polarity_args: Optional[TrainingArguments] = None, + trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, + **kwargs, + ) -> None: + """ + Main training entry point. + + Args: + args (`TrainingArguments`, *optional*): + Temporarily change the aspect training arguments for this training call. + polarity_args (`TrainingArguments`, *optional*): + Temporarily change the polarity training arguments for this training call. + trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): + The trial run or the hyperparameter dictionary for hyperparameter search. + """ + self.train_aspect(args=args, trial=trial, **kwargs) + self.train_polarity(args=polarity_args, trial=trial, **kwargs) + + def train_aspect( + self, + args: Optional[TrainingArguments] = None, + trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, + **kwargs, + ) -> None: + """ + Train the aspect model only. + + Args: + args (`TrainingArguments`, *optional*): + Temporarily change the aspect training arguments for this training call. + trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): + The trial run or the hyperparameter dictionary for hyperparameter search. 
+ """ + self.aspect_trainer.train(args=args, trial=trial, **kwargs) + + def train_polarity( + self, + args: Optional[TrainingArguments] = None, + trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, + **kwargs, + ) -> None: + """ + Train the polarity model only. + + Args: + args (`TrainingArguments`, *optional*): + Temporarily change the aspect training arguments for this training call. + trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): + The trial run or the hyperparameter dictionary for hyperparameter search. + """ + self.polarity_trainer.train(args=args, trial=trial, **kwargs) + + def add_callback(self, callback): + """ + Add a callback to the current list of [`~transformer.TrainerCallback`]. + + Args: + callback (`type` or [`~transformer.TrainerCallback`]): + A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the + first case, will instantiate a member of that class. + """ + self.aspect_trainer.add_callback(callback) + self.polarity_trainer.add_callback(callback) + + def pop_callback(self, callback): + """ + Remove a callback from the current list of [`~transformer.TrainerCallback`] and returns it. + + If the callback is not found, returns `None` (and no error is raised). + + Args: + callback (`type` or [`~transformer.TrainerCallback`]): + A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the + first case, will pop the first member of that class found in the list of callbacks. + + Returns: + [`Tuple[~transformer.TrainerCallback]`]: The callbacks removed from the aspect and polarity trainers, if found. + """ + return self.aspect_trainer.pop_callback(callback), self.polarity_trainer.pop_callback(callback) + + def remove_callback(self, callback): + """ + Remove a callback from the current list of [`~transformer.TrainerCallback`]. 
+ + Args: + callback (`type` or [`~transformer.TrainerCallback`]): + A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the + first case, will remove the first member of that class found in the list of callbacks. + """ + self.aspect_trainer.remove_callback(callback) + self.polarity_trainer.remove_callback(callback) + + def push_to_hub(self, repo_id: str, polarity_repo_id: Optional[str] = None, **kwargs) -> None: + """Upload model checkpoint to the Hub using `huggingface_hub`. + + See the full list of parameters for your `huggingface_hub` version in the\ + [huggingface_hub documentation](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.ModelHubMixin.push_to_hub). + + Args: + repo_id (`str`): + The full repository ID to push the aspect model to, e.g. `"tomaarsen/setfit-aspect"`. + polarity_repo_id (`str`, *optional*): + The full repository ID to push the polarity model to, e.g. `"tomaarsen/setfit-polarity"`. + config (`dict`, *optional*): + Configuration object to be saved alongside the model weights. + commit_message (`str`, *optional*): + Message to commit while pushing. + private (`bool`, *optional*, defaults to `False`): + Whether the repository created should be private. + api_endpoint (`str`, *optional*): + The API endpoint to use when pushing the model to the hub. + token (`str`, *optional*): + The token to use as HTTP bearer authorization for remote files. + If not set, will use the token set when logging in with + `transformers-cli login` (stored in `~/.huggingface`). + branch (`str`, *optional*): + The git branch on which to push the model. This defaults to + the default branch as specified in your repository, which + defaults to `"main"`. + create_pr (`boolean`, *optional*): + Whether or not to create a Pull Request from `branch` with that commit. + Defaults to `False`. + allow_patterns (`List[str]` or `str`, *optional*): + If provided, only files matching at least one pattern are pushed. 
+ ignore_patterns (`List[str]` or `str`, *optional*): + If provided, files matching any of the patterns are not pushed. + """ + return self.model.push_to_hub(repo_id=repo_id, polarity_repo_id=polarity_repo_id, **kwargs) + + def evaluate(self, dataset: Optional[Dataset] = None) -> Dict[str, Dict[str, float]]: + """ + Computes the metrics for a given classifier. + + Args: + dataset (`Dataset`, *optional*): + The dataset to compute the metrics on. If not provided, will use the evaluation dataset passed via + the `eval_dataset` argument at `Trainer` initialization. + + Returns: + `Dict[str, Dict[str, float]]`: The evaluation metrics. + """ + aspect_eval_dataset = polarity_eval_dataset = None + if dataset: + aspect_eval_dataset, polarity_eval_dataset = self.preprocess_dataset( + self.model.aspect_model, self.model.polarity_model, dataset + ) + return { + "aspect": self.aspect_trainer.evaluate(aspect_eval_dataset), + "polarity": self.polarity_trainer.evaluate(polarity_eval_dataset), + } diff --git a/src/setfit/trainer.py b/src/setfit/trainer.py index 848796b6..baecd053 100644 --- a/src/setfit/trainer.py +++ b/src/setfit/trainer.py @@ -12,6 +12,7 @@ from sentence_transformers.datasets import SentenceLabelDataset from sentence_transformers.losses.BatchHardTripletLoss import BatchHardTripletLossDistanceFunction from sentence_transformers.util import batch_to_device +from sklearn.preprocessing import LabelEncoder from torch import nn from torch.cuda.amp import autocast from torch.utils.data import DataLoader @@ -68,7 +69,70 @@ DEFAULT_PROGRESS_CALLBACK = NotebookProgressCallback -class Trainer: +class ColumnMappingMixin: + _REQUIRED_COLUMNS = {"text", "label"} + + def _validate_column_mapping(self, dataset: "Dataset") -> None: + """ + Validates the provided column mapping against the dataset. 
+ """ + column_names = set(dataset.column_names) + if self.column_mapping is None and not self._REQUIRED_COLUMNS.issubset(column_names): + # Issue #226: load_dataset will automatically assign points to "train" if no split is specified + if column_names == {"train"} and isinstance(dataset, DatasetDict): + raise ValueError( + "SetFit expected a Dataset, but it got a DatasetDict with the split ['train']. " + "Did you mean to select the training split with dataset['train']?" + ) + elif isinstance(dataset, DatasetDict): + raise ValueError( + f"SetFit expected a Dataset, but it got a DatasetDict with the splits {sorted(column_names)}. " + "Did you mean to select one of these splits from the dataset?" + ) + else: + raise ValueError( + f"SetFit expected the dataset to have the columns {sorted(self._REQUIRED_COLUMNS)}, " + f"but only the columns {sorted(column_names)} were found. " + "Either make sure these columns are present, or specify which columns to use with column_mapping in Trainer." + ) + if self.column_mapping is not None: + missing_columns = self._REQUIRED_COLUMNS.difference(self.column_mapping.values()) + if missing_columns: + raise ValueError( + f"The following columns are missing from the column mapping: {missing_columns}. Please provide a mapping for all required columns." + ) + if not set(self.column_mapping.keys()).issubset(column_names): + raise ValueError( + f"The column mapping expected the columns {sorted(self.column_mapping.keys())} in the dataset, " + f"but the dataset had the columns {sorted(column_names)}." + ) + + def _apply_column_mapping(self, dataset: "Dataset", column_mapping: Dict[str, str]) -> "Dataset": + """ + Applies the provided column mapping to the dataset, renaming columns accordingly. + Extra features not in the column mapping are prefixed with `"feat_"`. 
+ """ + dataset = dataset.rename_columns( + { + **column_mapping, + **{ + col: f"feat_{col}" + for col in dataset.column_names + if col not in column_mapping and col not in self._REQUIRED_COLUMNS + }, + } + ) + dset_format = dataset.format + dataset = dataset.with_format( + type=dset_format["type"], + columns=dataset.column_names, + output_all_columns=dset_format["output_all_columns"], + **dset_format["format_kwargs"], + ) + return dataset + + +class Trainer(ColumnMappingMixin): """Trainer to train a SetFit model. Args: @@ -91,14 +155,16 @@ class Trainer: metric_kwargs (`Dict[str, Any]`, *optional*): Keyword arguments passed to the evaluation function if `metric` is an evaluation string like "f1". For example useful for providing an averaging strategy for computing f1 in a multi-label setting. + callbacks: (`List[~transformers.TrainerCallback]`, *optional*): + A list of callbacks to customize the training loop. Will add those to the list of default callbacks + detailed in [here](https://huggingface.co/docs/transformers/main/en/main_classes/callback). + If you want to remove one of the default callbacks used, use the `Trainer.remove_callback()` method. column_mapping (`Dict[str, str]`, *optional*): A mapping from the column names in the dataset to the column names expected by the model. The expected format is a dictionary with the following format: `{"text_column_name": "text", "label_column_name: "label"}`. 
""" - _REQUIRED_COLUMNS = {"text", "label"} - def __init__( self, model: Optional["SetFitModel"] = None, @@ -111,6 +177,8 @@ def __init__( callbacks: Optional[List[TrainerCallback]] = None, column_mapping: Optional[Dict[str, str]] = None, ) -> None: + if args is not None and not isinstance(args, TrainingArguments): + raise ValueError("`args` must be a `TrainingArguments` instance imported from `setfit`.") self.args = args or TrainingArguments() self.train_dataset = train_dataset self.eval_dataset = eval_dataset @@ -118,6 +186,7 @@ def __init__( self.metric = metric self.metric_kwargs = metric_kwargs self.column_mapping = column_mapping + self.logs_mapper = {} # Seed must be set before instantiating the model when using model_init. set_seed(12) @@ -184,61 +253,6 @@ def remove_callback(self, callback): """ self.callback_handler.remove_callback(callback) - def _validate_column_mapping(self, dataset: "Dataset") -> None: - """ - Validates the provided column mapping against the dataset. - """ - column_names = set(dataset.column_names) - if self.column_mapping is None and not self._REQUIRED_COLUMNS.issubset(column_names): - # Issue #226: load_dataset will automatically assign points to "train" if no split is specified - if column_names == {"train"} and isinstance(dataset, DatasetDict): - raise ValueError( - "SetFit expected a Dataset, but it got a DatasetDict with the split ['train']. " - "Did you mean to select the training split with dataset['train']?" - ) - elif isinstance(dataset, DatasetDict): - raise ValueError( - f"SetFit expected a Dataset, but it got a DatasetDict with the splits {sorted(column_names)}. " - "Did you mean to select one of these splits from the dataset?" - ) - else: - raise ValueError( - f"SetFit expected the dataset to have the columns {sorted(self._REQUIRED_COLUMNS)}, " - f"but only the columns {sorted(column_names)} were found. " - "Either make sure these columns are present, or specify which columns to use with column_mapping in Trainer." 
- ) - if self.column_mapping is not None: - missing_columns = self._REQUIRED_COLUMNS.difference(self.column_mapping.values()) - if missing_columns: - raise ValueError( - f"The following columns are missing from the column mapping: {missing_columns}. Please provide a mapping for all required columns." - ) - if not set(self.column_mapping.keys()).issubset(column_names): - raise ValueError( - f"The column mapping expected the columns {sorted(self.column_mapping.keys())} in the dataset, " - f"but the dataset had the columns {sorted(column_names)}." - ) - - def _apply_column_mapping(self, dataset: "Dataset", column_mapping: Dict[str, str]) -> "Dataset": - """ - Applies the provided column mapping to the dataset, renaming columns accordingly. - Extra features not in the column mapping are prefixed with `"feat_"`. - """ - dataset = dataset.rename_columns( - { - **column_mapping, - **{col: f"feat_{col}" for col in dataset.column_names if col not in column_mapping}, - } - ) - dset_format = dataset.format - dataset = dataset.with_format( - type=dset_format["type"], - columns=dataset.column_names, - output_all_columns=dset_format["output_all_columns"], - **dset_format["format_kwargs"], - ) - return dataset - def apply_hyperparameters(self, params: Dict[str, Any], final_model: bool = False) -> None: """Applies a dictionary of hyperparameters to both the trainer and the model @@ -329,7 +343,7 @@ def train( args: Optional[TrainingArguments] = None, trial: Optional[Union["optuna.Trial", Dict[str, Any]]] = None, **kwargs, - ): + ) -> None: """ Main training entry point. @@ -478,6 +492,7 @@ def log(self, args: TrainingArguments, logs: Dict[str, float]) -> None: logs (`Dict[str, float]`): The values to log. 
""" + logs = {self.logs_mapper.get(key, key): value for key, value in logs.items()} if self.state.epoch is not None: logs["epoch"] = round(self.state.epoch, 2) @@ -485,6 +500,14 @@ def log(self, args: TrainingArguments, logs: Dict[str, float]) -> None: self.state.log_history.append(output) return self.callback_handler.on_log(args, self.state, self.control, logs) + def _set_logs_mapper(self, logs_mapper: Dict[str, str]) -> None: + """Set the logging mapper. + + Args: + logs_mapper (str): The logging mapper, e.g. {"eval_embedding_loss": "eval_aspect_embedding_loss"}. + """ + self.logs_mapper = logs_mapper + def _train_sentence_transformer( self, model_body: SentenceTransformer, @@ -732,6 +755,8 @@ def evaluate(self, dataset: Optional[Dataset] = None) -> Dict[str, float]: """ eval_dataset = dataset or self.eval_dataset + if eval_dataset is None: + raise ValueError("No evaluation dataset provided to `Trainer.evaluate` nor the `Trainer` initialzation.") self._validate_column_mapping(eval_dataset) if self.column_mapping is not None: @@ -746,6 +771,13 @@ def evaluate(self, dataset: Optional[Dataset] = None) -> Dict[str, float]: if isinstance(y_pred, torch.Tensor): y_pred = y_pred.cpu() + # Normalize string outputs + if y_test and isinstance(y_test[0], str): + encoder = LabelEncoder() + encoder.fit(list(y_test) + list(y_pred)) + y_test = encoder.transform(y_test) + y_pred = encoder.transform(y_pred) + if isinstance(self.metric, str): metric_config = "multilabel" if self.model.multi_target_strategy is not None else None metric_fn = evaluate.load(self.metric, config_name=metric_config) @@ -843,7 +875,7 @@ def push_to_hub(self, repo_id: str, **kwargs) -> str: Args: repo_id (`str`): - The full repository ID to push to, e.g. `"tomaarsen/setfit_sst2"`. + The full repository ID to push to, e.g. `"tomaarsen/setfit-sst2"`. config (`dict`, *optional*): Configuration object to be saved alongside the model weights. 
commit_message (`str`, *optional*): @@ -873,7 +905,7 @@ def push_to_hub(self, repo_id: str, **kwargs) -> str: """ if "/" not in repo_id: raise ValueError( - '`repo_id` must be a full repository ID, including organisation, e.g. "tomaarsen/setfit_sst2".' + '`repo_id` must be a full repository ID, including organisation, e.g. "tomaarsen/setfit-sst2".' ) commit_message = kwargs.pop("commit_message", "Add SetFit model") return self.model.push_to_hub(repo_id, commit_message=commit_message, **kwargs) diff --git a/tests/conftest.py b/tests/conftest.py index acf5b825..11051223 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,29 @@ import pytest +from datasets import Dataset -from setfit import SetFitModel +from setfit import AbsaModel, SetFitModel @pytest.fixture() def model() -> SetFitModel: return SetFitModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") + + +@pytest.fixture() +def absa_model() -> AbsaModel: + return AbsaModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") + + +@pytest.fixture() +def absa_dataset() -> Dataset: + texts = [ + "It is about food and ambiance, and imagine how dreadful it will be it we only had to listen to an idle engine.", + "It is about food and ambiance, and imagine how dreadful it will be it we only had to listen to an idle engine.", + "Food is great and inexpensive.", + "Good bagels and good cream cheese.", + "Good bagels and good cream cheese.", + ] + spans = ["food", "ambiance", "Food", "bagels", "cream cheese"] + labels = ["negative", "negative", "positive", "positive", "positive"] + ordinals = [0, 0, 0, 0, 0] + return Dataset.from_dict({"text": texts, "span": spans, "label": labels, "ordinal": ordinals}) diff --git a/tests/span/test_modeling.py b/tests/span/test_modeling.py new file mode 100644 index 00000000..02fd7c3e --- /dev/null +++ b/tests/span/test_modeling.py @@ -0,0 +1,78 @@ +from pathlib import Path +from tempfile import TemporaryDirectory + +import pytest +import 
torch + +from setfit import AbsaModel +from setfit.span.aspect_extractor import AspectExtractor +from setfit.span.modeling import AspectModel, PolarityModel + + +def test_loading(): + model = AbsaModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") + assert isinstance(model, AbsaModel) + assert isinstance(model.aspect_extractor, AspectExtractor) + assert isinstance(model.aspect_model, AspectModel) + assert isinstance(model.polarity_model, PolarityModel) + + model = AbsaModel.from_pretrained( + "sentence-transformers/paraphrase-albert-small-v2@6c91e73a51599e35bd1145dfdcd3289215225009", + "sentence-transformers/paraphrase-albert-small-v2", + ) + assert isinstance(model, AbsaModel) + + model = AbsaModel.from_pretrained( + "sentence-transformers/paraphrase-albert-small-v2", + "sentence-transformers/paraphrase-albert-small-v2@6c91e73a51599e35bd1145dfdcd3289215225009", + ) + assert isinstance(model, AbsaModel) + + with pytest.raises(OSError): + model = AbsaModel.from_pretrained( + "sentence-transformers/paraphrase-albert-small-v2", spacy_model="not_a_spacy_model" + ) + + model = AbsaModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", normalize_embeddings=True) + assert model.aspect_model.normalize_embeddings + assert model.polarity_model.normalize_embeddings + + aspect_model = AspectModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", span_context=12) + assert aspect_model.span_context == 12 + polarity_model = PolarityModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", span_context=12) + assert polarity_model.span_context == 12 + + model = AbsaModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", span_contexts=(12, None)) + assert model.aspect_model.span_context == 12 + assert model.polarity_model.span_context == 3 # <- default + + +def test_save_load(absa_model: AbsaModel) -> None: + absa_model.polarity_model.span_context = 5 + + with TemporaryDirectory() as 
tmp_dir: + tmp_dir = str(Path(tmp_dir) / "model") + absa_model.save_pretrained(tmp_dir) + assert (Path(tmp_dir + "-aspect") / "config_span_setfit.json").exists() + assert (Path(tmp_dir + "-polarity") / "config_span_setfit.json").exists() + + fresh_model = AbsaModel.from_pretrained(tmp_dir + "-aspect", tmp_dir + "-polarity") + assert fresh_model.polarity_model.span_context == 5 + + with TemporaryDirectory() as aspect_tmp_dir: + with TemporaryDirectory() as polarity_tmp_dir: + absa_model.save_pretrained(aspect_tmp_dir, polarity_tmp_dir) + assert (Path(aspect_tmp_dir) / "config_span_setfit.json").exists() + assert (Path(polarity_tmp_dir) / "config_span_setfit.json").exists() + + fresh_model = AbsaModel.from_pretrained(aspect_tmp_dir, polarity_tmp_dir) + assert fresh_model.polarity_model.span_context == 5 + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA must be available to move a model between devices") +def test_to(absa_model: AbsaModel) -> None: + assert absa_model.device.type == "cuda" + absa_model.to("cpu") + assert absa_model.device.type == "cpu" + assert absa_model.aspect_model.device.type == "cpu" + assert absa_model.polarity_model.device.type == "cpu" diff --git a/tests/span/test_trainer.py b/tests/span/test_trainer.py new file mode 100644 index 00000000..f89044dc --- /dev/null +++ b/tests/span/test_trainer.py @@ -0,0 +1,75 @@ +from datasets import Dataset +from transformers import TrainerCallback + +from setfit import AbsaTrainer +from setfit.span.modeling import AbsaModel + + +def test_trainer(absa_model: AbsaModel, absa_dataset: Dataset) -> None: + trainer = AbsaTrainer(absa_model, train_dataset=absa_dataset, eval_dataset=absa_dataset) + trainer.train() + + metrics = trainer.evaluate() + assert "aspect" in metrics + assert "polarity" in metrics + assert "accuracy" in metrics["aspect"] + assert "accuracy" in metrics["polarity"] + assert metrics["aspect"]["accuracy"] > 0.0 + assert metrics["polarity"]["accuracy"] > 0.0 + new_metrics = 
trainer.evaluate(absa_dataset) + assert metrics == new_metrics + + predict = absa_model.predict("Best pizza outside of Italy and really tasty.") + assert {"span": "pizza", "polarity": "positive"} in predict + predict = absa_model.predict(["Best pizza outside of Italy and really tasty.", "This is another sentence"]) + assert isinstance(predict, list) and len(predict) == 2 and isinstance(predict[0], list) + predict = absa_model(["Best pizza outside of Italy and really tasty.", "This is another sentence"]) + assert isinstance(predict, list) and len(predict) == 2 and isinstance(predict[0], list) + + +def test_trainer_callbacks(absa_model: AbsaModel) -> None: + trainer = AbsaTrainer(absa_model) + assert len(trainer.aspect_trainer.callback_handler.callbacks) >= 2 + callback_names = {callback.__class__.__name__ for callback in trainer.aspect_trainer.callback_handler.callbacks} + assert {"DefaultFlowCallback", "ProgressCallback"} <= callback_names + + class TestCallback(TrainerCallback): + pass + + callback = TestCallback() + trainer.add_callback(callback) + assert len(trainer.aspect_trainer.callback_handler.callbacks) == len(callback_names) + 1 + assert len(trainer.polarity_trainer.callback_handler.callbacks) == len(callback_names) + 1 + assert trainer.aspect_trainer.callback_handler.callbacks[-1] == callback + assert trainer.polarity_trainer.callback_handler.callbacks[-1] == callback + + assert trainer.pop_callback(callback) == (callback, callback) + trainer.add_callback(callback) + assert trainer.aspect_trainer.callback_handler.callbacks[-1] == callback + assert trainer.polarity_trainer.callback_handler.callbacks[-1] == callback + trainer.remove_callback(callback) + assert callback not in trainer.aspect_trainer.callback_handler.callbacks + assert callback not in trainer.polarity_trainer.callback_handler.callbacks + + +def test_train_ordinal_too_high(absa_model: AbsaModel) -> None: + absa_dataset = Dataset.from_dict( + { + "text": [ + "It is about food and ambiance, and 
imagine how dreadful it will be it we only had to listen to an idle engine." + ], + "span": ["food"], + "label": ["negative"], + "ordinal": [1], + } + ) + AbsaTrainer(absa_model, train_dataset=absa_dataset) + # TODO: Capture warning and test against it. + + +def test_train_column_mapping(absa_model: AbsaModel, absa_dataset: Dataset) -> None: + absa_dataset = absa_dataset.rename_columns({"text": "sentence", "span": "aspect"}) + trainer = AbsaTrainer( + absa_model, train_dataset=absa_dataset, column_mapping={"sentence": "text", "aspect": "span"} + ) + trainer.train() diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 2c699ea2..8eee4d57 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -1,15 +1,17 @@ import os -import pathlib import re import tempfile +from pathlib import Path from unittest import TestCase import evaluate import pytest import torch from datasets import Dataset, load_dataset +from pytest import LogCaptureFixture from sentence_transformers import losses from transformers import TrainerCallback +from transformers import TrainingArguments as TransformersTrainingArguments from transformers.testing_utils import require_optuna from transformers.utils.hp_naming import TrialShortNamer @@ -132,7 +134,7 @@ def test_trainer_raises_error_when_dataset_not_split(self): def test_trainer_raises_error_when_dataset_is_dataset_dict_with_train(self): """Verify that a useful error is raised if we pass an unsplit dataset with only a `train` split to the trainer.""" with tempfile.TemporaryDirectory() as tmpdirname: - path = pathlib.Path(tmpdirname) / "test_dataset_dict_with_train.csv" + path = Path(tmpdirname) / "test_dataset_dict_with_train.csv" path.write_text("label,text\n1,good\n0,terrible\n") dataset = load_dataset("csv", data_files=str(path)) trainer = Trainer(model=self.model, args=self.args, train_dataset=dataset, eval_dataset=dataset) @@ -534,20 +536,20 @@ def test_trainer_warn_freeze(model: SetFitModel): trainer.freeze() -def 
test_train_with_kwargs(model: SetFitModel): +def test_train_with_kwargs(model: SetFitModel) -> None: train_dataset = Dataset.from_dict({"text": ["positive sentence", "negative sentence"], "label": [1, 0]}) trainer = Trainer(model, train_dataset=train_dataset) with pytest.warns(DeprecationWarning, match="`Trainer.train` does not accept keyword arguments anymore."): trainer.train(num_epochs=5) -def test_train_no_dataset(model: SetFitModel): +def test_train_no_dataset(model: SetFitModel) -> None: trainer = Trainer(model) with pytest.raises(ValueError, match="Training requires a `train_dataset` given to the `Trainer` initialization."): trainer.train() -def test_train_amp_save(model: SetFitModel, tmp_path): +def test_train_amp_save(model: SetFitModel, tmp_path: Path) -> None: args = TrainingArguments(output_dir=tmp_path, use_amp=True, save_steps=5, num_epochs=5) dataset = Dataset.from_dict({"text": ["a", "b", "c"], "label": [0, 1, 2]}) trainer = Trainer(model, args=args, train_dataset=dataset, eval_dataset=dataset) @@ -556,7 +558,7 @@ def test_train_amp_save(model: SetFitModel, tmp_path): assert os.listdir(tmp_path) == ["step_5"] -def test_train_load_best(model: SetFitModel, tmp_path, caplog): +def test_train_load_best(model: SetFitModel, tmp_path: Path, caplog: LogCaptureFixture) -> None: args = TrainingArguments( output_dir=tmp_path, save_steps=5, @@ -571,3 +573,21 @@ def test_train_load_best(model: SetFitModel, tmp_path, caplog): trainer.train() assert any("Load pretrained SentenceTransformer" in text for _, _, text in caplog.record_tuples) + + +def test_evaluate_with_strings(model: SetFitModel) -> None: + dataset = Dataset.from_dict({"text": ["a", "b", "c"], "label": ["positive", "positive", "negative"]}) + trainer = Trainer(model, train_dataset=dataset, eval_dataset=dataset) + trainer.train() + metrics = trainer.evaluate() + assert "accuracy" in metrics + + +def test_trainer_wrong_args(model: SetFitModel, tmp_path: Path) -> None: + args = 
TransformersTrainingArguments(output_dir=tmp_path) + dataset = Dataset.from_dict({"text": ["a", "b", "c"], "label": [0, 1, 2]}) + expected = "`args` must be a `TrainingArguments` instance imported from `setfit`." + with pytest.raises(ValueError, match=expected): + Trainer(model, args=args) + with pytest.raises(ValueError, match=expected): + Trainer(model, dataset) From 5c4569db8b6f1954781080cbe4785ece1ebb424d Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 21:32:32 +0100 Subject: [PATCH 65/77] Import optuna under TYPE_CHECKING --- src/setfit/span/trainer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/setfit/span/trainer.py b/src/setfit/span/trainer.py index 477cddf8..f5647322 100644 --- a/src/setfit/span/trainer.py +++ b/src/setfit/span/trainer.py @@ -1,7 +1,6 @@ from collections import defaultdict -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union -import optuna from datasets import Dataset from transformers.trainer_callback import TrainerCallback @@ -12,6 +11,9 @@ from ..trainer import ColumnMappingMixin, Trainer +if TYPE_CHECKING: + import optuna + logger = logging.get_logger(__name__) From ceeb7256ae44be2b2a8ca8fa4c0c3c58c251a0d5 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 21:39:48 +0100 Subject: [PATCH 66/77] Remove unused import, reformat --- src/setfit/span/trainer.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/setfit/span/trainer.py b/src/setfit/span/trainer.py index f5647322..1d362616 100644 --- a/src/setfit/span/trainer.py +++ b/src/setfit/span/trainer.py @@ -4,7 +4,7 @@ from datasets import Dataset from transformers.trainer_callback import TrainerCallback -from setfit.span.modeling import AbsaModel, AspectModel, PolarityModel, SpanSetFitModel +from setfit.span.modeling import AbsaModel, AspectModel, PolarityModel from setfit.training_args import 
TrainingArguments from .. import logging @@ -88,7 +88,10 @@ def __init__( callbacks=callbacks, ) self.aspect_trainer._set_logs_mapper( - {"eval_embedding_loss": "eval_aspect_embedding_loss", "embedding_loss": "aspect_embedding_loss"} + { + "eval_embedding_loss": "eval_aspect_embedding_loss", + "embedding_loss": "aspect_embedding_loss", + } ) self.polarity_trainer = Trainer( model.polarity_model, @@ -100,7 +103,10 @@ def __init__( callbacks=callbacks, ) self.polarity_trainer._set_logs_mapper( - {"eval_embedding_loss": "eval_polarity_embedding_loss", "embedding_loss": "polarity_embedding_loss"} + { + "eval_embedding_loss": "eval_polarity_embedding_loss", + "embedding_loss": "polarity_embedding_loss", + } ) def preprocess_dataset( From 5c669b5add5370cd1273389539b012c1a5e8a58f Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 21:42:17 +0100 Subject: [PATCH 67/77] Add MANIFEST.in with model_card_template --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..69617566 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include src/setfit/span/model_card_template.md \ No newline at end of file From 8e201e5d4ef50e0980f284fa90238bcb62b80257 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 21:43:21 +0100 Subject: [PATCH 68/77] Don't require transformers TrainingArgs in tests As it requires accelerate to be updated --- tests/test_trainer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 8eee4d57..c5654524 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -11,7 +11,6 @@ from pytest import LogCaptureFixture from sentence_transformers import losses from transformers import TrainerCallback -from transformers import TrainingArguments as TransformersTrainingArguments from transformers.testing_utils import require_optuna from transformers.utils.hp_naming import TrialShortNamer @@ 
-584,10 +583,7 @@ def test_evaluate_with_strings(model: SetFitModel) -> None: def test_trainer_wrong_args(model: SetFitModel, tmp_path: Path) -> None: - args = TransformersTrainingArguments(output_dir=tmp_path) dataset = Dataset.from_dict({"text": ["a", "b", "c"], "label": [0, 1, 2]}) expected = "`args` must be a `TrainingArguments` instance imported from `setfit`." - with pytest.raises(ValueError, match=expected): - Trainer(model, args=args) with pytest.raises(ValueError, match=expected): Trainer(model, dataset) From 6ae5045a186465348a58994b249a57d5cf5d5441 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 21:47:20 +0100 Subject: [PATCH 69/77] Update URLs in setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7079d145..c3ea792c 100644 --- a/setup.py +++ b/setup.py @@ -54,8 +54,8 @@ def combine_requirements(base_keys): long_description_content_type="text/markdown", maintainer=MAINTAINER, maintainer_email=MAINTAINER_EMAIL, - url="https://github.com/SetFit/setfit", - download_url="https://github.com/SetFit/setfit/tags", + url="https://github.com/huggingface/setfit", + download_url="https://github.com/huggingface/setfit/tags", license="Apache 2.0", package_dir={"": "src"}, packages=find_packages("src"), From ecaabb47d923630e1db4deb06ab6e75ba885bce2 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 21:57:16 +0100 Subject: [PATCH 70/77] Increase min hf_hub version to 0.12.0 for SoftTemporaryDirectory --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c3ea792c..d7918a99 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ "datasets>=2.3.0", "sentence-transformers>=2.2.1", "evaluate>=0.3.0", - "huggingface_hub>=0.11.0", + "huggingface_hub>=0.12.0", "scikit-learn", ] ABSA_REQUIRE = ["spacy"] From 4e79397679b14bdc2742614f03e33416682cf301 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 22:04:02 +0100 Subject: 
[PATCH 71/77] Include MANIFEST.in data via `include_package_data=True` --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index d7918a99..3d1f8433 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,7 @@ def combine_requirements(base_keys): license="Apache 2.0", package_dir={"": "src"}, packages=find_packages("src"), + include_package_data=True, install_requires=REQUIRED_PKGS, extras_require=EXTRAS_REQUIRE, classifiers=[ From 65aff3215095b9f0576ca2ff0627b488e964cd52 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 22:29:19 +0100 Subject: [PATCH 72/77] Use kwargs instead of args in super call --- src/setfit/span/modeling.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/setfit/span/modeling.py b/src/setfit/span/modeling.py index 02b0b1dd..f25a72c1 100644 --- a/src/setfit/span/modeling.py +++ b/src/setfit/span/modeling.py @@ -85,14 +85,14 @@ def _from_pretrained( model_kwargs["span_context"] = span_context return super(SpanSetFitModel, cls)._from_pretrained( - model_id, - revision, - cache_dir, - force_download, - proxies, - resume_download, - local_files_only, - token, + model_id=model_id, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, **model_kwargs, ) From eeeac55c07126a39099a70e51d4fdb202cc4ed29 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Thu, 9 Nov 2023 22:39:00 +0100 Subject: [PATCH 73/77] Use v0.13.0 as min. 
version as huggingface/huggingface_hub#1315 solved an issue that was causing failures --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3d1f8433..bdc32252 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ "datasets>=2.3.0", "sentence-transformers>=2.2.1", "evaluate>=0.3.0", - "huggingface_hub>=0.12.0", + "huggingface_hub>=0.13.0", "scikit-learn", ] ABSA_REQUIRE = ["spacy"] From 3214f1bbad2692249748393f177f8e15e86ed5d6 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 10 Nov 2023 11:30:08 +0100 Subject: [PATCH 74/77] Use en_core_web_sm for tests --- tests/conftest.py | 2 +- tests/span/test_modeling.py | 26 +++++++++++++++++++------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 11051223..f92a81d8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,7 +11,7 @@ def model() -> SetFitModel: @pytest.fixture() def absa_model() -> AbsaModel: - return AbsaModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") + return AbsaModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", spacy_model="en_core_web_sm") @pytest.fixture() diff --git a/tests/span/test_modeling.py b/tests/span/test_modeling.py index 02fd7c3e..4a2caec0 100644 --- a/tests/span/test_modeling.py +++ b/tests/span/test_modeling.py @@ -10,7 +10,7 @@ def test_loading(): - model = AbsaModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2") + model = AbsaModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", spacy_model="en_core_web_sm") assert isinstance(model, AbsaModel) assert isinstance(model.aspect_extractor, AspectExtractor) assert isinstance(model.aspect_model, AspectModel) @@ -19,12 +19,14 @@ def test_loading(): model = AbsaModel.from_pretrained( "sentence-transformers/paraphrase-albert-small-v2@6c91e73a51599e35bd1145dfdcd3289215225009", "sentence-transformers/paraphrase-albert-small-v2", + 
spacy_model="en_core_web_sm", ) assert isinstance(model, AbsaModel) model = AbsaModel.from_pretrained( "sentence-transformers/paraphrase-albert-small-v2", "sentence-transformers/paraphrase-albert-small-v2@6c91e73a51599e35bd1145dfdcd3289215225009", + spacy_model="en_core_web_sm", ) assert isinstance(model, AbsaModel) @@ -33,16 +35,24 @@ def test_loading(): "sentence-transformers/paraphrase-albert-small-v2", spacy_model="not_a_spacy_model" ) - model = AbsaModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", normalize_embeddings=True) + model = AbsaModel.from_pretrained( + "sentence-transformers/paraphrase-albert-small-v2", spacy_model="en_core_web_sm", normalize_embeddings=True + ) assert model.aspect_model.normalize_embeddings assert model.polarity_model.normalize_embeddings - aspect_model = AspectModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", span_context=12) + aspect_model = AspectModel.from_pretrained( + "sentence-transformers/paraphrase-albert-small-v2", spacy_model="en_core_web_sm", span_context=12 + ) assert aspect_model.span_context == 12 - polarity_model = PolarityModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", span_context=12) + polarity_model = PolarityModel.from_pretrained( + "sentence-transformers/paraphrase-albert-small-v2", spacy_model="en_core_web_sm", span_context=12 + ) assert polarity_model.span_context == 12 - model = AbsaModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", span_contexts=(12, None)) + model = AbsaModel.from_pretrained( + "sentence-transformers/paraphrase-albert-small-v2", spacy_model="en_core_web_sm", span_contexts=(12, None) + ) assert model.aspect_model.span_context == 12 assert model.polarity_model.span_context == 3 # <- default @@ -56,7 +66,9 @@ def test_save_load(absa_model: AbsaModel) -> None: assert (Path(tmp_dir + "-aspect") / "config_span_setfit.json").exists() assert (Path(tmp_dir + "-polarity") / 
"config_span_setfit.json").exists() - fresh_model = AbsaModel.from_pretrained(tmp_dir + "-aspect", tmp_dir + "-polarity") + fresh_model = AbsaModel.from_pretrained( + tmp_dir + "-aspect", tmp_dir + "-polarity", spacy_model="en_core_web_sm" + ) assert fresh_model.polarity_model.span_context == 5 with TemporaryDirectory() as aspect_tmp_dir: @@ -65,7 +77,7 @@ def test_save_load(absa_model: AbsaModel) -> None: assert (Path(aspect_tmp_dir) / "config_span_setfit.json").exists() assert (Path(polarity_tmp_dir) / "config_span_setfit.json").exists() - fresh_model = AbsaModel.from_pretrained(aspect_tmp_dir, polarity_tmp_dir) + fresh_model = AbsaModel.from_pretrained(aspect_tmp_dir, polarity_tmp_dir, spacy_model="en_core_web_sm") assert fresh_model.polarity_model.span_context == 5 From 2b78bb05dea12ba4f827c9b082393c7b3e0b061d Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 10 Nov 2023 11:39:01 +0100 Subject: [PATCH 75/77] Remove incorrect spacy_model from AspectModel/PolarityModel --- tests/span/test_modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/span/test_modeling.py b/tests/span/test_modeling.py index 4a2caec0..a2998e97 100644 --- a/tests/span/test_modeling.py +++ b/tests/span/test_modeling.py @@ -42,11 +42,11 @@ def test_loading(): assert model.polarity_model.normalize_embeddings aspect_model = AspectModel.from_pretrained( - "sentence-transformers/paraphrase-albert-small-v2", spacy_model="en_core_web_sm", span_context=12 + "sentence-transformers/paraphrase-albert-small-v2", span_context=12 ) assert aspect_model.span_context == 12 polarity_model = PolarityModel.from_pretrained( - "sentence-transformers/paraphrase-albert-small-v2", spacy_model="en_core_web_sm", span_context=12 + "sentence-transformers/paraphrase-albert-small-v2", span_context=12 ) assert polarity_model.span_context == 12 From b68f655dae56bc9ddf86426eb1a9966dd6ea2339 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 10 Nov 2023 11:55:57 +0100 Subject: [PATCH 
76/77] Rerun formatting --- tests/span/test_modeling.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/span/test_modeling.py b/tests/span/test_modeling.py index a2998e97..0bc3ccb8 100644 --- a/tests/span/test_modeling.py +++ b/tests/span/test_modeling.py @@ -41,13 +41,9 @@ def test_loading(): assert model.aspect_model.normalize_embeddings assert model.polarity_model.normalize_embeddings - aspect_model = AspectModel.from_pretrained( - "sentence-transformers/paraphrase-albert-small-v2", span_context=12 - ) + aspect_model = AspectModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", span_context=12) assert aspect_model.span_context == 12 - polarity_model = PolarityModel.from_pretrained( - "sentence-transformers/paraphrase-albert-small-v2", span_context=12 - ) + polarity_model = PolarityModel.from_pretrained("sentence-transformers/paraphrase-albert-small-v2", span_context=12) assert polarity_model.span_context == 12 model = AbsaModel.from_pretrained( From d85f0d97cf75a3c5d84219787e39e87fd87fd99a Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 10 Nov 2023 12:14:23 +0100 Subject: [PATCH 77/77] Run CI on pre branch & workflow dispatch --- .github/workflows/quality.yml | 3 +++ .github/workflows/tests.yml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 9ced4d45..b3cdcd6b 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -5,9 +5,12 @@ on: branches: - main - v*-release + - v*-pre pull_request: branches: - main + - v*-pre + workflow_dispatch: jobs: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 243c1306..45dccb7f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -5,9 +5,12 @@ on: branches: - main - v*-release + - v*-pre pull_request: branches: - main + - v*-pre + workflow_dispatch: jobs: