diff --git a/docs/source/dpo_trainer.mdx b/docs/source/dpo_trainer.mdx index dd267be3e69..75d46d42933 100644 --- a/docs/source/dpo_trainer.mdx +++ b/docs/source/dpo_trainer.mdx @@ -109,8 +109,6 @@ The [cDPO](https://ericmitchell.ai/cdpo.pdf) is a tweak on the DPO loss where we The [Robust DPO](https://arxiv.org/abs/2403.00409) authors propose an unbiased estimate of the DPO loss that is robust to preference noise in the data. Like in cDPO, assume that the preference labels are noisy with some probability that can be passed to the `DPOTrainer` via `label_smoothing` argument (between 0 and 0.5). Use `loss_type="robust"` to the trainer to use it. -The [KTO](https://arxiv.org/abs/2402.01306) authors directly maximize the utility of LLM generations instead of the log-likelihood of preferences. To use preference data with KTO, we recommend breaking up the n preferences into 2n examples and using [`KTOTrainer`](kto_trainer) (i.e., treating the data like an unpaired feedback dataset). Although it is possible to pass in `loss_type="kto_pair"` into DPOTrainer, this is a highly simplified version of KTO that we *do not recommend* in most cases. Please use [`KTOTrainer`](kto_trainer) when possible. - The [BCO](https://arxiv.org/abs/2404.04656) authors train a binary classifier whose logit serves as a reward so that the classifier maps {prompt, chosen completion} pairs to 1 and {prompt, rejected completion} pairs to 0. The `DPOTrainer` can be switched to this loss via the `loss_type="bco_pair"` argument. The [SPPO](https://arxiv.org/abs/2405.00675) authors claim that SPPO is capable of solving the Nash equilibrium iteratively by pushing the chosen rewards to be as large as 1/2 and the rejected rewards to be as small as -1/2 and can alleviate data sparsity issues. The implementation using loss_type="sppo_hard" approximates this algorithm by employing hard label probabilities, assigning 1 to the winner and 0 to the loser. 
@@ -121,7 +119,7 @@ The [TR-DPO](https://arxiv.org/pdf/2404.09656) paper suggests syncing the refere The [RPO](https://arxiv.org/abs/2404.19733) paper implements an iterative preference tuning algorithm using a loss related to the RPO loss in this [paper](https://arxiv.org/abs/2405.16436) that essentially consists of the SFT loss on the chosen preferences together with a weighted DPO loss. To use this loss set the `rpo_alpha` in the `DPOConfig` to an appropriate value. -The [AOT](https://arxiv.org/abs/2406.05882) authors propose to use Distributional Preference Alignment Via Optimal Transport. Traditionally, the alignment algorithms use paired preferences at a sample level, which does not ensure alignment on the distributional level. AOT, on the other hand, can align LLMs on paired or unpaired preference data by making the reward distribution of the positive samples stochastically dominant in the first order on the distribution of negative samples. Specifically, `loss_type="aot"` is appropriate for paired datasets, where each prompt has both chosen and rejected responses; `loss_type="aot_pair"` is for unpaired datasets. Note that `loss_type="aot_pair"` is similar in spirit to `loss_type="kto_pair"` that applies unpaired alignment methodology on paired dataset. In a nutshell, `loss_type="aot"` ensures that the log-likelihood ratio of chosen to rejected of the aligned model has higher quantiles than that ratio for the reference model. `loss_type="aot_pair"` ensures that the chosen reward is higher on all quantiles than the rejected reward. Note that in both cases quantiles are obtained via sorting. To fully leverage the advantages of the AOT algorithm, it is important to maximize the per-GPU batch size. +The [AOT](https://arxiv.org/abs/2406.05882) authors propose to use Distributional Preference Alignment Via Optimal Transport. 
Traditionally, the alignment algorithms use paired preferences at a sample level, which does not ensure alignment on the distributional level. AOT, on the other hand, can align LLMs on paired or unpaired preference data by making the reward distribution of the positive samples stochastically dominant in the first order on the distribution of negative samples. Specifically, `loss_type="aot"` is appropriate for paired datasets, where each prompt has both chosen and rejected responses; `loss_type="aot_pair"` is for unpaired datasets. In a nutshell, `loss_type="aot"` ensures that the log-likelihood ratio of chosen to rejected of the aligned model has higher quantiles than that ratio for the reference model. `loss_type="aot_pair"` ensures that the chosen reward is higher on all quantiles than the rejected reward. Note that in both cases quantiles are obtained via sorting. To fully leverage the advantages of the AOT algorithm, it is important to maximize the per-GPU batch size. ## Logging diff --git a/tests/slow/testing_constants.py b/tests/slow/testing_constants.py index 94cdf4baac1..40051ce1cc4 100644 --- a/tests/slow/testing_constants.py +++ b/tests/slow/testing_constants.py @@ -23,5 +23,5 @@ GRADIENT_CHECKPOINTING_KWARGS = [None, {"use_reentrant": False}, {"use_reentrant": True}] DEVICE_MAP_OPTIONS = [{"": 0}, "auto"] -DPO_LOSS_TYPES = ["sigmoid", "ipo", "kto_pair"] +DPO_LOSS_TYPES = ["sigmoid", "ipo"] DPO_PRECOMPUTE_LOGITS = [True, False] diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py index 8fabbe708f2..e047c17cdb2 100644 --- a/tests/test_dpo_trainer.py +++ b/tests/test_dpo_trainer.py @@ -86,8 +86,6 @@ def _init_dummy_dataset(self): ["t5", "hinge", False], ["gpt2", "ipo", False], ["t5", "ipo", True], - ["gpt2", "kto_pair", True], - ["t5", "kto_pair", False], ["gpt2", "aot_pair", True], ["t5", "aot_pair", False], ["gpt2", "aot", True], @@ -506,10 +504,6 @@ def test_dpo_lora_bf16_autocast_llama(self): ["gpt2", "ipo", False, True], ["gpt2", "ipo", 
True, False], ["gpt2", "ipo", True, True], - ["gpt2", "kto_pair", False, False], - ["gpt2", "kto_pair", False, True], - ["gpt2", "kto_pair", True, False], - ["gpt2", "kto_pair", True, True], ["gpt2", "aot_pair", False, False], ["gpt2", "aot_pair", False, True], ["gpt2", "aot_pair", True, False], diff --git a/trl/trainer/cpo_config.py b/trl/trainer/cpo_config.py index 51032609836..d231fa83ccd 100644 --- a/trl/trainer/cpo_config.py +++ b/trl/trainer/cpo_config.py @@ -66,7 +66,7 @@ class CPOConfig(TrainingArguments): beta: float = 0.1 label_smoothing: float = 0 - loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair", "simpo"] = "sigmoid" + loss_type: Literal["sigmoid", "hinge", "ipo", "simpo"] = "sigmoid" disable_dropout: bool = True simpo_gamma: float = 0.5 @@ -79,3 +79,8 @@ class CPOConfig(TrainingArguments): model_init_kwargs: Optional[Dict] = None dataset_num_proc: Optional[int] = None + + def __post_init__(self): + if self.loss_type == "kto_pair": + raise ValueError("Support for kto_pair has been removed in CPOTrainer. Please use KTOTrainer.") + return super().__post_init__() diff --git a/trl/trainer/cpo_trainer.py b/trl/trainer/cpo_trainer.py index 4bb155ee8af..e0c42f0f6c5 100644 --- a/trl/trainer/cpo_trainer.py +++ b/trl/trainer/cpo_trainer.py @@ -259,10 +259,12 @@ def make_inputs_require_grad(module, input, output): self.max_target_length = max_target_length self.tokenizer = tokenizer - if args.loss_type in ["hinge", "ipo", "kto_pair"] and args.label_smoothing > 0: + if args.loss_type in ["hinge", "ipo"] and args.label_smoothing > 0: warnings.warn( "You are using a loss type that does not support label smoothing. Ignoring label_smoothing parameter." ) + if args.loss_type == "kto_pair": + raise ValueError("Support for kto_pair has been removed in CPOTrainer. Please use KTOTrainer.") self.beta = args.beta self.label_smoothing = args.label_smoothing @@ -610,7 +612,7 @@ def cpo_loss( losses = (logits - 1 / (2 * self.beta)) ** 2 else: raise ValueError( - f"Unknown loss type: {self.loss_type}. 
Should be one of ['sigmoid', 'hinge', 'ipo', 'simpo']" + f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'simpo']" ) chosen_rewards = self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach() diff --git a/trl/trainer/dpo_config.py b/trl/trainer/dpo_config.py index 540bb814eb5..88b8b2d1fbd 100644 --- a/trl/trainer/dpo_config.py +++ b/trl/trainer/dpo_config.py @@ -29,7 +29,7 @@ class DPOConfig(TrainingArguments): The robust DPO label smoothing parameter from the [cDPO](https://ericmitchell.ai/cdpo.pdf) report and [Robust DPO](https://arxiv.org/abs/2403.00409) paper that should be between 0 and 0.5. loss_type (`str`, defaults to `"sigmoid"`): The type of DPO loss to use. Either `"sigmoid"` the default DPO loss,`"hinge"` loss from [SLiC](https://arxiv.org/abs/2305.10425) paper, `"ipo"` from [IPO](https://arxiv.org/abs/2310.12036) paper, - `"kto_pair"` from the HALOs [report](https://github.com/ContextualAI/HALOs/blob/main/assets/report.pdf), `"bco_pair"` from [BCO](https://arxiv.org/abs/2404.04656) paper or `"robust"` from [Robust DPO](https://arxiv.org/abs/2403.00409) paper, + `"bco_pair"` from [BCO](https://arxiv.org/abs/2404.04656) paper or `"robust"` from [Robust DPO](https://arxiv.org/abs/2403.00409) paper, `"aot"` and `"aot_pair"` from the [AOT](https://arxiv.org/abs/2406.05882) paper on alignment via optimal transport. label_pad_token_id (`int`, defaults to `-100`): The label pad token id. This argument is required if you want to use the default data collator. 
@@ -79,7 +79,7 @@ class DPOConfig(TrainingArguments): beta: float = 0.1 label_smoothing: float = 0 loss_type: Literal[ - "sigmoid", "hinge", "ipo", "kto_pair", "bco_pair", "sppo_hard", "nca_pair", "robust", "aot", "aot_pair" + "sigmoid", "hinge", "ipo", "bco_pair", "sppo_hard", "nca_pair", "robust", "aot", "aot_pair" ] = "sigmoid" label_pad_token_id: int = -100 padding_value: Optional[int] = None @@ -102,3 +102,8 @@ class DPOConfig(TrainingArguments): ref_model_mixup_alpha: float = 0.9 ref_model_sync_steps: int = 64 rpo_alpha: Optional[float] = None + + def __post_init__(self): + if self.loss_type == "kto_pair": + raise ValueError("Support for kto_pair has been removed in DPOTrainer. Please use KTOTrainer.") + return super().__post_init__() diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py index 17006925998..7f51d800535 100644 --- a/trl/trainer/dpo_trainer.py +++ b/trl/trainer/dpo_trainer.py @@ -135,7 +135,7 @@ def __init__( ref_model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, beta: float = 0.1, label_smoothing: float = 0, - loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair", "bco_pair", "robust", "aot", "aot_pair"] = "sigmoid", + loss_type: Literal["sigmoid", "hinge", "ipo", "bco_pair", "robust", "aot", "aot_pair"] = "sigmoid", args: Optional[DPOConfig] = None, data_collator: Optional[DataCollator] = None, label_pad_token_id: int = -100, @@ -463,10 +463,12 @@ def make_inputs_require_grad(module, input, output): "You passed `label_smoothing` to the DPOTrainer, the value you passed will override the one in the `DPOConfig`." ) args.label_smoothing = label_smoothing - if args.loss_type in ["hinge", "ipo", "kto_pair", "bco_pair"] and args.label_smoothing > 0: + if args.loss_type in ["hinge", "ipo", "bco_pair"] and args.label_smoothing > 0: warnings.warn( "You are using a loss type that does not support label smoothing. Ignoring label_smoothing parameter." 
) + if args.loss_type == "kto_pair": + raise ValueError("Support for kto_pair has been removed in DPOTrainer. Please use KTOTrainer.") if beta != 0.1: warnings.warn( @@ -1024,21 +1026,6 @@ def dpo_loss( elif self.loss_type == "ipo": # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, denoted by tau in the paper. losses = (logits - 1 / (2 * self.beta)) ** 2 - elif self.loss_type == "kto_pair": - # eqn (7) of the HALOs paper - chosen_KL = (policy_chosen_logps - reference_chosen_logps).mean().clamp(min=0) - rejected_KL = (policy_rejected_logps - reference_rejected_logps).mean().clamp(min=0) - - chosen_logratios = policy_chosen_logps - reference_chosen_logps - rejected_logratios = policy_rejected_logps - reference_rejected_logps - # As described in the KTO report, the KL term for chosen (rejected) is estimated using the rejected (chosen) half. - losses = torch.cat( - ( - 1 - F.sigmoid(self.beta * (chosen_logratios - rejected_KL)), - 1 - F.sigmoid(self.beta * (chosen_KL - rejected_logratios)), - ), - 0, - ) elif self.loss_type == "bco_pair": chosen_logratios = policy_chosen_logps - reference_chosen_logps rejected_logratios = policy_rejected_logps - reference_rejected_logps @@ -1096,7 +1083,7 @@ def dpo_loss( else: raise ValueError( - f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'kto_pair', 'bco_pair', 'sppo_hard', 'nca_pair', 'robust']" + f"Unknown loss type: {self.loss_type}. 
Should be one of ['sigmoid', 'hinge', 'ipo', 'bco_pair', 'sppo_hard', 'nca_pair', 'robust']" ) chosen_rewards = ( diff --git a/trl/trainer/kto_trainer.py b/trl/trainer/kto_trainer.py index 770a34f63e0..8c8c01b48d1 100644 --- a/trl/trainer/kto_trainer.py +++ b/trl/trainer/kto_trainer.py @@ -639,24 +639,15 @@ def make_inputs_require_grad(module, input, output): # merge the datasets eval_dataset = concatenate_datasets([eval_dataset, eval_kl_dataset], axis=1) - desirable = train_dataset.filter( - lambda x: x["label"], num_proc=args.dataset_num_proc, desc="Filtering desirable examples" - ) - undesirable = train_dataset.filter( - lambda x: not x["label"], num_proc=args.dataset_num_proc, desc="Filtering undesirable examples" - ) - - if len(desirable) == 0: - raise ValueError("The set of desirable completions cannot be empty.") - elif len(undesirable) == 0: - raise ValueError("The set of undesirable completions cannot be empty.") + num_desirable = max(sum(train_dataset["label"]), 1) + num_undesirable = max(len(train_dataset["label"]) - num_desirable, 1) # "label" is binary - if len(desirable) != len(undesirable): + if num_desirable != num_undesirable: # The lower and upper bounds come from Eq. 
(8) of https://arxiv.org/abs/2402.01306 - des_weight_lower_bound = round((len(undesirable) * self.undesirable_weight / len(desirable)) * 1, 2) - des_weight_upper_bound = round((len(undesirable) * self.undesirable_weight / len(desirable)) * 1.33, 2) - und_weight_lower_bound = round((len(desirable) * self.desirable_weight / len(undesirable)) / 1.33, 2) - und_weight_upper_bound = round((len(desirable) * self.desirable_weight / len(undesirable)) / 1, 2) + des_weight_lower_bound = round((num_undesirable * self.undesirable_weight / num_desirable) * 1, 2) + des_weight_upper_bound = round((num_undesirable * self.undesirable_weight / num_desirable) * 1.33, 2) + und_weight_lower_bound = round((num_desirable * self.desirable_weight / num_undesirable) / 1.33, 2) + und_weight_upper_bound = round((num_desirable * self.desirable_weight / num_undesirable) / 1, 2) des_weight_in_range = des_weight_lower_bound <= self.desirable_weight <= des_weight_upper_bound und_weight_in_range = und_weight_lower_bound <= self.undesirable_weight <= und_weight_upper_bound @@ -673,6 +664,13 @@ def make_inputs_require_grad(module, input, output): ) if self.loss_type == "bco": + desirable = train_dataset.filter( + lambda x: x["label"], num_proc=args.dataset_num_proc, desc="Filtering desirable examples" + ) + undesirable = train_dataset.filter( + lambda x: not x["label"], num_proc=args.dataset_num_proc, desc="Filtering undesirable examples" + ) + desirable = desirable.shuffle(seed=args.data_seed) undesirable = undesirable.shuffle(seed=args.data_seed) @@ -727,18 +725,20 @@ def make_inputs_require_grad(module, input, output): if self.loss_type == "bco": self.running = RunningMoments(self.accelerator) - if self.embedding_func is None: - return + if self.embedding_func is None: + return - chosen_embeddings = self._get_sample_prompt_embeddings(desirable, sample_size=self.args.prompt_sample_size) - rejected_embeddings = self._get_sample_prompt_embeddings(undesirable, sample_size=self.args.prompt_sample_size) 
+ chosen_embeddings = self._get_sample_prompt_embeddings(desirable, sample_size=self.args.prompt_sample_size) + rejected_embeddings = self._get_sample_prompt_embeddings( + undesirable, sample_size=self.args.prompt_sample_size + ) - embeddings = torch.cat((chosen_embeddings, rejected_embeddings), dim=0) - labels = torch.cat( - (torch.ones_like(chosen_embeddings[:, 0]), torch.zeros_like(rejected_embeddings[:, 0])), dim=0 - ) + embeddings = torch.cat((chosen_embeddings, rejected_embeddings), dim=0) + labels = torch.cat( + (torch.ones_like(chosen_embeddings[:, 0]), torch.zeros_like(rejected_embeddings[:, 0])), dim=0 + ) - self.clf = LogisticRegression(class_weight="balanced").fit(embeddings.cpu().numpy(), labels.cpu().numpy()) + self.clf = LogisticRegression(class_weight="balanced").fit(embeddings.cpu().numpy(), labels.cpu().numpy()) @property def match_underlying_distribution(self):