From 64203dd7eb7bc6d042eb744a66ba752eb3d6d556 Mon Sep 17 00:00:00 2001
From: Igor Melnyk <igor.melnyk@ibm.com>
Date: Wed, 5 Jun 2024 16:35:37 -0400
Subject: [PATCH 1/3] adds AOT

---
 trl/trainer/dpo_config.py  |  5 +++--
 trl/trainer/dpo_trainer.py | 26 +++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/trl/trainer/dpo_config.py b/trl/trainer/dpo_config.py
index b4b259fa464..8628bc0c5ae 100644
--- a/trl/trainer/dpo_config.py
+++ b/trl/trainer/dpo_config.py
@@ -29,7 +29,8 @@ class DPOConfig(TrainingArguments):
             The robust DPO label smoothing parameter from the [cDPO](https://ericmitchell.ai/cdpo.pdf) report and [Robust DPO](https://arxiv.org/abs/2403.00409) paper that should be between 0 and 0.5.
         loss_type (`str`, defaults to `"sigmoid"`):
             The type of DPO loss to use. Either `"sigmoid"` the default DPO loss,`"hinge"` loss from [SLiC](https://arxiv.org/abs/2305.10425) paper, `"ipo"` from [IPO](https://arxiv.org/abs/2310.12036) paper,
-            `"kto_pair"` from the HALOs [report](https://github.com/ContextualAI/HALOs/blob/main/assets/report.pdf), `"bco_pair"` from [BCO](https://arxiv.org/abs/2404.04656) paper or `"robust"` from [Robust DPO](https://arxiv.org/abs/2403.00409) paper.
+            `"kto_pair"` from the HALOs [report](https://github.com/ContextualAI/HALOs/blob/main/assets/report.pdf), `"bco_pair"` from [BCO](https://arxiv.org/abs/2404.04656) paper, `"robust"` from [Robust DPO](https://arxiv.org/abs/2403.00409) paper,
+            or `"aot"` and `"aot_pair"` from the alignment via optimal transport ([AOT](https://arxiv.org/abs/2406.05882)) paper.
         label_pad_token_id (`int`, defaults to `-100`):
             The label pad token id. This argument is required if you want to use the default data collator.
         padding_value (`int`, defaults to `0`):
@@ -78,7 +79,7 @@ class DPOConfig(TrainingArguments):
     beta: float = 0.1
     label_smoothing: float = 0
     loss_type: Literal[
-        "sigmoid", "hinge", "ipo", "kto_pair", "bco_pair", "sppo_hard", "nca_pair", "robust"
+        "sigmoid", "hinge", "ipo", "kto_pair", "bco_pair", "sppo_hard", "nca_pair", "robust", "aot", "aot_pair"
     ] = "sigmoid"
     label_pad_token_id: int = -100
     padding_value: int = 0
diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 7a6e0fe6afc..410f7f7ab6e 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -135,7 +135,7 @@ def __init__(
         ref_model: Optional[Union[PreTrainedModel, nn.Module, str]] = None,
         beta: float = 0.1,
         label_smoothing: float = 0,
-        loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair", "bco_pair", "robust"] = "sigmoid",
+        loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair", "bco_pair", "robust", "aot", "aot_pair"] = "sigmoid",
         args: Optional[DPOConfig] = None,
         data_collator: Optional[DataCollator] = None,
         label_pad_token_id: int = -100,
@@ -1066,6 +1066,30 @@ def dpo_loss(
                 - 0.5 * F.logsigmoid(-chosen_rewards)
                 - 0.5 * F.logsigmoid(-rejected_rewards)
             )
+        elif self.loss_type == "aot_pair":
+            chosen_logratios = policy_chosen_logps - reference_chosen_logps
+            rejected_logratios = policy_rejected_logps - reference_rejected_logps
+
+            chosen_logratios_sorted, _ = torch.sort(chosen_logratios, dim=0)
+            rejected_logratios_sorted, _ = torch.sort(rejected_logratios, dim=0)
+
+            delta = chosen_logratios_sorted - rejected_logratios_sorted
+
+            losses = (-F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing)
+                      - F.logsigmoid(-self.beta * delta) * self.label_smoothing)
+
+        elif self.loss_type == "aot":
+            pi_logratios = policy_chosen_logps - policy_rejected_logps
+            ref_logratios = reference_chosen_logps - reference_rejected_logps
+
+            pi_logratios_sorted, _ = torch.sort(pi_logratios, dim=0)
+            ref_logratios_sorted, _ = torch.sort(ref_logratios, dim=0)
+
+            delta = pi_logratios_sorted - ref_logratios_sorted
+
+            losses = (-F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing)
+                      - F.logsigmoid(-self.beta * delta) * self.label_smoothing)
+
         else:
             raise ValueError(
                 f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'kto_pair', 'bco_pair', 'sppo_hard', 'nca_pair', 'robust']"

From 1724d82c3916461442f9740ad11555541863e531 Mon Sep 17 00:00:00 2001
From: Igor Melnyk <igor.melnyk@ibm.com>
Date: Thu, 6 Jun 2024 10:50:00 -0400
Subject: [PATCH 2/3] Applied format changes

---
 trl/trainer/dpo_trainer.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py
index 410f7f7ab6e..4cbf3e9461e 100644
--- a/trl/trainer/dpo_trainer.py
+++ b/trl/trainer/dpo_trainer.py
@@ -1075,8 +1075,10 @@ def dpo_loss(
 
             delta = chosen_logratios_sorted - rejected_logratios_sorted
 
-            losses = (-F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing)
-                      - F.logsigmoid(-self.beta * delta) * self.label_smoothing)
+            losses = (
+                -F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * delta) * self.label_smoothing
+            )
 
         elif self.loss_type == "aot":
             pi_logratios = policy_chosen_logps - policy_rejected_logps
@@ -1087,8 +1089,10 @@ def dpo_loss(
 
             delta = pi_logratios_sorted - ref_logratios_sorted
 
-            losses = (-F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing)
-                      - F.logsigmoid(-self.beta * delta) * self.label_smoothing)
+            losses = (
+                -F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing)
+                - F.logsigmoid(-self.beta * delta) * self.label_smoothing
+            )
 
         else:
             raise ValueError(

From 83b6c34d8d2429cd09f0866141d1068d463c540c Mon Sep 17 00:00:00 2001
From: Igor Melnyk <igor.melnyk@ibm.com>
Date: Tue, 11 Jun 2024 19:27:21 -0400
Subject: [PATCH 3/3] added docs and tests

---
 docs/source/dpo_trainer.mdx |  2 ++
 tests/test_dpo_trainer.py   | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/docs/source/dpo_trainer.mdx b/docs/source/dpo_trainer.mdx
index 2022743b4eb..dd267be3e69 100644
--- a/docs/source/dpo_trainer.mdx
+++ b/docs/source/dpo_trainer.mdx
@@ -121,6 +121,8 @@ The [TR-DPO](https://arxiv.org/pdf/2404.09656) paper suggests syncing the refere
 
 The [RPO](https://arxiv.org/abs/2404.19733) paper implements an iterative preference tuning algorithm using a loss related to the RPO loss in this [paper](https://arxiv.org/abs/2405.16436) that essentially consists of the SFT loss on the chosen preferences together with a weighted DPO loss. To use this loss set the `rpo_alpha` in the `DPOConfig` to an appropriate value.
 
+The [AOT](https://arxiv.org/abs/2406.05882) authors propose Distributional Preference Alignment via Optimal Transport. Traditional alignment algorithms use paired preferences at the sample level, which does not ensure alignment at the distributional level. AOT, on the other hand, can align LLMs on paired or unpaired preference data by making the reward distribution of the positive samples first-order stochastically dominant over the reward distribution of the negative samples. Specifically, `loss_type="aot"` is appropriate for paired datasets, where each prompt has both chosen and rejected responses; `loss_type="aot_pair"` is for unpaired datasets. Note that `loss_type="aot_pair"` is similar in spirit to `loss_type="kto_pair"`, which applies an unpaired alignment methodology to a paired dataset. In a nutshell, `loss_type="aot"` ensures that the chosen-to-rejected log-likelihood ratio of the aligned model has higher quantiles than the same ratio for the reference model, while `loss_type="aot_pair"` ensures that the chosen reward is higher than the rejected reward on all quantiles. In both cases the quantiles are obtained by sorting the values within a batch, so to fully leverage the advantages of the AOT algorithm it is important to maximize the per-GPU batch size.
+
 ## Logging
 
 While training and evaluating we record the following reward metrics:
diff --git a/tests/test_dpo_trainer.py b/tests/test_dpo_trainer.py
index 3b5bfcae2b9..8fabbe708f2 100644
--- a/tests/test_dpo_trainer.py
+++ b/tests/test_dpo_trainer.py
@@ -88,6 +88,10 @@ def _init_dummy_dataset(self):
             ["t5", "ipo", True],
             ["gpt2", "kto_pair", True],
             ["t5", "kto_pair", False],
+            ["gpt2", "aot_pair", True],
+            ["t5", "aot_pair", False],
+            ["gpt2", "aot", True],
+            ["t5", "aot", False],
             ["gpt2", "bco_pair", False],
             ["t5", "bco_pair", True],
             ["gpt2", "sppo_hard", False],
@@ -506,6 +510,14 @@ def test_dpo_lora_bf16_autocast_llama(self):
             ["gpt2", "kto_pair", False, True],
             ["gpt2", "kto_pair", True, False],
             ["gpt2", "kto_pair", True, True],
+            ["gpt2", "aot_pair", False, False],
+            ["gpt2", "aot_pair", False, True],
+            ["gpt2", "aot_pair", True, False],
+            ["gpt2", "aot_pair", True, True],
+            ["gpt2", "aot", False, False],
+            ["gpt2", "aot", False, True],
+            ["gpt2", "aot", True, False],
+            ["gpt2", "aot", True, True],
             ["gpt2", "bco_pair", False, False],
             ["gpt2", "bco_pair", False, True],
             ["gpt2", "bco_pair", True, False],