
Add a variant of CPO, SimPO #1703

Merged · 12 commits · Jun 6, 2024
3 changes: 3 additions & 0 deletions docs/source/cpo_trainer.mdx
@@ -5,6 +5,9 @@ avoid generating adequate, but not perfect translations in Machine Translation (

CPO aims to mitigate two fundamental shortcomings of SFT. First, SFT’s methodology of minimizing the discrepancy between predicted outputs and gold-standard references inherently caps model performance at the quality level of the training data. Second, SFT lacks a mechanism to prevent the model from rejecting mistakes in translations. The CPO objective is derived from the DPO objective.

## A Variant of CPO: SimPO
There is also a variant of CPO, [SimPO: Simple Preference Optimization with a Reference-Free Reward](https://arxiv.org/abs/2405.14734), which adds a target reward margin and does not use BC (behavior cloning) regularization. Set `loss_type="simpo"` in the `CPOConfig` to use this loss.
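
For example, a minimal usage sketch (the `model`, `tokenizer`, and `train_dataset` variables are placeholders, and the hyperparameter values are illustrative rather than tuned recommendations):

```python
from trl import CPOConfig, CPOTrainer

# Illustrative values only; tune beta and simpo_gamma for your task.
training_args = CPOConfig(
    output_dir="./simpo-model",
    loss_type="simpo",  # switch the CPO trainer to the SimPO loss
    simpo_gamma=0.5,    # target reward margin
    beta=2.0,           # temperature on the length-normalized reward
)

trainer = CPOTrainer(
    model=model,                  # a causal LM loaded beforehand (placeholder)
    args=training_args,
    train_dataset=train_dataset,  # prompt/chosen/rejected entries (placeholder)
    tokenizer=tokenizer,          # placeholder
)
trainer.train()
```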


@fe1ixxu thanks for the PR! You implemented SimPO loss elegantly here.

WDYT about changing the documentation to the following:

## SimPO: Regularizing output length

Sometimes, model output length is a confounding factor in post-training evaluations because judges may prefer longer outputs. [SimPO: Simple Preference Optimization with a Reference-Free Reward](https://arxiv.org/abs/2405.14734) is an alternative loss that regularizes output length, adds a reward margin, and does not use BC regularization. Because the reward is length-normalized, the trained model does not over-exploit longer outputs.

We can easily reuse `CPOTrainer` for SimPO by setting `loss_type="simpo"` and tuning `simpo_gamma` in the `CPOConfig`.

This way, we emphasize the potential benefit of SimPO (which is to regularize length).
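
For reference, here is my transcription of the length-normalized objective from the SimPO paper, where $y_w$ and $y_l$ are the chosen and rejected completions and `simpo_gamma` plays the role of the margin $\gamma$:

$$
\mathcal{L}_{\text{SimPO}}(\pi_\theta) = -\log \sigma\!\left( \frac{\beta}{|y_w|} \log \pi_\theta(y_w \mid x) \;-\; \frac{\beta}{|y_l|} \log \pi_\theta(y_l \mid x) \;-\; \gamma \right)
$$

Dividing each log probability by the completion length $|y|$ is what removes the length bias, and $\gamma$ enforces a minimum reward margin between chosen and rejected completions.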

## Expected dataset format

The CPO trainer expects a format identical to the DPO trainer, which should include three entries. These entries should be named as follows:
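
The entry names themselves fall in a collapsed part of this diff; assuming the DPO-style convention (`prompt`, `chosen`, `rejected`), a minimal example would look like:

```python
# Minimal illustrative dataset dict, assuming DPO-style entry names.
cpo_dataset_dict = {
    "prompt": ["Translate to German: Hello, world!"],
    "chosen": ["Hallo, Welt!"],
    "rejected": ["Hallo Welt, du!"],
}
```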
2 changes: 2 additions & 0 deletions tests/test_cpo_trainer.py
@@ -84,6 +84,8 @@ def _init_dummy_dataset(self):
["t5", "hinge"],
["gpt2", "ipo"],
["t5", "ipo"],
["gpt2", "simpo"],
["t5", "simpo"],
]
)
def test_cpo_trainer(self, name, loss_type):
5 changes: 4 additions & 1 deletion trl/trainer/cpo_config.py
@@ -41,6 +41,8 @@ class CPOConfig(TrainingArguments):
The type of loss to use. This argument is required if you want to use the default data collator.
label_pad_token_id (`int`, defaults to `-100`):
The label pad token id. This argument is required if you want to use the default data collator.
simpo_gamma (`float`, defaults to `0.5`):
A target reward margin for the SimPO loss, used only when `loss_type="simpo"`.
padding_value (`int`, defaults to `None`):
The padding value if it is different to the tokenizer's pad_token_id.
truncation_mode (`str`, defaults to `keep_end`):
@@ -64,8 +66,9 @@

beta: float = 0.1
label_smoothing: float = 0
loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair"] = "sigmoid"
loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair", "simpo"] = "sigmoid"
disable_dropout: bool = True
simpo_gamma: float = 0.5

label_pad_token_id: int = -100
padding_value: int = None
24 changes: 20 additions & 4 deletions trl/trainer/cpo_trainer.py
@@ -268,6 +268,9 @@ def make_inputs_require_grad(module, input, output):
self.label_smoothing = args.label_smoothing
self.loss_type = args.loss_type

if args.loss_type == "simpo":
self.simpo_gamma = args.simpo_gamma

self._stored_metrics = defaultdict(lambda: defaultdict(list))

# Compute that only on the main process for faster data processing.
@@ -585,7 +588,16 @@ def cpo_loss(
# The beta is a temperature parameter for the CPO loss, typically something in the range of 0.1 to 0.5.
# We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the labels and
# calculates a conservative CPO loss.
if self.loss_type == "sigmoid":

if self.loss_type == "simpo":
gamma_logratios = self.simpo_gamma / self.beta
logits = logits - gamma_logratios
# Same smoothed-sigmoid loss as the "sigmoid" branch below, applied to the margin-shifted logits.
losses = (
-F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
- F.logsigmoid(-self.beta * logits) * self.label_smoothing
)
elif self.loss_type == "sigmoid":
# This reduces to Equation 3 from the CPO paper when label_smoothing -> 0.
losses = (
-F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing)
@@ -598,7 +610,7 @@
losses = (logits - 1 / (2 * self.beta)) ** 2
else:
raise ValueError(
f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'kto_pair']"
f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'kto_pair', 'simpo']"
)

chosen_rewards = self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach()
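
As a quick standalone sanity check of the `simpo` branch above (the log probabilities below are made up, not from any real model):

```python
import torch
import torch.nn.functional as F

beta, simpo_gamma, label_smoothing = 2.0, 0.5, 0.0

# Hypothetical length-normalized sequence log probs (average_log_prob=True).
policy_chosen_logps = torch.tensor([-0.8])
policy_rejected_logps = torch.tensor([-1.5])

logits = policy_chosen_logps - policy_rejected_logps  # 0.7
logits = logits - simpo_gamma / beta                  # 0.7 - 0.25 = 0.45
losses = (
    -F.logsigmoid(beta * logits) * (1 - label_smoothing)
    - F.logsigmoid(-beta * logits) * label_smoothing
)
print(losses)  # tensor([0.3412]), i.e. -log(sigmoid(0.9))
```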
@@ -691,12 +703,16 @@ def cross_entropy_loss(logits, labels):
return loss

labels = concatenated_batch["concatenated_labels"].clone()
nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen])

if self.loss_type != "simpo":
nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen])
else:
nll_loss = torch.tensor(0.0).to(self.accelerator.device)

all_logps = self.get_batch_logps(
all_logits,
concatenated_batch["concatenated_labels"],
average_log_prob=self.loss_type == "ipo",
average_log_prob=self.loss_type in ["ipo", "simpo"],
is_encoder_decoder=self.is_encoder_decoder,
label_pad_token_id=self.label_pad_token_id,
)
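
The switch to `average_log_prob` for `"simpo"` is what makes the reward length-normalized. A toy illustration (per-token log probs are invented):

```python
import torch

# Invented per-token log probs for a single completion.
token_logps = torch.tensor([-0.5, -0.4, -0.6, -0.5])

summed = token_logps.sum()     # tensor(-2.): magnitude grows with length
averaged = token_logps.mean()  # tensor(-0.5000): length-normalized, used for "ipo"/"simpo"
```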