Commit 7347c29

qgallouedec and kashif authored
🥾 Allow bootstrap GRPO (#2829)
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
1 parent 2106b31 commit 7347c29

File tree: 2 files changed, +23 -4 lines changed


trl/data_utils.py (+19 -3)
@@ -90,8 +90,21 @@ def apply_chat_template(

     # Apply the chat template to the prompt, adding the generation prompt
     if "prompt" in example:
+        last_role = example["prompt"][-1]["role"]
+        if last_role == "user":
+            add_generation_prompt = True
+            continue_final_message = False
+        elif last_role == "assistant":
+            add_generation_prompt = False
+            continue_final_message = True
+        else:
+            raise ValueError(f"Invalid role in the last message: {last_role}")
         prompt = tokenizer.apply_chat_template(
-            example["prompt"], tools=tools, tokenize=False, add_generation_prompt=True
+            example["prompt"],
+            tools=tools,
+            continue_final_message=continue_final_message,
+            tokenize=False,
+            add_generation_prompt=add_generation_prompt,
         )

     # Apply the chat template to the entire prompt + completion
@@ -180,10 +193,13 @@ def maybe_apply_chat_template(
     Returns:
         `dict[str, str]`: The formatted example with the chat template applied.

-    Note:
-        This function does not alter the keys, except for Language modeling dataset, where `"messages"` is replaced by
+    Notes:
+        - This function does not alter the keys, except for Language modeling dataset, where `"messages"` is replaced by
         `"text"`.

+        - In case of prompt-only data, if the last role is `"user"`, the generation prompt is added to the prompt. Else,
+          if the last role is `"assistant"`, the final message is continued.
+
     Example:

     ```python
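With this change, a prompt-only conversational example may end with a partial assistant turn that generation "bootstraps" from. A minimal sketch of the two behaviors, assuming a tokenizer that defines a chat template (the model name is illustrative, not part of this commit):

```python
from transformers import AutoTokenizer

from trl.data_utils import apply_chat_template

# Illustrative model choice; any tokenizer with a chat template works.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# Last message from the user: a generation prompt is appended, so the
# model starts a fresh assistant turn.
example = {"prompt": [{"role": "user", "content": "What color is the sky?"}]}
print(apply_chat_template(example, tokenizer)["prompt"])

# Last message is a partial assistant turn: the message is left open
# (continue_final_message=True), so generation resumes mid-message.
example = {
    "prompt": [
        {"role": "user", "content": "What color is the sky?"},
        {"role": "assistant", "content": "The sky is"},
    ]
}
print(apply_chat_template(example, tokenizer)["prompt"])
```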

trl/trainer/grpo_trainer.py (+4 -1)
@@ -577,7 +577,10 @@ def _prepare_inputs(self, inputs: dict[str, Union[torch.Tensor, Any]]) -> dict[s
         # Decode the generated completions
         completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True)
         if is_conversational(inputs[0]):
-            completions = [[{"role": "assistant", "content": completion}] for completion in completions_text]
+            completions = []
+            for prompt, completion in zip(prompts, completions_text):
+                bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
+                completions.append([{"role": "assistant", "content": bootstrap + completion}])
         else:
             completions = completions_text
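On the trainer side, the popped bootstrap text is glued back onto the decoded completion, so reward functions still receive one complete assistant message per sample. A self-contained sketch of that loop, with toy data standing in for the trainer's real `prompts` and `completions_text`:

```python
# Toy stand-ins for the trainer's conversational prompts and the
# batch-decoded completion strings.
prompts = [
    [
        {"role": "user", "content": "What color is the sky?"},
        {"role": "assistant", "content": "The sky is"},  # partial turn to bootstrap
    ],
    [{"role": "user", "content": "Name a prime number."}],  # no bootstrap
]
completions_text = [" blue.", "7 is a prime number."]

completions = []
for prompt, completion in zip(prompts, completions_text):
    # If the prompt ends with an assistant turn, remove it and prepend its
    # content, so the completion reads as one full assistant message.
    bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
    completions.append([{"role": "assistant", "content": bootstrap + completion}])

print(completions)
# [[{'role': 'assistant', 'content': 'The sky is blue.'}],
#  [{'role': 'assistant', 'content': '7 is a prime number.'}]]
```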
