Handle num_items_in_batch in Mistral's forward
This PR enables handling loss keyword arguments in Mistral's
forward() method. Specifically, if `num_items_in_batch` is passed,
its value is used to normalize the loss correctly.

This relates to the Gradient Accumulation fix (#34191)

Fixes #34575
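
For context, a minimal standalone sketch (illustration only, not part of this commit, with made-up tensors) of why a per-micro-batch mean mis-normalizes the loss under gradient accumulation, while summing and dividing by `num_items_in_batch` recovers the mean over the full accumulated batch:

import torch
from torch.nn import CrossEntropyLoss

# Two micro-batches of one gradient-accumulation step, with different
# numbers of label tokens (3 and 1).
logits_a, labels_a = torch.randn(3, 5), torch.tensor([0, 1, 2])
logits_b, labels_b = torch.randn(1, 5), torch.tensor([3])
num_items_in_batch = labels_a.numel() + labels_b.numel()  # 4

# Reference: one mean over ALL tokens of the accumulated batch.
reference = CrossEntropyLoss(reduction="mean")(
    torch.cat([logits_a, logits_b]), torch.cat([labels_a, labels_b])
)

# Per-micro-batch mean, then averaging, weights each micro-batch equally,
# which is wrong when the token counts differ.
mean_fct = CrossEntropyLoss(reduction="mean")
biased = (mean_fct(logits_a, labels_a) + mean_fct(logits_b, labels_b)) / 2

# Sum reduction divided by num_items_in_batch (what this commit does per
# micro-batch) adds up to the reference across the accumulation step.
sum_fct = CrossEntropyLoss(reduction="sum")
fixed = (sum_fct(logits_a, labels_a) + sum_fct(logits_b, labels_b)) / num_items_in_batch

print(torch.isclose(fixed, reference))   # tensor(True)
print(torch.isclose(biased, reference))  # typically tensor(False)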
gheinrich committed Nov 2, 2024
1 parent 33868a0 commit a4faa09
Showing 1 changed file with 6 additions and 1 deletion.
src/transformers/models/mistral/modeling_mistral.py (6 additions & 1 deletion)
@@ -1027,6 +1027,7 @@ def forward(
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
         num_logits_to_keep: int = 0,
+        **loss_kwargs,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         r"""
         Args:
@@ -1095,8 +1096,12 @@ def forward(
             shift_labels = shift_labels.view(-1)
             # Ensure tensors are on the same device
             shift_labels = shift_labels.to(shift_logits.device)
-            loss_fct = CrossEntropyLoss()
+            num_items_in_batch = loss_kwargs.pop("num_items_in_batch", None)
+            reduction = "sum" if num_items_in_batch is not None else "mean"
+            loss_fct = CrossEntropyLoss(reduction=reduction)
             loss = loss_fct(shift_logits, shift_labels)
+            if reduction == "sum":
+                loss = loss / num_items_in_batch
 
         if not return_dict:
             output = (logits,) + outputs[1:]
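
A rough usage sketch of the new kwarg (hypothetical training-loop fragment; `model`, `optimizer`, and `micro_batches` are assumed, and the real Trainer's token accounting may differ slightly):

# Count label tokens across all micro-batches of one accumulation step
# (-100 is the usual ignore index for Hugging Face labels).
num_items_in_batch = sum(int((mb["labels"] != -100).sum()) for mb in micro_batches)

for mb in micro_batches:
    out = model(**mb, num_items_in_batch=num_items_in_batch)
    # Each loss is already (sum of token losses) / num_items_in_batch, so the
    # gradients from repeated backward() add up to the full-batch mean.
    out.loss.backward()

optimizer.step()
optimizer.zero_grad()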
