huggingface · amyeroberts · Jun 19, 2024 · Jun 18, 2024 · Jun 18, 2024 · Jun 18, 2024
diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py
@@ -86,7 +86,7 @@ def load_balancing_loss_func(
     experts is too unbalanced.
 
     Args:
-        gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
+        gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]]):
             Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
             shape [batch_size X sequence_length, num_experts].
         attention_mask (`torch.Tensor`, None):
@@ -112,10 +112,10 @@ def load_balancing_loss_func(
     expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
 
     if attention_mask is None:
-        # Compute the percentage of tokens routed to each experts
+        # Compute the percentage of tokens routed to each expert across all layers
         tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
 
-        # Compute the average probability of routing to these experts
+        # Compute the average probability of routing to these experts across all layers
         router_prob_per_expert = torch.mean(routing_weights, dim=0)
     else:
         batch_size, sequence_length = attention_mask.shape
@@ -129,7 +129,7 @@ def load_balancing_loss_func(
             .to(compute_device)
         )
 
-        # Compute the percentage of tokens routed to each experts
+        # Compute the percentage of tokens routed to each expert across all layers
         tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
             expert_attention_mask, dim=0
         )
@@ -147,9 +147,10 @@ def load_balancing_loss_func(
             router_per_expert_attention_mask, dim=0
         )
 
-    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
-    return overall_loss * num_experts
+    # Sum of (fraction of tokens routed to each expert) * (mean router probability of that expert)
+    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert)
 
+    return overall_loss * num_experts
 
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
@@ -810,6 +810,12 @@ def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optio
             )
 
         columns = [k for k in signature_columns if k in dataset.column_names]
+        if len(columns) == 0:
+            raise ValueError(
+                "No columns in the dataset match the model's forward method signature. "
+                f"The following columns have been ignored: [{', '.join(ignored_columns)}]. "
+                "Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`."
+            )
 
         if version.parse(datasets.__version__) < version.parse("1.4.0"):
             dataset.set_format(