
Gemma2: add cache warning #32279

Merged: 11 commits, Aug 7, 2024
9 changes: 9 additions & 0 deletions docs/source/en/internal/generation_utils.md
@@ -391,6 +391,15 @@ A [`Constraint`] can be used to force the generation to include specific tokens
- get_seq_length
- reset

[[autodoc]] HybridCache
- update
- get_seq_length
- reset

[[autodoc]] SlidingWindowCache
- update
- reset

Collaborator:

Perfect!

[[autodoc]] EncoderDecoderCache
- get_seq_length
- to_legacy_cache
16 changes: 14 additions & 2 deletions src/transformers/models/gemma2/modeling_gemma2.py
@@ -747,7 +747,19 @@ def forward(
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
-            cache_position = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device)
+            if past_key_values is None:
+                cache_position = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device)
+            else:
+                raise ValueError("When `past_key_values` is passed, `cache_position` must be too")
Collaborator:

That is actually valid; someone might be using `past_key_values` outside of `generate`.

+        # `use_cache=True` was requested but no cache object was passed (outside of
+        # training), so warn the user and disable caching for this call.
+        if use_cache and past_key_values is None and not self.training:
+            logger.warning_once(
+                "You are calling the model with `use_cache=True` but didn't pass `past_key_values`. "
+                "Make sure to pass an instance of `HybridCache`. Caching will be disabled. For more information, see "
+                "https://huggingface.co/docs/transformers/main/en/internal/generation_utils#transformers.HybridCache"
+            )
+            use_cache = False
Collaborator:

TBH I am bothered by adding a lot of if/else branching like this, which should more often than not be handled outside the model. Though I guess we don't have a choice.

Collaborator:

What if we change the good old

use_cache = use_cache if use_cache is not None else self.config.use_cache or not self.training

Member Author:

We could, but the main concern, as the user mentioned in the issue, is that people expect Gemma2 to behave like all other models and return a cache when `use_cache` is True. Simply changing `use_cache` won't make it any more explicit for those not familiar with cache classes. My main point is to make users aware that Gemma2 is different and that they should not expect the same behavior as in other models.

Collaborator:

Got it. There is a tradeoff between people's expectations and warning everywhere. IMO we should:

  • better document Gemma2: add in gemma2.md that Gemma2 has a different cache class
  • return the HybridCache only when `use_cache` is set, as in:

        return_legacy_cache = False
        if (
            use_cache and not isinstance(past_key_values, Cache) and not self.training
        ):  # kept for BC (non `Cache` `past_key_values` inputs)
            return_legacy_cache = True
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )

In Llama we already have this, meaning that if `past_key_values` is None we only create a DynamicCache when not training.

Member Author:

Yes, we should also document it; I'll add some info. So initializing an empty HybridCache should be okay, as long as we warn and document it.
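
For readers following along, here is a minimal sketch of what passing an explicit, empty HybridCache could look like from the user side. The checkpoint name, the 512 cache length, and the explicit `cache_position` are illustrative assumptions rather than code from this PR, and the HybridCache constructor arguments are passed positionally because their keyword names have shifted between releases:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, HybridCache

    model_id = "google/gemma-2-9b"  # illustrative Gemma2 checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

    inputs = tokenizer("Hello", return_tensors="pt").to(model.device)

    # Pre-allocate an empty HybridCache (config, max batch size, max cache length)
    # and hand it to the model, so the warning added in this PR never fires.
    past_key_values = HybridCache(model.config, 1, 512, device=model.device, dtype=model.dtype)
    cache_position = torch.arange(inputs["input_ids"].shape[1], device=model.device)

    outputs = model(**inputs, past_key_values=past_key_values, cache_position=cache_position, use_cache=True)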


        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

@@ -832,7 +844,7 @@ def _update_causal_mask(
        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
-        if past_key_values is not None:
+        if past_key_values is not None and past_key_values.get_max_length() is not None:
            target_length = past_key_values.get_max_length()
        else:
            target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1]
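
Side note on the `get_max_length()` guard above: dynamic caches report no fixed maximum length, so the extra check makes `target_length` fall back to the attention-mask/input length for them, while a preallocated HybridCache reports its capacity. A rough illustration, with an arbitrary checkpoint name and cache length:

    from transformers import AutoConfig, DynamicCache, HybridCache

    config = AutoConfig.from_pretrained("google/gemma-2-9b")  # illustrative

    dynamic = DynamicCache()
    static = HybridCache(config, 1, 512)  # max batch size 1, max cache length 512

    # DynamicCache has no fixed capacity, so the new check falls back to the
    # attention mask / input length; HybridCache reports its preallocated length.
    print(dynamic.get_max_length())  # None
    print(static.get_max_length())   # 512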