From ef8a5589923cc457a9fa123a51e8f3ac8372dc57 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Thu, 19 Sep 2024 12:02:46 +0100 Subject: [PATCH] Cache: don't show warning in forward passes when `past_key_values` is None (#33541) --- docs/source/en/kv_cache.md | 33 ++++++++++++++++--- .../models/bloom/modeling_bloom.py | 26 ++++++++------- .../models/codegen/modeling_codegen.py | 22 ++++++++----- .../models/cohere/modeling_cohere.py | 19 ++++++----- src/transformers/models/dbrx/modeling_dbrx.py | 19 ++++++----- .../models/falcon/modeling_falcon.py | 24 ++++++++------ src/transformers/models/gemma/diff_gemma.py | 15 ++++++--- .../models/gemma/modeling_gemma.py | 28 ++++++++-------- src/transformers/models/git/modeling_git.py | 27 ++++++++------- .../models/gpt_neo/modeling_gpt_neo.py | 24 ++++++++------ .../models/gpt_neox/modeling_gpt_neox.py | 22 ++++++++----- .../modeling_gpt_neox_japanese.py | 22 ++++++++----- src/transformers/models/gptj/modeling_gptj.py | 22 ++++++++----- .../models/granite/modeling_granite.py | 17 ++++++---- .../models/idefics/modeling_idefics.py | 14 +++++--- .../models/idefics2/modeling_idefics2.py | 14 ++++++-- .../models/jetmoe/modeling_jetmoe.py | 15 ++++++--- .../models/llama/modeling_llama.py | 19 ++++++----- .../models/mistral/modeling_mistral.py | 17 ++++++---- .../models/mixtral/modeling_mixtral.py | 27 ++++++++------- src/transformers/models/olmo/modeling_olmo.py | 19 ++++++----- .../models/olmoe/modeling_olmoe.py | 17 ++++++---- .../models/persimmon/modeling_persimmon.py | 27 ++++++++------- src/transformers/models/phi/modeling_phi.py | 28 +++++++++------- src/transformers/models/phi3/modeling_phi3.py | 28 +++++++++------- .../models/qwen2/modeling_qwen2.py | 27 ++++++++------- .../models/qwen2_moe/modeling_qwen2_moe.py | 27 ++++++++------- .../models/stablelm/modeling_stablelm.py | 27 ++++++++------- .../models/starcoder2/modeling_starcoder2.py | 27 ++++++++------- 29 files changed, 402 insertions(+), 251 deletions(-) diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md index 1a9ea1ac0019..05ab9eafa723 100644 --- a/docs/source/en/kv_cache.md +++ b/docs/source/en/kv_cache.md @@ -120,7 +120,7 @@ To enable quantization of the key-value cache, one needs to indicate `cache_impl Quantization related arguments should be passed to the `generation_config` either as a `dict` or an instance of a [`~QuantizedCacheConfig`] class. One has to indicate which quantization backend to use in the [`~QuantizedCacheConfig`], the default is `quanto`. -It is recommended to set `axis-key/axis-value` parameters in the cache config to `0` if you're using the `quanto` backend and to `1` if you're using the `HQQ` backend. For other config values, please use the defaults unless you're running out of memory. In that case, you may consider decreasing the residual length. +It is recommended to set `axis-key/axis-value` parameters in the cache config to `0` if you're using the `quanto` backend and to `1` if you're using the `HQQ` backend. For other config values, please use the defaults unless you're running out of memory. In that case, you may consider decreasing the residual length. @@ -308,7 +308,7 @@ Unlike other cache classes, this one can't be used directly by indicating a `cac ### Encoder-Decoder Cache -The [`~EncoderDecoderCache`] is a wrapper designed to handle the caching needs of encoder-decoder models. 
This cache type is specifically built to manage both self-attention and cross-attention caches, ensuring storage and retrieval of past key/values required for these complex models. Cool thing about Encoder-Decoder Cache is that you can set different cache types for the encoder and for the decoder, depending on your use case. Currently this cache is only supported in [Whisper](./model_doc/whisper) models but we will be adding more models soon. +The [`~EncoderDecoderCache`] is a wrapper designed to handle the caching needs of encoder-decoder models. This cache type is specifically built to manage both self-attention and cross-attention caches, ensuring storage and retrieval of past key/values required for these complex models. Cool thing about Encoder-Decoder Cache is that you can set different cache types for the encoder and for the decoder, depending on your use case. Currently this cache is only supported in [Whisper](./model_doc/whisper) models but we will be adding more models soon. In terms of usage, there is nothing special to be done and calling `generate()` or `forward()` will handle everything for you. @@ -379,7 +379,7 @@ Sometimes you would want to first fill-in cache object with key/values for certa >>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda") >>> tokenizer = AutoTokenizer.from_pretrained(model_id) ->>> # Init StaticCache with big enough max-length (1024 tokens for the below example) +>>> # Init StaticCache with big enough max-length (1024 tokens for the below example) >>> # You can also init a DynamicCache, if that suits you better >>> prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16) @@ -394,10 +394,35 @@ Sometimes you would want to first fill-in cache object with key/values for certa >>> for prompt in prompts: ... new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda") ... past_key_values = copy.deepcopy(prompt_cache) -... outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20) +... outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20) ... response = tokenizer.batch_decode(outputs)[0] ... responses.append(response) >>> print(responses) [' You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTitle: The Ultimate Guide to Travelling: Tips, Tricks, and', ' You are a helpful assistant. What is the capital of France?\n\nYes, the capital of France is Paris.'] ``` + + +## Legacy cache format + +Prior to the introduction of the `Cache` object, the cache of LLMs used to be a tuple of tuples of tensors. The legacy +format has a dynamic size, growing as we generate text -- very similar to `DynamicCache`. If your project depend on +this legacy format, you can seamlessly convert it to a `DynamicCache` and back. + +```python +>>> import torch +>>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache + +>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") +>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto") +>>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device) + +>>> # `return_dict_in_generate=True` is required to return the cache. 
`return_legacy_cache` forces the returned cache +>>> # to be of the legacy type +>>> generation_outputs = model.generate(**inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5) + +>>> # We can convert a legacy cache to a DynamicCache -- and the other way around. This is helpful if you have custom +>>> # logic to manipulate a cache in a specific format. +>>> cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values) +>>> legacy_format_cache = cache.to_legacy_cache() +``` diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index d32ab95f51c7..b5b221b6b37f 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -687,14 +687,18 @@ def forward( inputs_embeds = self.word_embeddings(input_ids) # kept for BC (non `Cache` `past_key_values` inputs) - use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache) and not self.training: - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "Using `past_key_values` as a tuple is deprecated and will be removed in v4.45. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) batch_size, seq_length, _ = inputs_embeds.shape past_length = past_key_values.get_seq_length() if past_key_values is not None else 0 @@ -765,9 +769,9 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple( diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index 46eea43e1285..be57838975c0 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -526,14 +526,18 @@ def forward( if inputs_embeds is None: inputs_embeds = self.wte(input_ids) - use_legacy_cache = False + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False if use_cache and not isinstance(past_key_values, Cache): - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - if not self.training: + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. 
" - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" ) seq_length = inputs_embeds.shape[1] @@ -608,9 +612,9 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple( diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index ae84a9ec2d1a..cb1b3f885798 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -910,16 +910,19 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = False - if ( - use_cache and not isinstance(past_key_values, Cache) and not self.training - ): # kept for BC (non `Cache` `past_key_values` inputs) + if use_cache and not isinstance(past_key_values, Cache): return_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 43bac44ba1be..7263713c0840 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1059,16 +1059,19 @@ def forward( inputs_embeds = nn.functional.dropout(inputs_embeds, p=self.emb_pdrop, training=self.training) + # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = False - if ( - use_cache and not isinstance(past_key_values, Cache) and not self.training - ): # kept for BC (non `Cache` `past_key_values` inputs) + if use_cache and not isinstance(past_key_values, Cache): return_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. 
" - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 73e8806352eb..9a37fe22e177 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -1031,17 +1031,21 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) - # Compute alibi tensor: check build_alibi_tensor documentation - use_legacy_cache = False + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False if use_cache and not isinstance(past_key_values, Cache): - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - if not self.training: + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" ) + # Compute alibi tensor: check build_alibi_tensor documentation alibi = None past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 batch_size, seq_length, _ = inputs_embeds.shape @@ -1126,9 +1130,9 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple( diff --git a/src/transformers/models/gemma/diff_gemma.py b/src/transformers/models/gemma/diff_gemma.py index fcdb0f0b3d7d..36f2d1c594ab 100644 --- a/src/transformers/models/gemma/diff_gemma.py +++ b/src/transformers/models/gemma/diff_gemma.py @@ -476,12 +476,19 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = False # noqa: F841 - if ( - use_cache and not isinstance(past_key_values, Cache) and not self.training - ): # kept for BC (non `Cache` `past_key_values` inputs) + if use_cache and not isinstance(past_key_values, Cache): return_legacy_cache = True # noqa: F841 - past_key_values = DynamicCache.from_legacy_cache(past_key_values) + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index b14e0a4b3d8c..dd4c899d13d4 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -828,12 +828,19 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - return_legacy_cache = False # noqa: F841 - if ( - use_cache and not isinstance(past_key_values, Cache) and not self.training - ): # kept for BC (non `Cache` `past_key_values` inputs) - return_legacy_cache = True # noqa: F841 - past_key_values = DynamicCache.from_legacy_cache(past_key_values) + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 @@ -856,15 +863,6 @@ def forward( # See https://github.com/huggingface/transformers/pull/29402 normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) hidden_states = hidden_states * normalizer - if ( - use_cache and not isinstance(past_key_values, Cache) and not self.training - ): # kept for BC (non `Cache` `past_key_values` inputs) - return_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 43333e3d3338..7471eec811a0 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -417,14 +417,19 @@ def forward( ) use_cache = False - use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache) and not self.training: - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -463,9 +468,9 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple( diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 72590862b749..28309f7738eb 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -741,14 +741,18 @@ def forward( if inputs_embeds is None: inputs_embeds = self.wte(input_ids) - use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache) and not self.training: - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - if not self.training: + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" ) seq_length = inputs_embeds.shape[1] @@ -822,9 +826,9 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple( diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index e88302efa7bb..274c571fa893 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -943,14 +943,18 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_in(input_ids) - use_legacy_cache = False + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False if use_cache and not isinstance(past_key_values, Cache): - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - if not self.training: + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. 
" - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" ) seq_length = inputs_embeds.shape[1] @@ -1021,9 +1025,9 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attentions] if v is not None) diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index bf832195b4ef..048e108a8ec2 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -663,14 +663,18 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_in(input_ids) - use_legacy_cache = False + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False if use_cache and not isinstance(past_key_values, Cache): - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - if not self.training: + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" ) seq_length = inputs_embeds.shape[1] @@ -725,9 +729,9 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attentions] if v is not None) diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index bd7ce5696fa0..84f6d985f764 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -813,14 +813,18 @@ def forward( if inputs_embeds is None: inputs_embeds = self.wte(input_ids) - use_legacy_cache = False + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False if use_cache and not isinstance(past_key_values, Cache): - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - if not self.training: + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" ) seq_length = inputs_embeds.shape[1] @@ -917,9 +921,9 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple( diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index 876f5ed2a7c8..f62de411a4fa 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -834,14 +834,19 @@ def forward( inputs_embeds = inputs_embeds * self.embedding_multiplier + # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if use_cache and not isinstance(past_key_values, Cache): return_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. 
" - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" - ) + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 0d26c4fc65de..1289bda2d0fd 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1239,15 +1239,19 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = False if use_cache and not isinstance(past_key_values, Cache): - if not self.training: + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" ) - return_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) batch_size, seq_length, _ = inputs_embeds.shape past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 6108f0e8a42e..41be300095e7 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1345,11 +1345,19 @@ def forward( raise ValueError("You have to specify either input_ids or inputs_embeds") past_seen_tokens = 0 + # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = False - if use_cache: - if not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) - return_legacy_cache = True + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) past_seen_tokens = past_key_values.get_seq_length() if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0: diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index c273b021d736..7c4394d0e1a1 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -1033,12 +1033,19 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = False - if ( - use_cache and not isinstance(past_key_values, Cache) and not self.training - ): # kept for BC (non `Cache` `past_key_values` inputs) + if use_cache and not isinstance(past_key_values, Cache): return_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index c7017832b932..0bc44f314b5e 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -944,16 +944,19 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = False - if ( - use_cache and not isinstance(past_key_values, Cache) and not self.training - ): # kept for BC (non `Cache` `past_key_values` inputs) + if use_cache and not isinstance(past_key_values, Cache): return_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index ffe16b272033..80992734046a 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -762,14 +762,19 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache) and not self.training: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) + if use_cache and not isinstance(past_key_values, Cache): return_legacy_cache = True - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index c7062e75b108..fcc0d66e19c4 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1018,14 +1018,19 @@ def forward( ) use_cache = False - use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache) and not self.training: - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -1095,9 +1100,9 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple( diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index b4bda8e2db52..03fa524532a0 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -866,16 +866,19 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = False - if ( - use_cache and not isinstance(past_key_values, Cache) and not self.training - ): # kept for BC (non `Cache` `past_key_values` inputs) + if use_cache and not isinstance(past_key_values, Cache): return_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index a33338365312..2cbde7dc8631 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -1007,14 +1007,19 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if use_cache and not isinstance(past_key_values, Cache): return_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" - ) + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index ccaa2c7fd29a..a6fd2284afb6 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -685,14 +685,19 @@ def forward( ) use_cache = False - use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache) and not self.training: - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -761,9 +766,9 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 648d1653a3b5..4d0a076b5f9a 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -976,14 +976,19 @@ def forward( ) use_cache = False - use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache) and not self.training: - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -1053,9 +1058,10 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index ec395679ae62..e0ca84be1848 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -1003,14 +1003,19 @@ def forward( ) use_cache = False - use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache) and not self.training: - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -1074,9 +1079,10 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index d0ea8ef0e376..aafecb95b6aa 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -915,14 +915,19 @@ def forward( ) use_cache = False - use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache) and not self.training: - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. 
" - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -991,9 +996,9 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 6f483e50cde0..bc06b406bf43 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1079,14 +1079,19 @@ def forward( ) use_cache = False - use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache) and not self.training: - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -1161,9 +1166,9 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple( diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index d91c0832ed33..13641ecb37f2 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -960,14 +960,19 @@ def forward( ) use_cache = False - use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache) and not self.training: - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -1036,9 +1041,9 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 0be37c4e1fb9..5eaf50f090fa 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -889,14 +889,19 @@ def forward( ) use_cache = False - use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache) and not self.training: - use_legacy_cache = True - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. 
" - "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" - ) + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -966,9 +971,9 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)