
Fix length related warnings in speculative decoding #29585

Merged · 16 commits · Apr 10, 2024
Changes from 2 commits
9 changes: 9 additions & 0 deletions src/transformers/generation/candidate_generator.py
@@ -157,6 +157,13 @@ def __init__(
self.generation_config.return_dict_in_generate = True
self.generation_config.output_scores = True

# avoid unnecessary warnings that min_length is more than max_new_tokens
input_length = input_ids.shape[-1]
min_new_tokens = self.generation_config.min_new_tokens if self.generation_config.min_new_tokens else 0
Collaborator:

Are we guaranteed that self.generation_config has this attribute?

Member Author:

Yes, GenerationConfig initializes those to 0 or None when they are not set by the user. So we check that min_new_tokens is not None, and then set min_length to the maximum (the code says min, I'll fix it) of the user-defined min_length and input_length plus min_new_tokens (which defaults to 0).
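
For reference, those defaults can be checked directly (a minimal sketch; the printed values follow from the GenerationConfig defaults described above):

from transformers import GenerationConfig

config = GenerationConfig()
print(config.min_length)      # 0 when not set by the user
print(config.min_new_tokens)  # None when not set by the user

And a small worked example of the min/max mixup, with hypothetical numbers (a 15-token prompt, min_length=20, min_new_tokens=10):

input_length = 15
min(20, input_length + 10)  # -> 20, what the current code computes
max(20, input_length + 10)  # -> 25, the stricter of the two user constraints, i.e. the intended value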

Member:

@amyeroberts AFAIK only Whisper (and perhaps other audio models?) uses attributes that may not exist in a generation_config; otherwise it is a fairly regular object with everything initialized in __init__ :D

Collaborator:

OK, and can we ever expect Whisper generation configs to be used here, or does the model always just use its own custom generation code?

Collaborator:

Is this checking for None-ness? Or can it be False? Otherwise, defaulting to 0 if it's 0 is a superfluous check.

Member Author:

For None-ness; I'll spell it out as is not None.
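
A sketch of the more explicit check (illustrative, not the merged diff; the behaviour is the same here since 0 is the only falsy value involved, but it is unambiguous about None):

min_new_tokens = (
    self.generation_config.min_new_tokens
    if self.generation_config.min_new_tokens is not None
    else 0
)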

self.min_length = min(self.generation_config.min_length, input_length + min_new_tokens)
self.generation_config.min_length = 0
self.generation_config.min_new_tokens = None

def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Fetches the candidates to be tried for the current input.
@@ -175,6 +182,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor,
# Don't generate more than `max_length - 1` candidates since the target model generates one extra token.
new_cur_len = input_ids.shape[-1]
max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1)
min_new_tokens = min(max_new_tokens, self.min_length - new_cur_len)
Collaborator:

Won't this result in negative values if we've already generated more tokens than self.min_length, i.e. new_cur_len > self.min_length?

Member Author:

Good point, I'd missed it.
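
A sketch of a clamped version that avoids going negative (assuming the fix takes this shape):

min_new_tokens = max(min(max_new_tokens, self.min_length - new_cur_len), 0)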

Collaborator:

Why use a class attribute and not the generation config, as for max_length?

Member Author (@zucchini-nlp) · Mar 11, 2024:

Oh, this is because for maximum length we deprecated generation_config.max_new_tokens, so we can use the only possible attribute for max length. Yet for minimum length we have two attributes, both of which are equally valid. That's why in __init__ we manually set min_length by checking both attributes, if they are set by the user.

EDIT: I just remembered why I did it that way. We have to set the generation_config's min_length to 0 in __init__; that's required to avoid unnecessary warnings. That's why I saved it as a class attribute. Otherwise generate would receive kwargs like the below and throw warnings:
{"min_length": 20, "min_new_tokens": 5, "max_new_tokens": 5}

@gante, btw, don't you think we can also deprecate min_new_tokens?

Member:

@zucchini-nlp it's the other way around: if anything, we would want to deprecate the min_length argument/config option :) max_new_tokens and min_new_tokens are much more predictable from a user point of view, as the user doesn't need to be concerned with the input length. In the past, before max_new_tokens and min_new_tokens were introduced, we would often get issues from confused users.

Inside generate, however, it is much easier to use the total length for control (no need to track the input length). We set max_length from max_new_tokens when needed (here), perhaps we should do the same with min_length to simplify the procedure in this PR :)
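
A sketch of the conversion being suggested, mirroring how max_length is derived from max_new_tokens (illustrative, not the exact library code):

if generation_config.min_new_tokens is not None:
    generation_config.min_length = generation_config.min_new_tokens + input_ids.shape[-1]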

Member Author (@zucchini-nlp) · Mar 13, 2024:

I decided to make a separate method for all length-related corrections, and added one more test for min length, same as we have for max length.
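
A hypothetical sketch of what such a helper could look like (the name and exact bounds here are assumptions, not necessarily the merged code):

def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> Tuple[int, int]:
    """Compute how many new tokens the assistant must and may generate."""
    new_cur_len = input_ids.shape[-1]
    # Leave room for the one extra token the target model generates.
    max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1)
    # Clamp at zero so an already-satisfied minimum never goes negative.
    min_new_tokens = max(min(max_new_tokens, self.min_length - new_cur_len), 0)
    return min_new_tokens, max_new_tokens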

if max_new_tokens == 0:
return input_ids, None

@@ -195,6 +203,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor,
# 2. Forecast next N tokens using the assistant model.
assistant_generation_kwargs = {
self.input_ids_key: input_ids,
"min_new_tokens": min_new_tokens,
"max_new_tokens": max_new_tokens,
"generation_config": self.generation_config,
"logits_processor": self.logits_processor,
2 changes: 1 addition & 1 deletion src/transformers/generation/utils.py
@@ -1501,7 +1501,7 @@ def generate(
)

# 12. run assisted generate
- result = self.assisted_decoding(
+ result = self._assisted_decoding(
Member:

good catch, this was surely throwing a deprecation warning 👍

input_ids,
candidate_generator=candidate_generator,
do_sample=generation_config.do_sample,
22 changes: 22 additions & 0 deletions tests/generation/test_utils.py
@@ -3252,6 +3252,28 @@ def test_default_max_length_warning(self):
model.generate(input_ids)
self.assertEqual(len(warning_list), 0)

def test_length_warning_assisted_generation(self):
Collaborator:

There should also be a test that the min_new_tokens parameter behaves as expected, especially when max_new_tokens is also set and when it's not set at all, i.e. falls back to its default value.

Member Author:

Added one more test checking that the output length falls in the range between the min and max lengths.
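
Roughly the kind of assertion such a test would make (a sketch; the actual test in the PR may differ):

out = model.generate(input_ids, assistant_model=assistant, min_new_tokens=2, max_new_tokens=20)
new_tokens = out.shape[-1] - input_ids.shape[-1]
self.assertTrue(2 <= new_tokens <= 20)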

# PT-only test: TF doesn't support assisted decoding yet.
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model.config.pad_token_id = tokenizer.eos_token_id
assistant.config.pad_token_id = tokenizer.eos_token_id

text = "Hello world"
tokenized_inputs = tokenizer([text], return_tensors="pt")
input_ids = tokenized_inputs.input_ids.to(torch_device)

# This should not raise any warning that min length is not feasible in candidate generation
with warnings.catch_warnings(record=True) as warning_list:
model.generate(
input_ids,
assistant_model=assistant,
min_new_tokens=10,
max_length=20,
)
self.assertEqual(len(warning_list), 0)

def test_model_kwarg_assisted_decoding_decoder_only(self):
# PT-only test: TF doesn't support assisted decoding yet.
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device)