diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 9f7fe8b0bfb1..41d78f1289df 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -381,9 +381,11 @@ def prepare_inputs_for_generation(
         # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
         # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
         # (we can't check exception 3 while compiling)
+        # Exception 4: if `inputs_embeds` are passed, slice them via `cache_position` to keep only the unprocessed tokens,
+        # generate the first token for each sequence, and then continue generation from the produced `input_ids`.
         if past_key_values is not None:
             model_inputs["past_key_values"] = past_key_values
-        if inputs_embeds is not None and input_ids.shape[1] == 0:
+        if inputs_embeds is not None and input_ids.shape[1] == 0:  # Exception 4
            inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :]
         elif (
             inputs_embeds is not None  # Exception 1
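
Below is a minimal, self-contained sketch of what the "Exception 4" branch above does on the first generation step. The tensor shapes and variable values are illustrative only and are not taken from the library internals; only the slicing expression mirrors the diff.

```python
import torch

batch_size, prompt_len, hidden_size = 2, 5, 8

# First generation step with embeddings supplied directly by the caller:
# there are no prompt token ids yet, so the id sequence has length 0.
input_ids = torch.empty(batch_size, 0, dtype=torch.long)
inputs_embeds = torch.randn(batch_size, prompt_len, hidden_size)

# cache_position lists the positions that still need to be processed.
# With an empty cache, that is every prompt position.
cache_position = torch.arange(prompt_len)

if inputs_embeds is not None and input_ids.shape[1] == 0:  # Exception 4
    # Keep only the unprocessed trailing positions. With a fresh cache this
    # is a no-op; with a prefilled cache it drops the already-processed prefix.
    inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :]

print(inputs_embeds.shape)  # torch.Size([2, 5, 8])
```

After this first forward pass produces a token per sequence, subsequent steps fall back to the generated `input_ids` rather than the original embeddings, which is what the second half of the new comment describes.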