Minicpm patch (huggingface#1567)
Signed-off-by: Daniel Huang <daniel1.huang@intel.com>
pi314ever authored and Liangyx2 committed Jan 20, 2025
1 parent 30beb3e commit 0f2767b
Showing 1 changed file with 5 additions and 6 deletions.
examples/text-generation/utils.py (11 changes: 5 additions & 6 deletions)

@@ -606,13 +606,12 @@ def setup_tokenizer(args, model, assistant_model, logger):
 tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id)
 tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id)

-# HACK: MiniCPM3 has multiple eos_tokens and does not specify padding token. Set both to second one.
-if model.config.model_type == "minicpm3":
-    tokenizer.pad_token = tokenizer.eos_token
-    model.generation_config.pad_token_id = model.generation_config.eos_token_id[-1]
+# HACK: MiniCPM3 does not support list EOS token ID generation config.
+if model.config.model_type == "minicpm3" and isinstance(model.generation_config.eos_token_id, list):
+    logger.warning(
+        f"Model type {model.config.model_type} does not support list style EOS token ID in generation config. Only last eos token id will be used."
+    )
     model.generation_config.eos_token_id = model.generation_config.eos_token_id[-1]
-    if len(model.generation_config.eos_token_id) > 1:
-        logger.warning("Multiple EOS token IDs found. Only last eos token id will be used.")

 # Some models like GPT2 do not have a PAD token so we have to set it if necessary
 if tokenizer.pad_token is None:
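For context on what the patched branch now does: when the generation config carries a list of EOS token IDs, the hack keeps only the last one and leaves padding to the generic pad-token fallback further down in setup_tokenizer. A minimal standalone sketch of that collapse follows; the EOS IDs used here are hypothetical placeholders, not MiniCPM3's real token IDs.

import logging

from transformers import GenerationConfig

logger = logging.getLogger(__name__)

# Hypothetical list-style EOS IDs; the real values come from the model's own generation config.
generation_config = GenerationConfig(eos_token_id=[2, 73440])

if isinstance(generation_config.eos_token_id, list):
    logger.warning("List style EOS token ID in generation config is not supported; keeping only the last one.")
    generation_config.eos_token_id = generation_config.eos_token_id[-1]

assert generation_config.eos_token_id == 73440  # only the last ID remains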
