Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bug fixes] update chatglm tokenizer #7797

Merged
merged 5 commits into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions llm/docs/inference.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,10 @@ PaddleNLP 中已经添加高性能推理模型相关实现,支持:
| [ChatGLM2](../chatglm2) | ✅ | ❌ | ❌ | ❌ |
| [Bloom](../bloom) | ✅ | ✅ | ✅ | ❌ |
| [GPT-3](../gpt-3) | ✅ | ❌ | ❌ | ❌ |
| [Qwen](../qwen) | ❌ | ❌ | ❌ | ❌ |
| [Qwen](../qwen) | ✅ | ❌ | ❌ | ❌ |
| [BaiChuan-7B](../llama) | ✅ | ✅ | ✅ | 🚧 |
| [BaiChuan-13B](../llama) | ❌ | ❌ | ❌ | ❌ |
| [BaiChuan2-7B](../llama) | ✅ | ✅ | ✅ | 🚧 |
| [BaiChuan2-13B](../llama) | 🚧 | 🚧 | 🚧 | 🚧 |

* ✅: Supported
* 🚧: In Progress
Expand Down
4 changes: 2 additions & 2 deletions llm/gradio_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def get_shown_context(context):
with gr.Column(scale=1):
top_k = gr.Slider(
minimum=0,
maximum=default_params.get("top_k", 20),
maximum=100,
value=0,
step=1,
label="Top-k",
Expand Down Expand Up @@ -218,7 +218,7 @@ def get_shown_context(context):
default_src_length = default_params["src_length"]
total_length = default_params["src_length"] + default_params["max_length"]
src_length = create_src_slider(default_src_length, total_length)
max_length = create_max_slider(50, total_length)
max_length = create_max_slider(min(total_length - default_src_length, 50), total_length)

def src_length_change_event(src_length_value, max_length_value):
return create_max_slider(
Expand Down
28 changes: 23 additions & 5 deletions llm/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,10 +165,18 @@ def get_eos_token_id(
Returns:
int | List[int]: eos_token_id to stop the generation
"""
if generation_config is None or generation_config.eos_token_id is None:
return tokenizer.eos_token_id
eos_token_ids = []
if tokenizer.eos_token_id is not None:
eos_token_ids.append(tokenizer.eos_token_id)

return generation_config.eos_token_id
if generation_config is not None and generation_config.eos_token_id is not None:
if isinstance(generation_config.eos_token_id, int):
eos_token_ids.append(generation_config.eos_token_id)
else:
eos_token_ids.extend(generation_config.eos_token_id)

eos_token_ids_dict = {str(item): item for item in eos_token_ids}
return list(eos_token_ids_dict.values())


class BasePredictor:
Expand Down Expand Up @@ -959,8 +967,18 @@ def predict():
with open(model_args.data_file, "r", encoding="utf-8") as f:
for line in f:
example = json.loads(line)
source_texts.append(example["src"])
target_texts.append(example["tgt"])
if isinstance(example["src"], str) or predictor.tokenizer.chat_template is None:
if isinstance(example["src"], str):
source_texts.append(example["src"])
target_texts.append(example["tgt"])
else:
# load multi-rounds dataset
source_texts.append(example["src"][0])
target_texts.append(example["tgt"][0])
else:
source_texts.append(list(zip(example["src"], example["tgt"])))
target_texts.append("")

else:
source_texts = ["解释一下“温故而知新”", "你好,请问你是谁?"]
target_texts = ["", ""]
Expand Down
41 changes: 36 additions & 5 deletions paddlenlp/transformers/chatglm_v2/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from __future__ import annotations

import os
import re
from typing import Any, Dict, List, Optional, Union

import numpy as np
Expand All @@ -36,16 +37,43 @@ def __init__(self, model_path: str):
self.pad_id: int = self.sp_model.unk_id()
assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"]
special_tokens = [
"[MASK]",
"[gMASK]",
"[sMASK]",
"sop",
"eop",
"<|system|>",
"<|user|>",
"<|assistant|>",
"<|observation|>",
]

self.special_tokens = {}
self.index_special_tokens = {}
for token in special_tokens:
self.special_tokens[token] = self.n_words
self.index_special_tokens[self.n_words] = token
self.n_words += 1

def tokenize(self, s: str):
    # NOTE(review): this is the pre-change diff side, superseded in this PR by
    # the overload that accepts `encode_special_tokens`; it simply delegates
    # the whole string to the sentencepiece model.
    return self.sp_model.EncodeAsPieces(s)
# add eos/pad/unk token to special_token_expression
all_special_tokens = list(self.special_tokens.keys()) + ["</s>", "<unk>"]
self.special_token_expression = "|".join([re.escape(token) for token in all_special_tokens])

def tokenize(self, s: str, encode_special_tokens=False):
    """Tokenize ``s`` into sentencepiece pieces.

    When ``encode_special_tokens`` is True, occurrences of the registered
    special tokens (e.g. ``[gMASK]``, ``<|user|>``, ``</s>``) are kept as
    single atomic pieces and only the plain text between them is run through
    the sentencepiece model; otherwise the whole string is encoded directly.
    """
    if encode_special_tokens:
        last_index = 0
        t = []
        # special_token_expression is a regex alternation of the escaped
        # special-token strings (built in __init__); scan for each occurrence.
        for match in re.finditer(self.special_token_expression, s):
            # Encode the plain text that precedes this special token.
            if last_index < match.start():
                t.extend(self.sp_model.EncodeAsPieces(s[last_index : match.start()]))
            # Keep the matched special token verbatim as one piece.
            t.append(s[match.start() : match.end()])
            last_index = match.end()
        # Encode any trailing plain text after the last special token.
        if last_index < len(s):
            t.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
        return t
    else:
        return self.sp_model.EncodeAsPieces(s)

def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
assert type(s) is str
Expand Down Expand Up @@ -85,7 +113,8 @@ class ChatGLMv2Tokenizer(PretrainedTokenizer):
}
}

def __init__(self, vocab_file, padding_side="left", **kwargs):
# always encode special tokens, eg: </s>, [gMASK], [MASK] ...
def __init__(self, vocab_file, padding_side="left", encode_special_tokens=True, **kwargs):
super().__init__(padding_side=padding_side, **kwargs)
self.name = "ChatGLMv2Tokenizer"

Expand All @@ -94,8 +123,10 @@ def __init__(self, vocab_file, padding_side="left", **kwargs):
self.special_tokens = {
"<bos>": self.tokenizer.bos_id,
"<eos>": self.tokenizer.eos_id,
"<unk>": self.tokenizer.pad_id,
"<pad>": self.tokenizer.pad_id,
}
self.encode_special_tokens = encode_special_tokens

def get_command(self, token):
if token in self.special_tokens:
Expand Down Expand Up @@ -130,7 +161,7 @@ def get_vocab(self):
return vocab

def _tokenize(self, text, **kwargs):
    """Tokenize ``text`` with the underlying sentencepiece tokenizer.

    Forwards ``self.encode_special_tokens`` so that registered special tokens
    such as ``[gMASK]`` or ``<|user|>`` are kept as atomic pieces when that
    flag is set (the whole point of this PR's change).
    """
    # Bug fix: the diff residue left a stale plain `tokenize(text)` return
    # above the new call, making the special-token-aware path unreachable.
    # Only the intended call is kept.
    return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)

def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
Expand Down
6 changes: 6 additions & 0 deletions paddlenlp/transformers/tokenizer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,8 @@ def __call__(self, conversations: list[list[str]] | str, context_data: Dict[str,
final_query = self.render_system(context_data=context_data)
context_data["length"] = len(conversations)
for index, conversation in enumerate(conversations[:-1]):
context_data["is_first"] = index == 0
context_data["is_last"] = False
final_query += "".join(self.render_conversation(conversation, index=index, context_data=context_data))

if not isinstance(conversations[-1], list) and not len(conversations[-1]) != 1:
Expand Down Expand Up @@ -668,6 +670,10 @@ def encode_chat_inputs(self, conversations: List[List[str, str]], context_data:
# encode conversation
conversation_ids = []
for index, conversation in enumerate(conversations):
# give more control to chat_template
context_data["is_first"] = index == 0
context_data["is_last"] = index == len(conversations) - 1

user_input, bot_output = self.chat_template.render_conversation(
conversation, index=index, context_data=context_data
)
Expand Down
57 changes: 57 additions & 0 deletions tests/transformers/chatglm_v2/test_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from paddlenlp.transformers.chatglm_v2.tokenizer import ChatGLMv2Tokenizer
from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer

# Resource-file mapping for the tokenizer under test.
# NOTE(review): this constant is not referenced by the visible tests, and
# ChatGLMv2 ships a sentencepiece model rather than a "vocab.json" — this
# looks copied from another tokenizer's test module; confirm or remove.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
}


class ChatGLMv2TokenizationTest(unittest.TestCase):
    """Special-token handling tests for ChatGLMv2Tokenizer (THUDM/chatglm2-6b)."""

    tokenizer_class = ChatGLMv2Tokenizer
    # NOTE(review): these class attributes follow the shared tokenizer-test
    # mixin convention, but with a plain unittest.TestCase nothing visible
    # consumes them — confirm whether the common test mixin was intended.
    test_decode_token = True

    def get_tokenizer(self, **kwargs) -> PretrainedTokenizer:
        # Loads the pretrained chatglm2-6b tokenizer (downloads on first run,
        # so this test requires network access or a warm cache).
        tokenizer = ChatGLMv2Tokenizer.from_pretrained("THUDM/chatglm2-6b", **kwargs)
        return tokenizer

    def test_encode_special_tokens(self):
        tokenizer = self.get_tokenizer()

        # "[gMASK]" and "</s>" must each remain a single atomic token,
        # so the query tokenizes/encodes to exactly two pieces.
        query = "[gMASK]</s>"
        tokens = tokenizer.tokenize(query)
        self.assertEqual(len(tokens), 2)

        outputs = tokenizer.encode(query, add_special_tokens=False)
        self.assertEqual(len(outputs["input_ids"]), 2)


class ChatGLMv3TokenizationTest(unittest.TestCase):
    """Runs ChatGLMv2Tokenizer against the chatglm3-6b vocab, which adds the
    role special tokens (<|system|>, <|user|>, <|assistant|>, <|observation|>)."""

    tokenizer_class = ChatGLMv2Tokenizer

    def get_tokenizer(self, **kwargs) -> PretrainedTokenizer:
        # chatglm3 reuses the v2 tokenizer class with an extended vocabulary
        # (network access or a warm cache required).
        return ChatGLMv2Tokenizer.from_pretrained("THUDM/chatglm3-6b", **kwargs)

    def test_encode_special_tokens(self):
        tokenizer = self.get_tokenizer()

        # Four registered special tokens back-to-back must each stay atomic,
        # yielding exactly four pieces.
        query = "[gMASK]<|user|><|assistant|></s>"
        tokens = tokenizer.tokenize(query)
        self.assertEqual(len(tokens), 4)
12 changes: 11 additions & 1 deletion tests/transformers/test_generation_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1069,7 +1069,17 @@ def test_get_unfinished_flag(self):
self.assertEqual(unfinish_flag.reshape([2]).tolist(), [False, True])

eos_token_id = [7, 11, 3]
unfinish_flag = paddle.to_tensor([[False], [False]], dtype="bool")
unfinish_flag = paddle.to_tensor([[True], [True]], dtype="bool")
unfinish_flag = get_unfinished_flag(input_ids, unfinish_flag, eos_token_id)
self.assertEqual(unfinish_flag.reshape([2]).tolist(), [False, False])

eos_token_id = [[7], [11], [3]]
unfinish_flag = paddle.to_tensor([[True], [True]], dtype="bool")
unfinish_flag = get_unfinished_flag(input_ids, unfinish_flag, eos_token_id)
self.assertEqual(unfinish_flag.reshape([2]).tolist(), [False, False])

eos_token_id = [7, [11], [3]]
unfinish_flag = paddle.to_tensor([[True], [True]], dtype="bool")
unfinish_flag = get_unfinished_flag(input_ids, unfinish_flag, eos_token_id)
self.assertEqual(unfinish_flag.reshape([2]).tolist(), [False, False])

Expand Down