From 75f08e49c220d24a875c4ed54617c60a8edc3d97 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Fri, 31 Jan 2025 13:13:30 +0100
Subject: [PATCH] update Readme and tests

---
 src/README.md                                | 69 ++++++++++++++++++--
 src/cpp/include/openvino/genai/tokenizer.hpp |  1 -
 src/cpp/src/make_tokenizer_stateful.hpp      |  8 +--
 src/cpp/src/tokenizer.cpp                    |  2 -
 tests/python_tests/test_tokenizer.py         | 27 +++-----
 5 files changed, 77 insertions(+), 30 deletions(-)

diff --git a/src/README.md b/src/README.md
index 403e0a01fe..f3ac937024 100644
--- a/src/README.md
+++ b/src/README.md
@@ -400,18 +400,18 @@ For more examples of how metrics are used, please refer to the Python [benchmark
 
 ### Tokenization
 
-OpenVINO™ GenAI provides a simple way to tokenize and detokenize text using the `ov::genai::Tokenizer` class. The `Tokenizer` is a high level abstraction over the OpenVINO Tokenizers library, which is used to tokenize and detokenize text. The `Tokenizer` class provides a simple interface to tokenize and detokenize text.
+OpenVINO™ GenAI provides a way to tokenize and detokenize text using the `ov::genai::Tokenizer` class. The `Tokenizer` is a high-level abstraction over the OpenVINO Tokenizers library.
 
-It can be initialized from the path as a string or from the `ov::genai::LLMPipeline` object.
+It can be initialized from a path, from an in-memory IR representation, or obtained from an `ov::genai::LLMPipeline` object.
 
 ```cpp
+// Initialize from the path.
 #include "openvino/genai/llm_pipeline.hpp"
 auto tokenizer = ov::genai::Tokenizer(models_path);
 
-// Or from LLMPipeline.
+// Get an instance of Tokenizer from LLMPipeline.
 auto pipe = ov::genai::LLMPipeline pipe(models_path, "CPU");
 auto tokenzier = pipe.get_tokenizer();
-
 ````
 
 ```python
 import openvino_genai as ov_genai
 tokenizer = ov_genai.Tokenizer(models_path)
 
 # Or from LLMPipeline.
 pipe = ov_genai.LLMPipeline(models_path, "CPU")
 tokenizer = pipe.get_tokenizer()
 ```
@@ -425,7 +425,66 @@ tokenizer = pipe.get_tokenizer()
 
 `Tokenizer` has `encode` and `decode` methods which support the following arguments: `add_special_tokens`, `skip_special_tokens`, `padding_mode`, `max_length` arguments.
 
-For padding use
+In order to disable adding special tokens, do the following, in C++:
+```cpp
+auto tokens = tokenizer.encode("The Sun is yellow because", ov::genai::add_special_tokens(false));
+```
+
+In Python:
+```python
+tokens = tokenizer.encode("The Sun is yellow because", add_special_tokens=False)
+```
+
+Example of getting padded outputs:
+```cpp
+#include "openvino/genai/llm_pipeline.hpp"
+auto tokenizer = ov::genai::Tokenizer(models_path);
+std::vector<std::string> prompts = {"The Sun is yellow because", "The"};
+
+// With LONGEST, strings are padded to the length of the longest sequence in the batch; max_length=1024 is ignored.
+auto tokens = tokenizer.encode(prompts, ov::genai::padding_mode(ov::genai::PaddingMode::LONGEST), ov::genai::max_length(1024));
+// out_shape: [2, 6]
+
+// With MAX_LENGTH, the strings are short, so the resulting token sequences are padded to max_length=1024.
+tokens = tokenizer.encode(prompts, ov::genai::padding_mode(ov::genai::PaddingMode::MAX_LENGTH), ov::genai::max_length(1024));
+// out_shape: [2, 1024]
+
+// With TRUNCATE, the long string is truncated to max_length=1024,
+// and the shorter string is padded to the same 1024 length.
+std::string long_prompt;
+for (size_t i = 0; i < 1000; ++i) long_prompt += "This is THE LONGEST string ever of all times ";
+std::vector<std::string> mixed_prompts = {long_prompt, "The"};
+tokens = tokenizer.encode(mixed_prompts, ov::genai::padding_mode(ov::genai::PaddingMode::TRUNCATE),
+                          ov::genai::max_length(1024));
+// out_shape: [2, 1024]
+```
+
+```python
+import openvino_genai as ov_genai
+
+tokenizer = ov_genai.Tokenizer(models_path)
+prompts = ["The Sun is yellow because", "The"]
+
+# Strings are padded to the length of the longest sequence in the batch; max_length=1024 is ignored.
+tokens = tokenizer.encode(prompts, padding_mode=ov_genai.PaddingMode.LONGEST, max_length=1024)
+print(tokens.input_ids.shape)
+# out_shape: [2, 6]
+
+# Strings are short, so the resulting token sequences are padded to max_length=1024.
+tokens = tokenizer.encode(prompts, padding_mode=ov_genai.PaddingMode.MAX_LENGTH, max_length=1024)
+print(tokens.input_ids.shape)
+# out_shape: [2, 1024]
+
+# The very long string is truncated to max_length=1024,
+# and the shorter string is padded to the same 1024 length.
+tokens = tokenizer.encode(
+    ["This is THE LONGEST string ever of all times " * 1000, "The"],
+    padding_mode=ov_genai.PaddingMode.TRUNCATE,
+    max_length=1024)
+print(tokens.input_ids.shape)
+# out_shape: [2, 1024]
+```
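+
+Special tokens can also be kept in the decoded text by disabling `skip_special_tokens` (a minimal sketch; `decode` accepts the same argument style as `encode`):
+```python
+import openvino_genai as ov_genai
+
+tokenizer = ov_genai.Tokenizer(models_path)
+tokens = tokenizer.encode("The Sun is yellow because")
+
+# By default special tokens are skipped during decoding;
+# pass skip_special_tokens=False to keep them (e.g. <bos>) in the output text.
+texts = tokenizer.decode(tokens.input_ids, skip_special_tokens=False)
+print(texts[0])
+```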
+
 ## How It Works
 
 For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](./docs/HOW_IT_WORKS.md).
diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 642097de6b..077b162674 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -271,6 +271,5 @@ static constexpr ov::Property<bool> add_special_tokens{"add_special_tokens"};
 static constexpr ov::Property<bool> skip_special_tokens{"skip_special_tokens"};
 static constexpr ov::Property<PaddingMode> padding_mode{"padding_mode"};
 
-
 } // namespace genai
 } // namespace ov
diff --git a/src/cpp/src/make_tokenizer_stateful.hpp b/src/cpp/src/make_tokenizer_stateful.hpp
index e8fec1b416..1238cef713 100644
--- a/src/cpp/src/make_tokenizer_stateful.hpp
+++ b/src/cpp/src/make_tokenizer_stateful.hpp
@@ -95,10 +95,10 @@ class MakeVocabDecoderSatateful : public ov::pass::ModelPass {
     bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
 };
 
-const std::string ADD_SPECIAL_TOKENS_VAR_ID = "add_special_tokens";
-const std::string SKIP_SPECIAL_TOKENS_VAR_ID = "skip_special_tokens";
-const std::string MAX_PAD_LENGTH_VAR_ID = "max_pad_length";
-const std::string MAX_TRUNCATION_LENGTH_VAR_ID = "max_truncation_length";
+inline const std::string ADD_SPECIAL_TOKENS_VAR_ID = "add_special_tokens";
+inline const std::string SKIP_SPECIAL_TOKENS_VAR_ID = "skip_special_tokens";
+inline const std::string MAX_PAD_LENGTH_VAR_ID = "max_pad_length";
+inline const std::string MAX_TRUNCATION_LENGTH_VAR_ID = "max_truncation_length";
 
 } // namespace genai
 } // namespace ov
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index 2aa415b103..55290ab05a 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -205,11 +205,9 @@ class Tokenizer::TokenizerImpl {
 
         if (ov_tokenizer) {
             ov::pass::Manager manager;
-            manager.register_pass<ov::pass::VisualizeTree>("before.svg");
             manager.register_pass();
             manager.register_pass();
             manager.register_pass();
-            manager.register_pass<ov::pass::VisualizeTree>("after.svg");
             manager.run_passes(ov_tokenizer);
             m_tokenizer = core.compile_model(ov_tokenizer, device, properties);
             ov::genai::utils::print_compiled_model_properties(m_tokenizer, "OV Tokenizer");
diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py
index 734932a3ee..85db8f4a96 100644
--- a/tests/python_tests/test_tokenizer.py
+++ b/tests/python_tests/test_tokenizer.py
@@ -258,10 +258,11 @@ def test_encode_decode_with_special_tokens_option(prompt):
     assert decoded_hf_skip_spec != decoded_hf_no_skip
 
 prompts = [
-    # ['1+1=', 'What is the previous answer?'],
-    # 'What is the previous answers? ' * 1000, # long sentence exceeding max_length
-    # 'what', # test that short sentence is padded to long
-    [ # chech that large bathc with multilangual data is correctly padded
+    ['1+1=', 'What is the previous answer?'],
+    'What is the previous answers? ' * 1000,  # long sentence exceeding max_length; check that it is truncated
+    'what',  # check that a short sentence is padded to the longest
+    # check that a large batch with multilingual data is correctly padded
+    [
         '1+1=',
         'What is the previous answer?',
         'Why is the Sun yellow?',
@@ -286,10 +287,11 @@ def test_padding(add_special_tokens, max_length, pad_mode, prompt):
     # to the longest sequence in the batch since resulting tokenization is stored as a signe ov::Tensor
     # which cannot store irregular/ragged array.
     # Therefore, for the truncation mode need to sete padding to 'longest' and truncation=True.
+    # Also, in MAX_LENGTH mode truncation is applied as well so that all sequences in the batch have the same length.
     pad_modes_map = {
         PaddingMode.TRUNCATE: dict(padding="longest", truncation=True),
         PaddingMode.LONGEST: dict(padding="longest"),
-        PaddingMode.MAX_LENGTH: dict(padding="max_length"),
+        PaddingMode.MAX_LENGTH: dict(padding="max_length", truncation=True),
     }
     hf_pad_truncation_modes = pad_modes_map[pad_mode]
 
@@ -299,19 +301,8 @@ def test_padding(add_special_tokens, max_length, pad_mode, prompt):
     ov_res = genai_tokenzier.encode(prompt, **ov_params)
     hf_res = hf_tokenizer(prompt, return_tensors="np", **hf_params)
 
-    # HF instead of a single blob of data gives a list of numpy with different sizes
-    # Some are padded to max_len some exceed.
-    # Since openvino_tokenizers cannot store ragged arrays, we compare
-    # them individually.
-    if max_length < 64 and isinstance(prompt, list) and len(prompt) > 2 and pad_mode == PaddingMode.MAX_LENGTH:
-        for field_name in "input_ids", "attention_mask":
-            for i in range(len(hf_res[field_name])):
-                ov_data = getattr(ov_res, field_name).data[i]
-                assert np.all(ov_data == hf_res[field_name][i][:len(ov_data)])
-    else:
-        # regular comparision
-        assert np.all(ov_res.input_ids.data == hf_res["input_ids"])
-        assert np.all(ov_res.attention_mask.data == hf_res["attention_mask"])
+    assert np.all(ov_res.input_ids.data == hf_res["input_ids"])
+    assert np.all(ov_res.attention_mask.data == hf_res["attention_mask"])
 
 
 @pytest.mark.precommit