From 75f08e49c220d24a875c4ed54617c60a8edc3d97 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Fri, 31 Jan 2025 13:13:30 +0100
Subject: [PATCH] update Readme and tests

---
 src/README.md                                | 69 ++++++++++++++++++--
 src/cpp/include/openvino/genai/tokenizer.hpp |  1 -
 src/cpp/src/make_tokenizer_stateful.hpp      |  8 +--
 src/cpp/src/tokenizer.cpp                    |  2 -
 tests/python_tests/test_tokenizer.py         | 27 +++-----
 5 files changed, 77 insertions(+), 30 deletions(-)

diff --git a/src/README.md b/src/README.md
index 403e0a01fe..f3ac937024 100644
--- a/src/README.md
+++ b/src/README.md
@@ -400,18 +400,18 @@ For more examples of how metrics are used, please refer to the Python [benchmark
 
 ### Tokenization
 
-OpenVINO™ GenAI provides a simple way to tokenize and detokenize text using the `ov::genai::Tokenizer` class. The `Tokenizer` is a high level abstraction over the OpenVINO Tokenizers library, which is used to tokenize and detokenize text. The `Tokenizer` class provides a simple interface to tokenize and detokenize text.
+OpenVINO™ GenAI provides a way to tokenize and detokenize text using the `ov::genai::Tokenizer` class. The `Tokenizer` is a high-level abstraction over the OpenVINO Tokenizers library.
 
-It can be initialized from the path as a string or from the `ov::genai::LLMPipeline` object.
+It can be initialized from a path, from an in-memory IR representation, or obtained from an `ov::genai::LLMPipeline` object.
 
 ```cpp
+// Initialize from the path.
 #include "openvino/genai/llm_pipeline.hpp"
 auto tokenizer = ov::genai::Tokenizer(models_path);
 
-// Or from LLMPipeline.
+// Get an instance of Tokenizer from LLMPipeline.
 auto pipe = ov::genai::LLMPipeline pipe(models_path, "CPU");
 auto tokenzier = pipe.get_tokenizer();
-
 ````
 
 ```python
 import openvino_genai as ov_genai
 tokenizer = ov_genai.Tokenizer(models_path)
 
 # Or from LLMPipeline.
 pipe = ov_genai.LLMPipeline(models_path, "CPU")
 tokenizer = pipe.get_tokenizer()
 ```
@@ -425,7 +425,66 @@ tokenizer = pipe.get_tokenizer()
 
 `Tokenizer` has `encode` and `decode` methods which support the following arguments: `add_special_tokens`, `skip_special_tokens`, `padding_mode`, `max_length` arguments.
 
-For padding use
+In order to disable adding special tokens, do the following, in C++:
+```cpp
+auto tokens = tokenizer.encode("The Sun is yellow because", ov::genai::add_special_tokens(false));
+```
+
+In Python:
+```python
+tokens = tokenizer.encode("The Sun is yellow because", add_special_tokens=False)
+```
+
+Example of getting padded outputs:
+```cpp
+#include "openvino/genai/llm_pipeline.hpp"
+auto tokenizer = ov::genai::Tokenizer(models_path);
+std::vector<std::string> prompts = {"The Sun is yellow because", "The"};
+
+// With LONGEST, strings are padded to the length of the longest sequence in the batch; max_length=1024 is ignored.
+auto tokens = tokenizer.encode(prompts, ov::genai::padding_mode(ov::genai::PaddingMode::LONGEST), ov::genai::max_length(1024));
+// out_shape: [2, 6]
+
+// With MAX_LENGTH, the strings are short, so the resulting token sequences are padded to max_length=1024.
+tokens = tokenizer.encode(prompts, ov::genai::padding_mode(ov::genai::PaddingMode::MAX_LENGTH), ov::genai::max_length(1024));
+// out_shape: [2, 1024]
+
+// With TRUNCATE, the long string is truncated to max_length=1024,
+// and the shorter string is padded to the same 1024 length.
+std::string long_prompt;
+for (size_t i = 0; i < 1000; ++i) long_prompt += "This is THE LONGEST string ever of all times ";
+std::vector<std::string> mixed_prompts = {long_prompt, "The"};
+tokens = tokenizer.encode(mixed_prompts, ov::genai::padding_mode(ov::genai::PaddingMode::TRUNCATE),
+                          ov::genai::max_length(1024));
+// out_shape: [2, 1024]
+```
+
+```python
+import openvino_genai as ov_genai
+
+tokenizer = ov_genai.Tokenizer(models_path)
+prompts = ["The Sun is yellow because", "The"]
+
+# Strings are padded to the length of the longest sequence in the batch; max_length=1024 is ignored.
+tokens = tokenizer.encode(prompts, padding_mode=ov_genai.PaddingMode.LONGEST, max_length=1024)
+print(tokens.input_ids.shape)
+# out_shape: [2, 6]
+
+# Strings are short, so the resulting token sequences are padded to max_length=1024.
+tokens = tokenizer.encode(prompts, padding_mode=ov_genai.PaddingMode.MAX_LENGTH, max_length=1024)
+print(tokens.input_ids.shape)
+# out_shape: [2, 1024]
+
+# The very long string is truncated to max_length=1024,
+# and the shorter string is padded to the same 1024 length.
+tokens = tokenizer.encode(
+    ["This is THE LONGEST string ever of all times " * 1000, "The"],
+    padding_mode=ov_genai.PaddingMode.TRUNCATE,
+    max_length=1024)
+print(tokens.input_ids.shape)
+# out_shape: [2, 1024]
+```
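+
+Special tokens can also be kept in the decoded text by disabling `skip_special_tokens` (a minimal sketch; `decode` accepts the same argument style as `encode`):
+```python
+import openvino_genai as ov_genai
+
+tokenizer = ov_genai.Tokenizer(models_path)
+tokens = tokenizer.encode("The Sun is yellow because")
+
+# By default special tokens are skipped during decoding;
+# pass skip_special_tokens=False to keep them (e.g. <bos>) in the output text.
+texts = tokenizer.decode(tokens.input_ids, skip_special_tokens=False)
+print(texts[0])
+```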
+
 ## How It Works
 
 For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](./docs/HOW_IT_WORKS.md).
diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 642097de6b..077b162674 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -271,6 +271,5 @@ static constexpr ov::Property<bool> add_special_tokens{"add_special_tokens"};
 static constexpr ov::Property<bool> skip_special_tokens{"skip_special_tokens"};
 static constexpr ov::Property<PaddingMode> padding_mode{"padding_mode"};
 
-
 } // namespace genai
 } // namespace ov
diff --git a/src/cpp/src/make_tokenizer_stateful.hpp b/src/cpp/src/make_tokenizer_stateful.hpp
index e8fec1b416..1238cef713 100644
--- a/src/cpp/src/make_tokenizer_stateful.hpp
+++ b/src/cpp/src/make_tokenizer_stateful.hpp
@@ -95,10 +95,10 @@ class MakeVocabDecoderSatateful : public ov::pass::ModelPass {
     bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
 };
 
-const std::string ADD_SPECIAL_TOKENS_VAR_ID = "add_special_tokens";
-const std::string SKIP_SPECIAL_TOKENS_VAR_ID = "skip_special_tokens";
-const std::string MAX_PAD_LENGTH_VAR_ID = "max_pad_length";
-const std::string MAX_TRUNCATION_LENGTH_VAR_ID = "max_truncation_length";
+inline const std::string ADD_SPECIAL_TOKENS_VAR_ID = "add_special_tokens";
+inline const std::string SKIP_SPECIAL_TOKENS_VAR_ID = "skip_special_tokens";
+inline const std::string MAX_PAD_LENGTH_VAR_ID = "max_pad_length";
+inline const std::string MAX_TRUNCATION_LENGTH_VAR_ID = "max_truncation_length";
 
 } // namespace genai
 } // namespace ov
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index 2aa415b103..55290ab05a 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -205,11 +205,9 @@ class Tokenizer::TokenizerImpl {
 
         if (ov_tokenizer) {
             ov::pass::Manager manager;
-            manager.register_pass<ov::pass::VisualizeTree>("before.svg");
             manager.register_pass();
             manager.register_pass();
             manager.register_pass();
-            manager.register_pass<ov::pass::VisualizeTree>("after.svg");
             manager.run_passes(ov_tokenizer);
             m_tokenizer = core.compile_model(ov_tokenizer, device, properties);
             ov::genai::utils::print_compiled_model_properties(m_tokenizer, "OV Tokenizer");
diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py
index 734932a3ee..85db8f4a96 100644
--- a/tests/python_tests/test_tokenizer.py
+++ b/tests/python_tests/test_tokenizer.py
@@ -258,10 +258,11 @@ def test_encode_decode_with_special_tokens_option(prompt):
     assert decoded_hf_skip_spec != decoded_hf_no_skip
 
 prompts = [
-    # ['1+1=', 'What is the previous answer?'],
-    # 'What is the previous answers? ' * 1000, # long sentence exceeding max_length
-    # 'what', # test that short sentence is padded to long
-    [ # chech that large bathc with multilangual data is correctly padded
+    ['1+1=', 'What is the previous answer?'],
+    'What is the previous answers? ' * 1000,  # long sentence exceeding max_length; check that it is truncated
+    'what',  # check that a short sentence is padded to the longest
+    # check that a large batch with multilingual data is correctly padded
+    [
         '1+1=',
         'What is the previous answer?',
         'Why is the Sun yellow?',
@@ -286,10 +287,11 @@ def test_padding(add_special_tokens, max_length, pad_mode, prompt):
     # to the longest sequence in the batch since resulting tokenization is stored as a signe ov::Tensor
     # which cannot store irregular/ragged array.
     # Therefore, for the truncation mode need to sete padding to 'longest' and truncation=True.
+    # Also, in MAX_LENGTH mode truncation is applied as well so that all sequences in the batch have the same length.
     pad_modes_map = {
         PaddingMode.TRUNCATE: dict(padding="longest", truncation=True),
         PaddingMode.LONGEST: dict(padding="longest"),
-        PaddingMode.MAX_LENGTH: dict(padding="max_length"),
+        PaddingMode.MAX_LENGTH: dict(padding="max_length", truncation=True),
     }
     hf_pad_truncation_modes = pad_modes_map[pad_mode]
 
@@ -299,19 +301,8 @@ def test_padding(add_special_tokens, max_length, pad_mode, prompt):
     ov_res = genai_tokenzier.encode(prompt, **ov_params)
     hf_res = hf_tokenizer(prompt, return_tensors="np", **hf_params)
 
-    # HF instead of a single blob of data gives a list of numpy with different sizes
-    # Some are padded to max_len some exceed.
-    # Since openvino_tokenizers cannot store ragged arrays, we compare
-    # them individually.
-    if max_length < 64 and isinstance(prompt, list) and len(prompt) > 2 and pad_mode == PaddingMode.MAX_LENGTH:
-        for field_name in "input_ids", "attention_mask":
-            for i in range(len(hf_res[field_name])):
-                ov_data = getattr(ov_res, field_name).data[i]
-                assert np.all(ov_data == hf_res[field_name][i][:len(ov_data)])
-    else:
-        # regular comparision
-        assert np.all(ov_res.input_ids.data == hf_res["input_ids"])
-        assert np.all(ov_res.attention_mask.data == hf_res["attention_mask"])
+    assert np.all(ov_res.input_ids.data == hf_res["input_ids"])
+    assert np.all(ov_res.attention_mask.data == hf_res["attention_mask"])
 
 
 @pytest.mark.precommit