Skip to content

Commit

Permalink
significant improvements; extended tests; all regimes match
Browse files Browse the repository at this point in the history
  • Loading branch information
pavel-esir committed Jan 31, 2025
1 parent 85e3260 commit 57f26ae
Show file tree
Hide file tree
Showing 10 changed files with 209 additions and 138 deletions.
19 changes: 18 additions & 1 deletion src/cpp/include/openvino/genai/tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,24 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
std::shared_ptr<TokenizerImpl> m_pimpl;
};

enum class PaddingMode { TRUNCATE = 0, LONGEST = 1, MAX_LENGTH = 2, DO_NOT_PAD = 3, NONE = 4};
/**
 * @enum PaddingMode
 * @brief Enum class representing the padding modes available for tokenization.
 *
 * This enum class defines the modes that can be used to pad tokenized sequences.
 * IMPORTANT NOTICE: even in truncation mode, padding to the longest sequence in the
 * batch is still applied, since the resulting tokenization is stored as a single
 * ov::Tensor, which cannot hold an irregular/ragged array.
 *
 * @var PaddingMode::TRUNCATE
 * Truncate each sequence to the specified maximum length. (Padding to the longest
 * sequence in the batch is implicitly applied as well, as noted above.)
 *
 * @var PaddingMode::LONGEST
 * Pad each sequence to the length of the longest sequence in the batch.
 * In this mode truncation is switched off.
 *
 * @var PaddingMode::MAX_LENGTH
 * Pad each sequence to the specified maximum length; sequences longer than
 * max_length are truncated down to it.
 */
enum class PaddingMode { TRUNCATE, LONGEST, MAX_LENGTH };

static constexpr ov::Property<bool> add_special_tokens{"add_special_tokens"};
static constexpr ov::Property<bool> skip_special_tokens{"skip_special_tokens"};
Expand Down
62 changes: 44 additions & 18 deletions src/cpp/src/make_tokenizer_stateful.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@

#include "make_tokenizer_stateful.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/concat.hpp"
#include "openvino/op/select.hpp"
#include "openvino/op/maximum.hpp"
#include "openvino/op/minimum.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/subtract.hpp"
#include "openvino/op/slice.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/read_value.hpp"
Expand All @@ -16,8 +18,7 @@
using namespace ov;
using namespace ov::op;

bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) {

bool ov::genai::MakeAddSpecialTokensSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) {
std::shared_ptr<ov::Node> combine_seg_node;
for (auto node: model->get_ordered_ops()) {
if (strcmp(node->get_type_info().name, "CombineSegments") == 0) {
Expand Down Expand Up @@ -137,25 +138,28 @@ bool ov::genai::MakeTruncationSatateful::run_on_model(const std::shared_ptr<ov::
return false;
}

auto add = ov::as_type_ptr<ov::op::v1::Add>(combine_segments_node->input_value(4).get_node_shared_ptr());
if (!add) {
return false;
}
auto min_node = ov::as_type_ptr<ov::op::v1::Minimum>(add->get_input_node_shared_ptr(0));
if (!min_node) {
min_node = ov::as_type_ptr<ov::op::v1::Minimum>(add->get_input_node_shared_ptr(1));
}
if (!min_node) {
std::shared_ptr<Node> add_or_sub_node = combine_segments_node->input_value(4).get_node_shared_ptr();
// If Add then it's a right truncation, if Subtract then it's a left truncation.
if (!ov::as_type_ptr<v1::Add>(add_or_sub_node) && !ov::as_type_ptr<v1::Subtract>(add_or_sub_node)) {
// Exit if it's neither, because in that case it's not a truncation.
return false;
}

// auto pattern_2 = ov::pass::pattern::wrap_type<ov::op::v0::Constant>(ov::pass::pattern::rank_equals(1));
// auto unsqueeze = ov::pass::pattern::wrap_type<ov::op::v1::Reshape, ov::op::v0::Unsqueeze>({cell, pattern_2});
// ov::pass::pattern::Matcher matcher(unsqueeze);

// Minimum between max_length and length of token sequence.
auto min_node = ov::as_type_ptr<v1::Minimum>(add_or_sub_node->get_input_node_shared_ptr(1));
if (!min_node) { return false; }

auto const_node = ov::as_type_ptr<v0::Constant>(min_node->get_input_node_shared_ptr(0));
if (!const_node) {
const_node = ov::as_type_ptr<v0::Constant>(min_node->get_input_node_shared_ptr(1));
}
if (!const_node) {
return false;
}
// Node which subtracts from max_truncation_length number of added_tokens.
auto sub_node = ov::as_type_ptr<v1::Subtract>(min_node->get_input_node_shared_ptr(1));
if (!sub_node) { return false; }

// max_truncation_length constant containing final length at the end of pipeline.
auto const_node = ov::as_type_ptr<v0::Constant>(sub_node->get_input_node_shared_ptr(0));
if (!const_node) { return false; }

op::util::VariableInfo var_info{const_node->get_output_shape(0), const_node->get_output_element_type(0), MAX_TRUNCATION_LENGTH_VAR_ID};
auto variable = std::make_shared<op::util::Variable>(var_info);
Expand All @@ -168,6 +172,28 @@ bool ov::genai::MakeTruncationSatateful::run_on_model(const std::shared_ptr<ov::
target_input.replace_source_output(read_trunc_value->output(0));
}

// We need to check if user requested to not add special tokens.
std::shared_ptr<v6::ReadValue> read_value_spec_tokens;
for (const auto& sink : model->get_sinks()) {
// Check if the sink accepts input from an Assign, and if that's the case get the ReadValue node input.
if (auto read_value = ov::as_type_ptr<v6::ReadValue>(sink->get_input_node_shared_ptr(0))) {
if (read_value->get_variable()->get_info().variable_id == ADD_SPECIAL_TOKENS_VAR_ID) {
read_value_spec_tokens = read_value;
break;
}
}
}

// Constant which stores number of added_tokens.
auto num_added_tokens_const = ov::as_type_ptr<v0::Constant>(sub_node->get_input_node_shared_ptr(1));
// If the user requested not to add special tokens then, in order to compute truncation
// correctly, we must force num_added_tokens to 0 regardless of the hardcoded Constant value.
if (read_value_spec_tokens && num_added_tokens_const) {
auto zero_constant = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{}, std::vector{0});
auto select_node = std::make_shared<v1::Select>(read_value_spec_tokens, num_added_tokens_const, zero_constant);
sub_node->input(1).replace_source_output(select_node->output(0));
}

auto assign = std::make_shared<v6::Assign>(read_trunc_value, variable);
model->add_sinks({assign});
model->add_variables({variable});
Expand Down
7 changes: 4 additions & 3 deletions src/cpp/src/make_tokenizer_stateful.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "openvino/op/constant.hpp"
#include "openvino/pass/pass.hpp"
#include "openvino/pass/matcher_pass.hpp"

namespace ov {
namespace genai {
Expand Down Expand Up @@ -32,9 +33,9 @@ namespace genai {
* | CombineSegments |
* +-------------------------+
**/
class MakeCombineSegmentsSatateful : public ov::pass::ModelPass {
// Model pass that locates the CombineSegments node of the tokenizer graph and makes the
// "add special tokens" behavior switchable at runtime via a state variable instead of
// being baked into the graph (see the diagram above).
// NOTE(review): "Satateful" is a pre-existing typo; it is part of the public class name
// and the RTTI string, so renaming it here would break callers — fix repo-wide if desired.
class MakeAddSpecialTokensSatateful : public ov::pass::ModelPass {
public:
OPENVINO_MODEL_PASS_RTTI("MakeAddSpecialTokensSatateful");
// Rewrites `model` in place; returns true if the transformation was applied,
// false if the expected pattern was not found and the model is left unchanged.
bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
};

Expand Down Expand Up @@ -97,7 +98,7 @@ class MakeVocabDecoderSatateful : public ov::pass::ModelPass {
// Identifiers of the state variables injected by the passes above.
// Each one must be unique: MAX_TRUNCATION_LENGTH_VAR_ID previously aliased
// MAX_PAD_LENGTH_VAR_ID ("max_pad_length"), which made the two states
// indistinguishable when matching by variable id; it now has its own id.
const std::string ADD_SPECIAL_TOKENS_VAR_ID = "add_special_tokens";
const std::string SKIP_SPECIAL_TOKENS_VAR_ID = "skip_special_tokens";
const std::string MAX_PAD_LENGTH_VAR_ID = "max_pad_length";
const std::string MAX_TRUNCATION_LENGTH_VAR_ID = "max_truncation_length";

} // namespace genai
} // namespace ov
76 changes: 22 additions & 54 deletions src/cpp/src/tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,39 +23,6 @@

namespace {

// todo: remove when openvino-tokenizers will support left padding
// Converts a right-padded batch (pad slots at the end of each row) into a
// left-padded one, in place, by moving every real token to the right end of
// its row. A row whose last position is a real token is left untouched.
ov::genai::TokenizedInputs pad_left(ov::Tensor& input_ids, ov::Tensor& attention_mask) {
    const size_t num_rows = input_ids.get_shape()[0];
    const size_t row_len = input_ids.get_shape()[1];
    int64_t* ids_data = input_ids.data<int64_t>();
    int64_t* mask_data = attention_mask.data<int64_t>();

    for (size_t row = 0; row < num_rows; row++) {
        int64_t* row_ids = ids_data + row * row_len;
        int64_t* row_mask = mask_data + row * row_len;

        // Row is already left-padded (or has no padding) when the
        // rightmost slot holds a real token — nothing to do.
        if (row_mask[row_len - 1] == 1)
            continue;

        size_t shift = 0;
        for (size_t pos = row_len; pos-- > 0;) {
            // Pad slots stay where they are; only real tokens move.
            if (row_mask[pos] == 0)
                continue;

            // First real token seen from the right fixes how far every
            // real token must move to reach the end of the row.
            if (shift == 0)
                shift = row_len - pos - 1;

            std::swap(row_ids[pos], row_ids[pos + shift]);
            std::swap(row_mask[pos], row_mask[pos + shift]);
        }
    }

    return {input_ids, attention_mask};
}

void check_arguments(const ov::AnyMap& parameters, std::set<std::string> allowed_argnames) {
for (const auto& [key, value] : parameters) {
if (allowed_argnames.find(key) == allowed_argnames.end()) {
Expand Down Expand Up @@ -119,11 +86,7 @@ class Tokenizer::TokenizerImpl {
case PaddingMode::LONGEST:
return {std::numeric_limits<int32_t>::max(), 0};
case PaddingMode::MAX_LENGTH:
return {std::numeric_limits<int32_t>::max(), max_length};
case PaddingMode::DO_NOT_PAD:
// behaves exactly as longest
// TODO: need to find a way to disable padding automatically so that it will match to HF.
return {std::numeric_limits<int32_t>::max(), 0};
return {max_length, max_length};
default:
OPENVINO_THROW("Unknown padding mode");
}
Expand All @@ -132,8 +95,8 @@ class Tokenizer::TokenizerImpl {
void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
bool add_special_tokens_flag = m_add_special_tokens;
bool skip_special_tokens_flag = m_skip_special_tokens;
size_t max_length_val;
PaddingMode padding_mode_val = PaddingMode::NONE;
std::optional<size_t> max_length_val;
PaddingMode padding_mode_val = PaddingMode::TRUNCATE;

ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
Expand All @@ -143,23 +106,22 @@ class Tokenizer::TokenizerImpl {
int max_trunc_length_val = m_max_trunc_length;
int max_pad_length_val = m_max_pad_length;

std::tie(max_trunc_length_val, max_pad_length_val) = get_padding_values(padding_mode_val, max_length_val);
std::tie(max_trunc_length_val, max_pad_length_val) = get_padding_values(padding_mode_val, *max_length_val);

// If user requested add_special_tokens mode different from the current one,
// need to set state variable.
// If requested mode matches the stored state set, then don't touch states.
// If requested add[skip]_special_tokens, max_length or pading mode
// is different from the stored state, need to set state variable.
if (add_special_tokens_flag == m_add_special_tokens
&& skip_special_tokens_flag == m_skip_special_tokens
&& max_trunc_length_val == m_max_trunc_length
&& max_pad_length_val == m_max_pad_length) {
return;
}
if (m_older_than_24_5) {
// Changing add_special_tokens at runtime was introduced in
// 24.5. Older tokenizers still allow manipulating their
// state but the effect is incorrect.
return;
}
// if (m_older_than_24_5) {
// // Changing add_special_tokens at runtime was introduced in
// // 24.5. Older tokenizers still allow manipulating their
// // state but the effect is incorrect.
// return;
// }

// add_special_tokens is managed by Select op with a bool input.
ov::Tensor add_special_tensor = ov::Tensor(ov::element::boolean, {});
Expand All @@ -173,15 +135,20 @@ class Tokenizer::TokenizerImpl {
*max_trunc_length_tensor.data<int>() = max_trunc_length_val;
ov::Tensor max_pad_length_tensor = ov::Tensor(ov::element::i32, {1});
*max_pad_length_tensor.data<int>() = max_pad_length_val;

bool set_padding = max_length_val.has_value();
// Even if max_length is not set in order to disable truncation
// MAX_TRUNCATION_LENGTH_VAR_ID should be updated to max numeric limit.
bool set_truncation = padding_mode_val != PaddingMode::TRUNCATE || max_length_val.has_value();

for (auto& state: infer_request_guard.get().query_state()) {
if (state.get_name().find(add_special_tokens.name()) != std::string::npos) {
state.set_state(add_special_tensor);
} else if (state.get_name().find(skip_special_tokens.name()) != std::string::npos) {
state.set_state(skip_special_tensor);
} else if (state.get_name().find(MAX_TRUNCATION_LENGTH_VAR_ID) != std::string::npos && padding_mode_val != PaddingMode::NONE) {
} else if (state.get_name().find(MAX_TRUNCATION_LENGTH_VAR_ID) != std::string::npos && set_truncation) {
state.set_state(max_trunc_length_tensor);
} else if (state.get_name().find(MAX_PAD_LENGTH_VAR_ID) != std::string::npos) {
} else if (state.get_name().find(MAX_PAD_LENGTH_VAR_ID) != std::string::npos && set_padding) {
state.set_state(max_pad_length_tensor);
}
}
Expand Down Expand Up @@ -238,7 +205,8 @@ class Tokenizer::TokenizerImpl {

if (ov_tokenizer) {
ov::pass::Manager manager;
manager.register_pass<MakeCombineSegmentsSatateful>();
manager.register_pass<ov::pass::VisualizeTree>("before.svg");
manager.register_pass<MakeAddSpecialTokensSatateful>();
manager.register_pass<MakePaddingSatateful>();
manager.register_pass<MakeTruncationSatateful>();
manager.register_pass<ov::pass::VisualizeTree>("after.svg");
Expand Down Expand Up @@ -451,7 +419,7 @@ class Tokenizer::TokenizerImpl {
);
}

return pad_left(unpadded.input_ids, unpadded.attention_mask);
return {unpadded.input_ids, unpadded.attention_mask};
}

TokenizedInputs get_copied_results(ov::Tensor input_ids, ov::Tensor attention_mask) {
Expand Down
3 changes: 2 additions & 1 deletion src/python/openvino_genai/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ from openvino_genai.py_openvino_genai import Image2ImagePipeline
from openvino_genai.py_openvino_genai import ImageGenerationConfig
from openvino_genai.py_openvino_genai import InpaintingPipeline
from openvino_genai.py_openvino_genai import LLMPipeline
from openvino_genai.py_openvino_genai import PaddingMode
from openvino_genai.py_openvino_genai import PerfMetrics
from openvino_genai.py_openvino_genai import RawPerfMetrics
from openvino_genai.py_openvino_genai import SD3Transformer2DModel
Expand All @@ -45,5 +46,5 @@ from openvino_genai.py_openvino_genai import draft_model
from openvino_genai.py_openvino_genai import get_version
import os as os
from . import py_openvino_genai
__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version', 'openvino', 'os', 'py_openvino_genai']
__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PaddingMode', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version', 'openvino', 'os', 'py_openvino_genai']
__version__: str
Loading

0 comments on commit 57f26ae

Please sign in to comment.