Skip to content

Commit

Permalink
significant improvements; extended tests; all regimes match
Browse files Browse the repository at this point in the history
  • Loading branch information
pavel-esir committed Jan 31, 2025
1 parent 85e3260 commit 57f26ae
Show file tree
Hide file tree
Showing 10 changed files with 209 additions and 138 deletions.
19 changes: 18 additions & 1 deletion src/cpp/include/openvino/genai/tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,24 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
std::shared_ptr<TokenizerImpl> m_pimpl;
};

enum class PaddingMode { TRUNCATE = 0, LONGEST = 1, MAX_LENGTH = 2, DO_NOT_PAD = 3, NONE = 4};
/**
 * @enum PaddingMode
 * @brief Enum class representing the padding modes available for tokenization.
 *
 * This enum class defines the modes that can be used to pad tokenized sequences.
 * IMPORTANT NOTICE: even in truncation mode, padding to the longest sequence in the
 * batch is still applied, since the resulting tokenization is stored as a single
 * ov::Tensor, which cannot hold an irregular/ragged array.
 *
 * @var PaddingMode::TRUNCATE
 * Truncate each sequence to the specified maximum length. (Padding to the longest
 * sequence in the batch is implicitly applied as well, as noted above.)
 *
 * @var PaddingMode::LONGEST
 * Pad each sequence to the length of the longest sequence in the batch.
 * In this mode truncation is switched off.
 *
 * @var PaddingMode::MAX_LENGTH
 * Pad each sequence to the specified maximum length; sequences longer than
 * max_length are truncated down to it.
 */
enum class PaddingMode { TRUNCATE, LONGEST, MAX_LENGTH };

static constexpr ov::Property<bool> add_special_tokens{"add_special_tokens"};
static constexpr ov::Property<bool> skip_special_tokens{"skip_special_tokens"};
Expand Down
62 changes: 44 additions & 18 deletions src/cpp/src/make_tokenizer_stateful.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@

#include "make_tokenizer_stateful.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/concat.hpp"
#include "openvino/op/select.hpp"
#include "openvino/op/maximum.hpp"
#include "openvino/op/minimum.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/subtract.hpp"
#include "openvino/op/slice.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/read_value.hpp"
Expand All @@ -16,8 +18,7 @@
using namespace ov;
using namespace ov::op;

bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) {

bool ov::genai::MakeAddSpecialTokensSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) {
std::shared_ptr<ov::Node> combine_seg_node;
for (auto node: model->get_ordered_ops()) {
if (strcmp(node->get_type_info().name, "CombineSegments") == 0) {
Expand Down Expand Up @@ -137,25 +138,28 @@ bool ov::genai::MakeTruncationSatateful::run_on_model(const std::shared_ptr<ov::
return false;
}

auto add = ov::as_type_ptr<ov::op::v1::Add>(combine_segments_node->input_value(4).get_node_shared_ptr());
if (!add) {
return false;
}
auto min_node = ov::as_type_ptr<ov::op::v1::Minimum>(add->get_input_node_shared_ptr(0));
if (!min_node) {
min_node = ov::as_type_ptr<ov::op::v1::Minimum>(add->get_input_node_shared_ptr(1));
}
if (!min_node) {
std::shared_ptr<Node> add_or_sub_node = combine_segments_node->input_value(4).get_node_shared_ptr();
// If Add then it's a right truncation, if Subtract then it's a left truncation.
if (!ov::as_type_ptr<v1::Add>(add_or_sub_node) && !ov::as_type_ptr<v1::Subtract>(add_or_sub_node)) {
// Exit if it's neither, because in that case it's not a truncation.
return false;
}

// auto pattern_2 = ov::pass::pattern::wrap_type<ov::op::v0::Constant>(ov::pass::pattern::rank_equals(1));
// auto unsqueeze = ov::pass::pattern::wrap_type<ov::op::v1::Reshape, ov::op::v0::Unsqueeze>({cell, pattern_2});
// ov::pass::pattern::Matcher matcher(unsqueeze);

// Minimum between max_length and length of token sequence.
auto min_node = ov::as_type_ptr<v1::Minimum>(add_or_sub_node->get_input_node_shared_ptr(1));
if (!min_node) { return false; }

auto const_node = ov::as_type_ptr<v0::Constant>(min_node->get_input_node_shared_ptr(0));
if (!const_node) {
const_node = ov::as_type_ptr<v0::Constant>(min_node->get_input_node_shared_ptr(1));
}
if (!const_node) {
return false;
}
// Node which subtracts from max_truncation_length number of added_tokens.
auto sub_node = ov::as_type_ptr<v1::Subtract>(min_node->get_input_node_shared_ptr(1));
if (!sub_node) { return false; }

// max_truncation_length constant containing final length at the end of pipeline.
auto const_node = ov::as_type_ptr<v0::Constant>(sub_node->get_input_node_shared_ptr(0));
if (!const_node) { return false; }

op::util::VariableInfo var_info{const_node->get_output_shape(0), const_node->get_output_element_type(0), MAX_TRUNCATION_LENGTH_VAR_ID};
auto variable = std::make_shared<op::util::Variable>(var_info);
Expand All @@ -168,6 +172,28 @@ bool ov::genai::MakeTruncationSatateful::run_on_model(const std::shared_ptr<ov::
target_input.replace_source_output(read_trunc_value->output(0));
}

// We need to check if user requested to not add special tokens.
std::shared_ptr<v6::ReadValue> read_value_spec_tokens;
for (const auto& sink : model->get_sinks()) {
// Check if the sink accepts input from an Assign, and if that's the case get the ReadValue node input.
if (auto read_value = ov::as_type_ptr<v6::ReadValue>(sink->get_input_node_shared_ptr(0))) {
if (read_value->get_variable()->get_info().variable_id == ADD_SPECIAL_TOKENS_VAR_ID) {
read_value_spec_tokens = read_value;
break;
}
}
}

// Constant which stores number of added_tokens.
auto num_added_tokens_const = ov::as_type_ptr<v0::Constant>(sub_node->get_input_node_shared_ptr(1));
// If the user requested not to add special tokens then, in order to compute truncation
// correctly, we must force num_added_tokens to 0 regardless of the hardcoded Constant value.
if (read_value_spec_tokens && num_added_tokens_const) {
auto zero_constant = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{}, std::vector{0});
auto select_node = std::make_shared<v1::Select>(read_value_spec_tokens, num_added_tokens_const, zero_constant);
sub_node->input(1).replace_source_output(select_node->output(0));
}

auto assign = std::make_shared<v6::Assign>(read_trunc_value, variable);
model->add_sinks({assign});
model->add_variables({variable});
Expand Down
7 changes: 4 additions & 3 deletions src/cpp/src/make_tokenizer_stateful.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "openvino/op/constant.hpp"
#include "openvino/pass/pass.hpp"
#include "openvino/pass/matcher_pass.hpp"

namespace ov {
namespace genai {
Expand Down Expand Up @@ -32,9 +33,9 @@ namespace genai {
* | CombineSegments |
* +-------------------------+
**/
class MakeCombineSegmentsSatateful : public ov::pass::ModelPass {
// Model pass that locates the CombineSegments node of the tokenizer graph and makes the
// "add special tokens" behavior switchable at runtime via a state variable instead of
// being baked into the graph (see the diagram above).
// NOTE(review): "Satateful" is a pre-existing typo; it is part of the public class name
// and the RTTI string, so renaming it here would break callers — fix repo-wide if desired.
class MakeAddSpecialTokensSatateful : public ov::pass::ModelPass {
public:
OPENVINO_MODEL_PASS_RTTI("MakeAddSpecialTokensSatateful");
// Rewrites `model` in place; returns true if the transformation was applied,
// false if the expected pattern was not found and the model is left unchanged.
bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
};

Expand Down Expand Up @@ -97,7 +98,7 @@ class MakeVocabDecoderSatateful : public ov::pass::ModelPass {
// Identifiers of the state variables injected by the passes above.
// Each one must be unique: MAX_TRUNCATION_LENGTH_VAR_ID previously aliased
// MAX_PAD_LENGTH_VAR_ID ("max_pad_length"), which made the two states
// indistinguishable when matching by variable id; it now has its own id.
const std::string ADD_SPECIAL_TOKENS_VAR_ID = "add_special_tokens";
const std::string SKIP_SPECIAL_TOKENS_VAR_ID = "skip_special_tokens";
const std::string MAX_PAD_LENGTH_VAR_ID = "max_pad_length";
const std::string MAX_TRUNCATION_LENGTH_VAR_ID = "max_truncation_length";

} // namespace genai
} // namespace ov
76 changes: 22 additions & 54 deletions src/cpp/src/tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,39 +23,6 @@

namespace {

// todo: remove when openvino-tokenizers will support left padding
// Converts a right-padded batch (pad slots at the end of each row) into a
// left-padded one, in place, by moving every real token to the right end of
// its row. A row whose last position is a real token is left untouched.
ov::genai::TokenizedInputs pad_left(ov::Tensor& input_ids, ov::Tensor& attention_mask) {
    const size_t num_rows = input_ids.get_shape()[0];
    const size_t row_len = input_ids.get_shape()[1];
    int64_t* ids_data = input_ids.data<int64_t>();
    int64_t* mask_data = attention_mask.data<int64_t>();

    for (size_t row = 0; row < num_rows; row++) {
        int64_t* row_ids = ids_data + row * row_len;
        int64_t* row_mask = mask_data + row * row_len;

        // Row is already left-padded (or has no padding) when the
        // rightmost slot holds a real token — nothing to do.
        if (row_mask[row_len - 1] == 1)
            continue;

        size_t shift = 0;
        for (size_t pos = row_len; pos-- > 0;) {
            // Pad slots stay where they are; only real tokens move.
            if (row_mask[pos] == 0)
                continue;

            // First real token seen from the right fixes how far every
            // real token must move to reach the end of the row.
            if (shift == 0)
                shift = row_len - pos - 1;

            std::swap(row_ids[pos], row_ids[pos + shift]);
            std::swap(row_mask[pos], row_mask[pos + shift]);
        }
    }

    return {input_ids, attention_mask};
}

void check_arguments(const ov::AnyMap& parameters, std::set<std::string> allowed_argnames) {
for (const auto& [key, value] : parameters) {
if (allowed_argnames.find(key) == allowed_argnames.end()) {
Expand Down Expand Up @@ -119,11 +86,7 @@ class Tokenizer::TokenizerImpl {
case PaddingMode::LONGEST:
return {std::numeric_limits<int32_t>::max(), 0};
case PaddingMode::MAX_LENGTH:
return {std::numeric_limits<int32_t>::max(), max_length};
case PaddingMode::DO_NOT_PAD:
// behaves exactly as longest
// TODO: need to find a way to disable padding automatically so that it will match to HF.
return {std::numeric_limits<int32_t>::max(), 0};
return {max_length, max_length};
default:
OPENVINO_THROW("Unknown padding mode");
}
Expand All @@ -132,8 +95,8 @@ class Tokenizer::TokenizerImpl {
void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
bool add_special_tokens_flag = m_add_special_tokens;
bool skip_special_tokens_flag = m_skip_special_tokens;
size_t max_length_val;
PaddingMode padding_mode_val = PaddingMode::NONE;
std::optional<size_t> max_length_val;
PaddingMode padding_mode_val = PaddingMode::TRUNCATE;

ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
Expand All @@ -143,23 +106,22 @@ class Tokenizer::TokenizerImpl {
int max_trunc_length_val = m_max_trunc_length;
int max_pad_length_val = m_max_pad_length;

std::tie(max_trunc_length_val, max_pad_length_val) = get_padding_values(padding_mode_val, max_length_val);
std::tie(max_trunc_length_val, max_pad_length_val) = get_padding_values(padding_mode_val, *max_length_val);

// If user requested add_special_tokens mode different from the current one,
// need to set state variable.
// If requested mode matches the stored state set, then don't touch states.
// If requested add[skip]_special_tokens, max_length or pading mode
// is different from the stored state, need to set state variable.
if (add_special_tokens_flag == m_add_special_tokens
&& skip_special_tokens_flag == m_skip_special_tokens
&& max_trunc_length_val == m_max_trunc_length
&& max_pad_length_val == m_max_pad_length) {
return;
}
if (m_older_than_24_5) {
// Changing add_special_tokens at runtime was introduced in
// 24.5. Older tokenizers still allow manipulating their
// state but the effect is incorrect.
return;
}
// if (m_older_than_24_5) {
// // Changing add_special_tokens at runtime was introduced in
// // 24.5. Older tokenizers still allow manipulating their
// // state but the effect is incorrect.
// return;
// }

// add_special_tokens is managed by Select op with a bool input.
ov::Tensor add_special_tensor = ov::Tensor(ov::element::boolean, {});
Expand All @@ -173,15 +135,20 @@ class Tokenizer::TokenizerImpl {
*max_trunc_length_tensor.data<int>() = max_trunc_length_val;
ov::Tensor max_pad_length_tensor = ov::Tensor(ov::element::i32, {1});
*max_pad_length_tensor.data<int>() = max_pad_length_val;

bool set_padding = max_length_val.has_value();
// Even if max_length is not set in order to disable truncation
// MAX_TRUNCATION_LENGTH_VAR_ID should be updated to max numeric limit.
bool set_truncation = padding_mode_val != PaddingMode::TRUNCATE || max_length_val.has_value();

for (auto& state: infer_request_guard.get().query_state()) {
if (state.get_name().find(add_special_tokens.name()) != std::string::npos) {
state.set_state(add_special_tensor);
} else if (state.get_name().find(skip_special_tokens.name()) != std::string::npos) {
state.set_state(skip_special_tensor);
} else if (state.get_name().find(MAX_TRUNCATION_LENGTH_VAR_ID) != std::string::npos && padding_mode_val != PaddingMode::NONE) {
} else if (state.get_name().find(MAX_TRUNCATION_LENGTH_VAR_ID) != std::string::npos && set_truncation) {
state.set_state(max_trunc_length_tensor);
} else if (state.get_name().find(MAX_PAD_LENGTH_VAR_ID) != std::string::npos) {
} else if (state.get_name().find(MAX_PAD_LENGTH_VAR_ID) != std::string::npos && set_padding) {
state.set_state(max_pad_length_tensor);
}
}
Expand Down Expand Up @@ -238,7 +205,8 @@ class Tokenizer::TokenizerImpl {

if (ov_tokenizer) {
ov::pass::Manager manager;
manager.register_pass<MakeCombineSegmentsSatateful>();
manager.register_pass<ov::pass::VisualizeTree>("before.svg");
manager.register_pass<MakeAddSpecialTokensSatateful>();
manager.register_pass<MakePaddingSatateful>();
manager.register_pass<MakeTruncationSatateful>();
manager.register_pass<ov::pass::VisualizeTree>("after.svg");
Expand Down Expand Up @@ -451,7 +419,7 @@ class Tokenizer::TokenizerImpl {
);
}

return pad_left(unpadded.input_ids, unpadded.attention_mask);
return {unpadded.input_ids, unpadded.attention_mask};
}

TokenizedInputs get_copied_results(ov::Tensor input_ids, ov::Tensor attention_mask) {
Expand Down
3 changes: 2 additions & 1 deletion src/python/openvino_genai/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ from openvino_genai.py_openvino_genai import Image2ImagePipeline
from openvino_genai.py_openvino_genai import ImageGenerationConfig
from openvino_genai.py_openvino_genai import InpaintingPipeline
from openvino_genai.py_openvino_genai import LLMPipeline
from openvino_genai.py_openvino_genai import PaddingMode
from openvino_genai.py_openvino_genai import PerfMetrics
from openvino_genai.py_openvino_genai import RawPerfMetrics
from openvino_genai.py_openvino_genai import SD3Transformer2DModel
Expand All @@ -45,5 +46,5 @@ from openvino_genai.py_openvino_genai import draft_model
from openvino_genai.py_openvino_genai import get_version
import os as os
from . import py_openvino_genai
__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version', 'openvino', 'os', 'py_openvino_genai']
__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PaddingMode', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version', 'openvino', 'os', 'py_openvino_genai']
__version__: str
Loading

0 comments on commit 57f26ae

Please sign in to comment.