Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix InputsEmbedderPhi3V #1

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 17 additions & 24 deletions src/cpp/src/visual_language/inputs_embedder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1407,15 +1407,15 @@ std::vector<ov::Tensor> split_tokenize(const std::string& text, ov::genai::Token
return tokenized;
}

ov::Tensor insert_image_placeholders(const std::vector<ov::Tensor>& chunks, size_t tokens_per_image) {
ov::Tensor insert_image_placeholders(const std::vector<ov::Tensor>& chunks, const std::vector<size_t>& tokens_per_images) {
size_t merged_length = 0;
for (const ov::Tensor& chunk : chunks) {
merged_length += chunk.get_shape().at(1);
}
merged_length += chunks.empty() ? 0 : (chunks.size() - 1) * tokens_per_image;
merged_length += std::accumulate(tokens_per_images.begin(), tokens_per_images.end(), 0);
ov::Tensor merged{ov::element::i64, {1, merged_length}};
size_t offset = 0;
int64_t image_id = -1;
int64_t image_id = 0;
for (const ov::Tensor& chunk : chunks) {
size_t length = chunk.get_shape().at(1);
std::copy_n(
Expand All @@ -1427,11 +1427,11 @@ ov::Tensor insert_image_placeholders(const std::vector<ov::Tensor>& chunks, size
if (offset < merged_length) {
std::fill_n(
merged.data<int64_t>() + offset,
tokens_per_image,
image_id
tokens_per_images.at(image_id),
-image_id - 1 // It could be just -image_id. -1 is for consistency with the original implementation.
);
offset += tokens_per_image;
--image_id;
offset += tokens_per_images.at(image_id);
++image_id;
}
}
return merged;
Expand Down Expand Up @@ -1460,17 +1460,15 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {
public:
ov::InferRequest m_hd_feature_transformer;
ov::InferRequest m_vision_projection;
// Used to insert <|image_i|>\n per image (not a slice).
size_t m_image_id = 1;
size_t m_tokens_per_image = 0;
std::vector<size_t> m_tokens_per_images;

InputsEmbedderPhi3V(
const VLMConfig& vlm_config,
const std::filesystem::path& model_dir,
const std::string& device,
const ov::AnyMap device_config
):
IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0},
IInputsEmbedder(vlm_config, model_dir, device, device_config),
m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()},
m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device, {}).create_infer_request()} {}

Expand All @@ -1481,17 +1479,16 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {
for (const ov::Tensor& image : to_single_image_tensors(images)) {
EncodedImage encoded_image = m_vision_encoder.encode(image);
images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection));
images_prompt << "<|image_" << m_image_id << "|>\n";
++m_image_id;
m_tokens_per_images.push_back(images_features_proj.back().get_shape().at(1));
images_prompt << "<|image_" << m_tokens_per_images.size() << "|>\n";
}
images_prompt << prompt;
std::vector<ov::Tensor> new_chat_tokens;
std::vector<ov::Tensor> prev_chat_tokens;
if (m_is_chat_conversation) {
m_history.push_back({{"role", "user"}, {"content", images_prompt.str()}});
constexpr bool add_generation_prompt = true;
std::string new_templated_chat_history;
new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
std::string new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
auto start_tokenizer_time = std::chrono::steady_clock::now();
new_chat_tokens = phi3_v::split_tokenize(new_templated_chat_history, m_tokenizer);
prev_chat_tokens = phi3_v::split_tokenize(m_templated_chat_history, m_tokenizer);
Expand All @@ -1504,19 +1501,15 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {
auto end_tokenizer_time = std::chrono::steady_clock::now();
metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
}
if (0 == m_tokens_per_image && !images_features_proj.empty()) {
m_tokens_per_image = images_features_proj.at(0).get_shape().at(1);
}
ov::Tensor new_merged_tokens = phi3_v::insert_image_placeholders(new_chat_tokens, m_tokens_per_image);
ov::Tensor prev_merged_tokens = phi3_v::insert_image_placeholders(prev_chat_tokens, m_tokens_per_image);
ov::Tensor new_merged_tokens = phi3_v::insert_image_placeholders(new_chat_tokens, m_tokens_per_images);
ov::Tensor prev_merged_tokens = phi3_v::insert_image_placeholders(prev_chat_tokens, m_tokens_per_images);
ov::Tensor new_tokens = update_history(new_merged_tokens, prev_merged_tokens);
std::vector<ov::Tensor> tokens = phi3_v::drop_image_placeholders(new_tokens);
OPENVINO_ASSERT(tokens.size() == images_features_proj.size() + 1);
size_t features_length = 0;
for (size_t im_id = 0; im_id < images_features_proj.size(); ++im_id) {
size_t text_length = tokens.at(im_id).get_shape().at(1);
size_t im_length = images_features_proj.at(im_id).get_shape().at(1);
OPENVINO_ASSERT(im_length == m_tokens_per_image);
features_length += text_length + im_length;
}
features_length += tokens.back().get_shape().at(1);
Expand Down Expand Up @@ -1549,20 +1542,20 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder {
);

if (!m_is_chat_conversation) {
m_image_id = 0;
m_tokens_per_images.clear();
}

return inputs_embeds;
}

virtual void start_chat(const std::string& system_message) override {
IInputsEmbedder::start_chat(system_message);
m_image_id = 0;
m_tokens_per_images.clear();
}

virtual void finish_chat() override {
IInputsEmbedder::finish_chat();
m_image_id = 0;
m_tokens_per_images.clear();
}
};

Expand Down