Commit

Merge branch 'master' into cb-by-default
Wovchena authored Feb 5, 2025
2 parents 9d71fad + a224c1e commit 062e339
Showing 15 changed files with 25 additions and 22 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -51,7 +51,7 @@ options = {"BUILD_TOKENIZERS" = "OFF", "ENABLE_SAMPLES" = "OFF"}
 
 [build-system]
 requires = [
-    "py-build-cmake==0.3.4",
+    "py-build-cmake==0.4.0",
     "openvino~=2025.1.0.0.dev",
     "pybind11-stubgen==2.5.1",
     "cmake~=3.23.0; platform_system != 'Darwin' or platform_machine == 'x86_64'",
2 changes: 1 addition & 1 deletion samples/export-requirements.txt
@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.1.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@faeebf3416d17e3a6761db5f2e05569e0319311b
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0 # For Qwen
 transformers_stream_generator==0.0.5 # For Qwen
1 change: 0 additions & 1 deletion src/cpp/include/openvino/genai/generation_handle.hpp
@@ -92,7 +92,6 @@ class OPENVINO_GENAI_EXPORTS GenerationHandleImpl {
 
     void drop();
 
-    GenerationOutputs back();
     // Reads result of a generation for single iteration
     GenerationOutputs read();
     // Reads all generated tokens for all sequences
5 changes: 0 additions & 5 deletions src/cpp/src/generation_handle.cpp
@@ -28,11 +28,6 @@ void GenerationHandleImpl::drop() {
     m_generation_stream->drop();
 }
 
-std::unordered_map<uint64_t, GenerationOutput> GenerationHandleImpl::back() {
-    OPENVINO_ASSERT(!is_dropped(), "GenerationHandle cannot be used after it is dropped.");
-    return m_generation_stream->back();
-}
-
 std::unordered_map<uint64_t, GenerationOutput> GenerationHandleImpl::read() {
     OPENVINO_ASSERT(!is_dropped(), "GenerationHandle cannot be used after it is dropped.");
     return m_generation_stream->read();
5 changes: 0 additions & 5 deletions src/cpp/src/generation_stream.hpp
@@ -28,11 +28,6 @@ class GenerationStream {
         m_output_queue.push(std::move(outputs));
     }
 
-    // Retrieving vector of pairs <sequence_id, token_ids> as we can generate multiple outputs for a single prompt
-    GenerationOutputs back() {
-        return m_output_queue.back();
-    }
-
     GenerationOutputs read() {
         return m_output_queue.pull();
     }
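Note on the API change above: back() appears to have peeked at the most recent outputs without consuming them (m_output_queue.back()), whereas read() pulls them off the queue (m_output_queue.pull()). A minimal migration sketch, assuming a live handle and illustrative names:

    // Hedged sketch: consume outputs instead of peeking at the last one.
    while (handle->can_read()) {
        std::unordered_map<uint64_t, ov::genai::GenerationOutput> outputs = handle->read();
        // ... forward outputs.begin()->second.generated_ids to a streamer ...
    }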
2 changes: 1 addition & 1 deletion src/cpp/src/llm_pipeline_static.cpp
@@ -658,7 +658,7 @@ void copy_columns_by_row_chunks(const ov::Tensor& src, ov::Tensor& dst) {
 void stream_generated_tokens(std::shared_ptr<ov::genai::StreamerBase> streamer_ptr,
                              ov::genai::GenerationHandle& handle) {
     if (streamer_ptr && handle->can_read()) {
-        std::unordered_map<uint64_t, ov::genai::GenerationOutput> token = handle->back();
+        std::unordered_map<uint64_t, ov::genai::GenerationOutput> token = handle->read();
         for (const auto& gen_token : token.begin()->second.generated_ids) {
             if (streamer_ptr->put(gen_token)) {
                 handle->drop();
12 changes: 11 additions & 1 deletion src/cpp/src/paged_attention_transformations.cpp
@@ -47,7 +47,7 @@ std::vector<KVHeadConfig> apply_paged_attention_transformations(std::shared_ptr<
         config.v_head_size = value_shape[2].get_length();
     }
 
-    // reset information in KV cache parameters
+    // reset information in KV cache parameters and set PagedAttention's rt_info
     for (size_t idx = 0; idx < num_decoder_layers; idx++) {
         auto k = key_cache_params[std::string("key_cache.") + std::to_string(idx)];
         auto v = value_cache_params[std::string("value_cache.") + std::to_string(idx)];
@@ -59,6 +59,16 @@ std::vector<KVHeadConfig> apply_paged_attention_transformations(std::shared_ptr<
         // order of dimensions within shapes are not required for plugin during compilation
         k->set_partial_shape(ov::PartialShape::dynamic(4));
         v->set_partial_shape(ov::PartialShape::dynamic(4));
+
+        // set KV cache parameters as rt_info for PagedAttention op, so plugins can apply
+        // model compile-time optimizations based on them
+        const KVHeadConfig& config = kv_cache_config[idx];
+
+        auto pa_op = k->get_output_target_inputs(0).begin()->get_node();
+        pa_op->get_rt_info()["num_k_heads"] = config.num_k_heads;
+        pa_op->get_rt_info()["k_head_size"] = config.k_head_size;
+        pa_op->get_rt_info()["num_v_heads"] = config.num_v_heads;
+        pa_op->get_rt_info()["v_head_size"] = config.v_head_size;
     }
 
     model->validate_nodes_and_infer_types();
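For context, a plugin can later retrieve these hints from the PagedAttention node's runtime info. A hypothetical consumer sketch (the keys come from the diff above; the surrounding code and stored type are assumptions):

    // Hedged sketch: read the KV-head hints back on the plugin side.
    const ov::RTMap& rt_info = pa_op->get_rt_info();
    if (rt_info.count("num_k_heads")) {
        const auto num_k_heads = rt_info.at("num_k_heads").as<size_t>();
        // ... e.g. pre-size per-layer KV cache blocks at compile time ...
    }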
6 changes: 4 additions & 2 deletions src/cpp/src/visual_language/inputs_embedder.cpp
@@ -659,7 +659,8 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder {
         image_embeds.reserve(single_images.size());
 
         for (const auto& image : single_images) {
-            EncodedImage encoded_image = m_vision_encoder.encode(image);
+            ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}};
+            EncodedImage encoded_image = m_vision_encoder.encode(image, vision_config);
             image_embeds.push_back(std::move(encoded_image.resized_source));
             formatted_prompt += image_token + "\n";
         }
@@ -775,7 +776,8 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA {
         ov::Tensor image_newline;
 
         for (const auto& image : single_images) {
-            EncodedImage encoded_image = m_vision_encoder.encode(image);
+            ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}};
+            EncodedImage encoded_image = m_vision_encoder.encode(image, vision_config);
 
             if (!image_newline) {
                 size_t embed_dim = encoded_image.resized_source.get_shape().at(2);
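Both call sites above now thread the configured patch size into the vision encoder via an ov::AnyMap. The patch size matters because a ViT-style encoder tiles each image into patch_size x patch_size patches, which fixes the number of image embeddings. An illustrative sketch, not code from this commit:

    // Hedged sketch: how patch size maps image resolution to patch count.
    size_t patches_per_side(size_t image_side, size_t patch_size) {
        return image_side / patch_size;  // e.g. 336 / 14 = 24
    }
    // A 336x336 input then yields 24 * 24 = 576 patch embeddings.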
3 changes: 3 additions & 0 deletions src/cpp/src/visual_language/vlm_config.cpp
@@ -19,6 +19,9 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) {
 
     // Setting llava_next specific config params
     read_json_param(parsed, "image_newline", image_newline);
+    if (parsed.contains("vision_config")) {
+        read_json_param(parsed.at("vision_config"), "patch_size", vision_config_patch_size);
+    }
     // phi3_v
     if (parsed.contains("sub_GN")) {
         sub_GN = parsed.at("sub_GN").get<std::vector<std::vector<std::vector<std::vector<float>>>>>().at(0).at(0).at(0);
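The parser reads the patch size from the nested vision_config object when present. An illustrative config.json fragment of the shape being parsed (values are an assumption, matching the default added below):

    {
      "vision_config": {
        "patch_size": 14
      }
    }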
1 change: 1 addition & 0 deletions src/cpp/src/visual_language/vlm_config.hpp
@@ -47,6 +47,7 @@ class VLMConfig {
 
     // llava_next specific config params
     std::vector<float> image_newline;
+    size_t vision_config_patch_size = 14;
 
     /// @brief A string token denoting start of image embeddings for InternVL2 model.
     std::string image_start_token = "<img>";
2 changes: 1 addition & 1 deletion src/cpp/src/whisper/whisper.cpp
@@ -64,7 +64,7 @@ std::pair<ov::genai::EncodedResults, bool> decode(std::shared_ptr<ov::genai::Whi
            return;
        }
 
-        std::unordered_map<uint64_t, ov::genai::GenerationOutput> token = handle->back();
+        std::unordered_map<uint64_t, ov::genai::GenerationOutput> token = handle->read();
         for (const auto& gen_token : token.begin()->second.generated_ids) {
             if (streamer_ptr->put(gen_token)) {
                 handle->drop();
2 changes: 0 additions & 2 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -673,8 +673,6 @@ class GenerationFinishReason:
     def value(self) -> int:
         ...
 class GenerationHandle:
-    def back(self) -> dict[int, GenerationOutput]:
-        ...
     def can_read(self) -> bool:
         ...
     def drop(self) -> None:
1 change: 0 additions & 1 deletion src/python/py_continuous_batching_pipeline.cpp
@@ -209,7 +209,6 @@ void init_continuous_batching_pipeline(py::module_& m) {
         .def("get_status", &GenerationHandleImpl::get_status)
         .def("can_read", &GenerationHandleImpl::can_read)
         .def("drop", &GenerationHandleImpl::drop)
-        .def("back", &GenerationHandleImpl::back)
         .def("read", &GenerationHandleImpl::read)
         .def("read_all", &GenerationHandleImpl::read_all);

1 change: 1 addition & 0 deletions tests/python_tests/test_vlm_pipeline.py
@@ -48,6 +48,7 @@ def get_ov_model(model_id, cache):
     "katuni4ka/tiny-random-minicpmv-2_6",
     "katuni4ka/tiny-random-phi3-vision",
     "katuni4ka/tiny-random-llava",
+    "katuni4ka/tiny-random-llava-next",
     "katuni4ka/tiny-random-qwen2vl",
 ])
 def test_vlm_pipeline(model_id, cache):
2 changes: 1 addition & 1 deletion tools/llm_bench/requirements.txt
@@ -11,7 +11,7 @@ torch
 transformers>=4.40.0
 diffusers>=0.22.0
 #optimum is in dependency list of optimum-intel
-git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
+git+https://github.com/huggingface/optimum-intel.git@faeebf3416d17e3a6761db5f2e05569e0319311b#egg=optimum-intel
 git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf
 packaging
 psutil
