From 5cbadd1603c4019a046bbf46b0dd87feab1e7cbd Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Wed, 29 Jan 2025 12:42:41 +0400
Subject: [PATCH] CB: preparation for relying on KV cache precisions from plugins (#1634)

- Currently we have logic to detect the KV cache precision, and this logic has become more and more complex.
- The idea is to rely on the plugin's logic and compile the PA model with `ov::element::dynamic` precisions for the KV cache inputs.
- Later, take the `ov::CompiledModel` and extract the precisions from its `inputs()`.
- Then create tensors based on the computed `num_kv_blocks`, which depends on the KV cache precisions.

Currently, the logic that mimics the plugin's KV cache precision selection is still here, but it will be dropped once the plugin supports `ov::element::dynamic`.
---
 .github/labeler.yml                           |   4 +-
 src/cpp/src/cache_manager.hpp                 | 169 ++++++++++++------
 src/cpp/src/continuous_batching_impl.cpp      | 127 +++++++++++--
 src/cpp/src/continuous_batching_impl.hpp      |   4 +-
 src/cpp/src/device_config.hpp                 | 115 +-----------
 .../paged_attention_transformations.cpp       |  24 ++-
 .../paged_attention_transformations.hpp       |   0
 src/cpp/src/scheduler.hpp                     |  30 ++--
 ...batching_for_speculative_decoding_impl.cpp |   3 +-
 ...batching_for_speculative_decoding_impl.hpp |   3 +-
 .../speculative_decoding_impl.cpp             |  11 +-
 tests/cpp/CMakeLists.txt                      |   6 +-
 tests/cpp/cache_manager.cpp                   |  91 ++++------
 tests/cpp/device_config.cpp                   |  33 ----
 tests/cpp/helper.cpp                          |  27 +++
 tests/cpp/helper.hpp                          |   8 +
 tests/cpp/scheduler.cpp                       |  34 +---
 tests/cpp/speculative_decoding.cpp            |   3 +-
 18 files changed, 352 insertions(+), 340 deletions(-)
 rename src/cpp/src/{utils => }/paged_attention_transformations.cpp (80%)
 rename src/cpp/src/{utils => }/paged_attention_transformations.hpp (100%)
 delete mode 100644 tests/cpp/device_config.cpp
 create mode 100644 tests/cpp/helper.cpp
 create mode 100644 tests/cpp/helper.hpp

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 2bfe4248c1..a75abd795c 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -103,8 +103,8 @@
 - 'src/cpp/src/generation_handle.cpp'
 - 'src/cpp/src/generation_stream.hpp'
 - 'src/cpp/src/model_runner.hpp'
-- 'src/cpp/src/utils/paged_attention_transformations.cpp'
-- 'src/cpp/src/utils/paged_attention_transformations.hpp'
+- 'src/cpp/src/paged_attention_transformations.cpp'
+- 'src/cpp/src/paged_attention_transformations.hpp'
 - 'src/cpp/src/scheduler.hpp'
 - 'src/cpp/src/sequence_group.cpp'
 - 'src/cpp/src/sequence_group.hpp'
diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp
index 5a0ff9b9f3..255bb926be 100644
--- a/src/cpp/src/cache_manager.hpp
+++ b/src/cpp/src/cache_manager.hpp
@@ -45,19 +45,19 @@ class TensorMmapAllocator {
 #endif
 
 namespace ov::genai {
+
 class CacheManager {
-    DeviceConfig m_device_config;
-    std::vector<ov::Tensor> m_key_cache;
-    std::vector<ov::Tensor> m_value_cache;
-    size_t m_num_allocated_kv_blocks = 0;
+    size_t m_num_decoder_layers = 0;
+    std::string m_device;
+    std::vector<ov::element::Type> m_key_precisions, m_value_precisions;
+    std::vector<ov::PartialShape> m_key_shapes, m_value_shapes;
+    std::vector<ov::Tensor> m_key_cache, m_value_cache;
+    size_t m_num_allocated_kv_blocks = 0, m_block_size_in_bytes = 0;
     ov::InferRequest m_request;
-    ov::Core m_core;
 
-    ov::Shape set_first_dim_and_make_static(const ov::PartialShape& shape, size_t dim) {
-        ov::PartialShape res_shape = shape;
-        res_shape[0] = dim;
-        OPENVINO_ASSERT(res_shape.is_static());
-        return res_shape.to_shape();
+    static ov::Shape set_kv_blocks(ov::PartialShape pshape, size_t num_kv_blocks) {
+        pshape[0] = num_kv_blocks;
+        return pshape.get_shape();
     }
 
     void
update_request_tensor(size_t decoder_layer_id) { @@ -65,41 +65,106 @@ class CacheManager { m_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_value_cache[decoder_layer_id]); } + ov::PartialShape patch_shape(ov::PartialShape pshape, ov::element::Type cache_type) { + OPENVINO_ASSERT(!m_device.empty(), "Internal error: device is not set"); + + if (m_device.find("CPU") != std::string::npos && cache_type == ov::element::u8) { + // Scale, zero point and quantized data will be stored together. + // The layout for per token per head: + // |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)| + // so, we have to extend head_size by 8, which is sizeof(float) + // for scale and sizeof(float) for zeropoint + pshape[3] += 2 * sizeof(float); + } + + return pshape; + } + public: - explicit CacheManager(const DeviceConfig &device_config, ov::InferRequest request, ov::Core core) : - m_device_config(device_config), - m_request(request), - m_core(core) { - m_key_cache.reserve(m_device_config.get_num_layers()); - m_value_cache.reserve(m_device_config.get_num_layers()); + CacheManager(ov::InferRequest request, const DeviceConfig& device_config) : + m_request(request) { + // extract information about inference device + ov::CompiledModel compiled_model = request.get_compiled_model(); + std::vector execution_devices = compiled_model.get_property(ov::execution_devices); + OPENVINO_ASSERT(execution_devices.size() == 1, "Contituous batching: execution device is expected to be CPU or GPU, but got ", execution_devices.size(), " devices"); + m_device = execution_devices[0]; + + // extract information about KV cache precisions and shapes + size_t kv_input_index = 0; + for (const auto& input : compiled_model.inputs()) { + for (auto & name : input.get_names()) { + auto cache_precision = input.get_element_type(); + + if (name.find("key_cache.") == 0) { + auto pshape = patch_shape(device_config.get_key_cache_shape(kv_input_index), cache_precision); + m_key_shapes.push_back(pshape); + m_key_precisions.push_back(cache_precision); + m_block_size_in_bytes += pshape[1].get_length() * pshape[2].get_length() * pshape[3].get_length() * cache_precision.size(); + break; + } else if (name.find("value_cache.") == 0) { + auto pshape = patch_shape(device_config.get_value_cache_shape(kv_input_index), cache_precision); + m_value_shapes.push_back(pshape); + m_value_precisions.push_back(cache_precision); + m_block_size_in_bytes += pshape[1].get_length() * pshape[2].get_length() * pshape[3].get_length() * cache_precision.size(); + ++kv_input_index; + break; + } + } + } + + m_num_decoder_layers = m_value_precisions.size(); + OPENVINO_ASSERT(m_num_decoder_layers == m_key_precisions.size(), "Invalid case: a different number of K and V caches in a LLM model"); + } + + size_t get_num_decoder_layers() const { + return m_num_decoder_layers; + } + + std::string get_device() const { + return m_device; + } + + ov::element::Type get_key_cache_precision(size_t decoder_layer_id) const { + OPENVINO_ASSERT(decoder_layer_id < m_key_precisions.size()); + return m_key_precisions[decoder_layer_id]; + } + + ov::element::Type get_value_cache_precision(size_t decoder_layer_id) const { + OPENVINO_ASSERT(decoder_layer_id < m_value_precisions.size()); + return m_value_precisions[decoder_layer_id]; + } + + size_t get_block_size_in_bytes() const { + return m_block_size_in_bytes; } void allocate_cache_if_needed(size_t num_kv_blocks) { if (m_num_allocated_kv_blocks >= 
num_kv_blocks) { return; } - OPENVINO_ASSERT(m_key_cache.size() == m_value_cache.size()); - m_num_allocated_kv_blocks = num_kv_blocks; - const std::string device_name = m_device_config.get_device(); + m_num_allocated_kv_blocks = num_kv_blocks; ov::Coordinate start_key{0,0,0,0}; ov::Coordinate start_value{0,0,0,0}; - if (device_name.find("GPU") == std::string::npos) {// Allocate KV caches - for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { - ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks); - ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks); + if (m_device.find("GPU") == std::string::npos) {// Allocate KV caches + for (size_t decoder_layer_id = 0; decoder_layer_id < m_num_decoder_layers; ++decoder_layer_id) { + ov::Shape value_cache_shape = set_kv_blocks(m_value_shapes[decoder_layer_id], num_kv_blocks); + ov::Shape key_cache_shape = set_kv_blocks(m_key_shapes[decoder_layer_id], num_kv_blocks); + + ov::element::Type key_precision = get_key_cache_precision(decoder_layer_id); + ov::element::Type value_precision = get_value_cache_precision(decoder_layer_id); + #ifdef _WIN32 - ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape); - ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape); + ov::Tensor key_cache(key_precision, key_cache_shape); + ov::Tensor value_cache(value_precision, value_cache_shape); #else - auto key_size = ov::shape_size(key_cache_shape) * m_device_config.get_cache_precision().size(); - auto value_size = ov::shape_size(value_cache_shape) * m_device_config.get_cache_precision().size(); - - ov::Tensor key_cache = ov::Tensor(m_device_config.get_cache_precision(), key_cache_shape, TensorMmapAllocator(key_size)); - ov::Tensor value_cache = ov::Tensor(m_device_config.get_cache_precision(), value_cache_shape, TensorMmapAllocator(value_size)); + auto key_size = ov::shape_size(key_cache_shape) * key_precision.size(); + auto value_size = ov::shape_size(value_cache_shape) * value_precision.size(); + ov::Tensor key_cache(key_precision, key_cache_shape, TensorMmapAllocator(key_size)); + ov::Tensor value_cache(value_precision, value_cache_shape, TensorMmapAllocator(value_size)); #endif auto key_cache_roi_end = static_cast(key_cache.data()); @@ -137,8 +202,7 @@ class CacheManager { if (m_key_cache.size() > decoder_layer_id) { m_key_cache[decoder_layer_id] = key_cache; m_value_cache[decoder_layer_id] = value_cache; - } - else { + } else { m_key_cache.emplace_back(key_cache); m_value_cache.emplace_back(value_cache); } @@ -146,15 +210,15 @@ class CacheManager { update_request_tensor(decoder_layer_id); } } else { - auto remote_context = m_core.get_default_context(device_name); - for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { - ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks); - ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks); - ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), - key_cache_shape); - ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), - value_cache_shape); - + auto remote_context = 
m_request.get_compiled_model().get_context(); + + for (size_t decoder_layer_id = 0; decoder_layer_id < m_num_decoder_layers; ++decoder_layer_id) { + ov::Shape value_cache_shape = set_kv_blocks(m_value_shapes[decoder_layer_id], num_kv_blocks); + ov::Shape key_cache_shape = set_kv_blocks(m_key_shapes[decoder_layer_id], num_kv_blocks); + + ov::Tensor key_cache = remote_context.create_tensor(get_key_cache_precision(decoder_layer_id), key_cache_shape); + ov::Tensor value_cache = remote_context.create_tensor(get_value_cache_precision(decoder_layer_id), value_cache_shape); + if (m_key_cache.size() > decoder_layer_id) { ov::Coordinate end_key = m_key_cache[decoder_layer_id].get_shape(); ov::Coordinate end_value = m_value_cache[decoder_layer_id].get_shape(); @@ -167,23 +231,23 @@ class CacheManager { m_key_cache[decoder_layer_id] = key_cache; m_value_cache[decoder_layer_id] = value_cache; - } - else { + } else { m_key_cache.emplace_back(key_cache); m_value_cache.emplace_back(value_cache); } + update_request_tensor(decoder_layer_id); } } } ov::Tensor get_key_cache(size_t decoder_layer_id) const { - OPENVINO_ASSERT(decoder_layer_id < m_key_cache.size()); + OPENVINO_ASSERT(decoder_layer_id < m_key_cache.size(), "decoder_layer_id = ", decoder_layer_id, ", num_layers = ", m_key_cache.size()); return m_key_cache[decoder_layer_id]; } ov::Tensor get_value_cache(size_t decoder_layer_id) const { - OPENVINO_ASSERT(decoder_layer_id < m_value_cache.size()); + OPENVINO_ASSERT(decoder_layer_id < m_value_cache.size(), "decoder_layer_id = ", decoder_layer_id, ", num_layers = ", m_value_cache.size()); return m_value_cache[decoder_layer_id]; } @@ -192,9 +256,9 @@ class CacheManager { size_t src_block_id = blocks_pair.first; const std::list& dst_block_ids = blocks_pair.second; for (size_t dst_block_id : dst_block_ids) { - for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { - ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks); - ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks); + for (size_t decoder_layer_id = 0; decoder_layer_id < m_num_decoder_layers; ++decoder_layer_id) { + ov::Shape key_shape = set_kv_blocks(m_key_shapes[decoder_layer_id], m_num_allocated_kv_blocks); + ov::Shape value_shape = set_kv_blocks(m_value_shapes[decoder_layer_id], m_num_allocated_kv_blocks); ov::Coordinate key_src_start_roi(key_shape.size(), 0); ov::Coordinate key_src_end_roi = key_shape; ov::Coordinate key_dst_start_roi(key_shape.size(), 0); @@ -221,13 +285,6 @@ class CacheManager { } } } - - std::shared_ptr get_core() { - return std::make_shared(m_core); - } - - std::shared_ptr get_device_config() { - return std::make_shared(m_device_config); - } }; + } diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 99df043090..b4100f8aec 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -7,9 +7,95 @@ #include "text_callback_streamer.hpp" #include "continuous_batching_impl.hpp" #include "utils.hpp" -#include "utils/paged_attention_transformations.hpp" +#include "paged_attention_transformations.hpp" #include "lora_helper.hpp" #include "cache_state_dumper.hpp" +#include "utils.hpp" + +namespace { + +ov::element::Type get_model_kv_cache_precision(std::shared_ptr model) { + const std::vector kv_cache_precision_path = { 
"runtime_options", ov::hint::kv_cache_precision.name() }; + ov::element::Type ir_kv_cache_precision = ov::element::undefined; + + if (model->has_rt_info(kv_cache_precision_path)) { + ir_kv_cache_precision = model->get_rt_info(kv_cache_precision_path); + } + + return ir_kv_cache_precision; +} + +void apply_kv_cache_precision(const std::shared_ptr& model, const std::string& device, const ov::AnyMap& plugin_config) { + ov::element::Type m_kv_cache_type = ov::element::undefined, ir_kv_cache_precision = get_model_kv_cache_precision(model); + ov::Core core = ov::genai::utils::singleton_core(); + + auto inference_precision = core.get_property(device, ov::hint::inference_precision); + // if user sets properties affecting KV cache precision + const auto inference_precision_it = plugin_config.find(ov::hint::inference_precision.name()); + const auto kv_cache_precision_it = plugin_config.find(ov::hint::kv_cache_precision.name()); + const auto execution_mode_it = plugin_config.find(ov::hint::execution_mode.name()); + const bool accuracy_mode = execution_mode_it != plugin_config.end() && + execution_mode_it->second.as() == ov::hint::ExecutionMode::ACCURACY; + + if (device == "CPU") { + if (kv_cache_precision_it != plugin_config.end()) { + const auto kv_cache_precision = kv_cache_precision_it->second.as(); + m_kv_cache_type = kv_cache_precision; + } else if (accuracy_mode) { + // ACCURACY mode will use f32 KV cache type + m_kv_cache_type = ov::element::f32; + } else if (ir_kv_cache_precision != ov::element::undefined) { + // check that kv_cache_precision is set in runtime_info section of OpenVINO IR + // but in case it's set to FP16, we need to patch it to be BF16 for Xeon platforms + m_kv_cache_type = ir_kv_cache_precision == ov::element::f16 && inference_precision == ov::element::bf16 ? + inference_precision : ir_kv_cache_precision; + } else { + // x86 and ARM have different default kv cache type, take this information from the plugin + m_kv_cache_type = core.get_property(device, ov::hint::kv_cache_precision); + } + + // TEMP WA: currently FP16 / BF16 KV cache is faster than U8 for PagedAttention + if (m_kv_cache_type == ov::element::u8) { + m_kv_cache_type = inference_precision == ov::element::bf16 ? 
ov::element::bf16 : ov::element::f16; + } + } else if (device.find("GPU") != std::string::npos) { + if (accuracy_mode) { + inference_precision = ov::element::f32; + } + if (inference_precision_it != plugin_config.end()) { + inference_precision = inference_precision_it->second.as(); + } + + m_kv_cache_type = inference_precision; + } else { + OPENVINO_THROW(device, " is not supported by OpenVINO Continuous Batching"); + } + + std::map> key_cache_params, value_cache_params; + for (const auto& param_ptr : model->get_parameters()) { + const auto& name = param_ptr->get_friendly_name(); + if (name.find("key_cache.") == 0) { + key_cache_params[name] = param_ptr; + } else if (name.find("value_cache.") == 0) { + value_cache_params[name] = param_ptr; + } + } + + OPENVINO_ASSERT(key_cache_params.size() == value_cache_params.size() && key_cache_params.size() > 0); + + size_t num_decoder_layers = key_cache_params.size(); + for (size_t idx = 0; idx < num_decoder_layers; idx++) { + auto k = key_cache_params[std::string("key_cache.") + std::to_string(idx)]; + auto v = value_cache_params[std::string("value_cache.") + std::to_string(idx)]; + + k->set_element_type(m_kv_cache_type); + v->set_element_type(m_kv_cache_type); + } + + model->validate_nodes_and_infer_types(); +} + +} // namespace namespace ov::genai { template struct overloaded : Ts... {using Ts::operator()...;}; @@ -27,15 +113,14 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl( m_generation_config = generation_config; m_is_validation_mode_enabled = is_validation_mode_enabled; - ov::Core core = utils::singleton_core(); - DeviceConfig device_config(core, scheduler_config, device, properties); + DeviceConfig device_config(device); bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction; bool allow_cache_rotation = scheduler_config.cache_eviction_config.apply_rotation; utils::apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control, allow_cache_rotation); utils::apply_gather_before_matmul_transformation(model); - initialize_pipeline(model, scheduler_config, properties, device_config, core); + initialize_pipeline(model, scheduler_config, properties, device_config); } ContinuousBatchingPipeline::ContinuousBatchingImpl::~ContinuousBatchingImpl() { @@ -55,10 +140,13 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::initialize_pipeline( std::shared_ptr model, const SchedulerConfig& scheduler_config, const ov::AnyMap& properties, - const DeviceConfig& device_config, - ov::Core& core) { + const DeviceConfig& device_config) { + ov::Core core = utils::singleton_core(); ov::CompiledModel compiled_model; + // TODO: remove once plugin automatically set KV cache precisions + apply_kv_cache_precision(model, device_config.get_device(), properties); + // apply LoRA if (auto filtered_properties = extract_adapters_from_properties(properties, &m_generation_config.adapters)) { m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model."); @@ -71,24 +159,27 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::initialize_pipeline( ov::genai::utils::print_compiled_model_properties(compiled_model, "LLM with Paged Attention"); ov::InferRequest infer_request = compiled_model.create_infer_request(); - m_num_decoder_layers = device_config.get_num_layers(); - - // setup KV caches - std::shared_ptr cache_manager = std::make_shared(device_config, infer_request, core); + // Cache manager + std::shared_ptr cache_manager = std::make_shared(infer_request, 
device_config); + m_num_decoder_layers = cache_manager->get_num_decoder_layers(); - SchedulerConfig updated_config = scheduler_config; - // update KV blocks number in scheduler config - if (scheduler_config.num_kv_blocks != device_config.get_num_kv_blocks()) { - updated_config.num_kv_blocks = device_config.get_num_kv_blocks(); + // Scheduler + SchedulerConfig normalized_config = scheduler_config; + if (normalized_config.num_kv_blocks == 0 && normalized_config.cache_size > 0) { + size_t size_in_bytes = normalized_config.cache_size * 1024 * 1024 * 1024; // convert GBs to bytes + normalized_config.num_kv_blocks = size_in_bytes / cache_manager->get_block_size_in_bytes(); } bool can_use_partial_preemption = true; - if (device_config.get_device().find("GPU") != std::string::npos && !updated_config.dynamic_split_fuse) { + if (device_config.get_device().find("GPU") != std::string::npos && !normalized_config.dynamic_split_fuse) { // in case of executing a `vLLM-like` pipeline, it's better not to use partial eviction on the GPU, // as it may lead to performance slowdown can_use_partial_preemption = false; } - m_scheduler = std::make_shared(device_config.get_block_size(), cache_manager, updated_config, device_config.get_num_layers(), can_use_partial_preemption); + + m_scheduler = std::make_shared(device_config.get_block_size(), cache_manager, normalized_config, m_num_decoder_layers, can_use_partial_preemption); + + // Model Runner bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction; if (is_use_cache_eviction) { const auto& eviction_config = m_scheduler->get_config().cache_eviction_config; @@ -101,14 +192,14 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::initialize_pipeline( /* is_use_rotation_inputs = */ is_apply_rotation); if (eviction_config.apply_rotation) { m_rotation_deltas_stores.reserve(m_num_decoder_layers); - ov::Shape rotation_deltas_store_shape{scheduler_config.num_kv_blocks, 1}; // last dim can be later changed to BLOCK_SIZE for per-token granularity + ov::Shape rotation_deltas_store_shape{normalized_config.num_kv_blocks, 1}; // last dim can be later changed to BLOCK_SIZE for per-token granularity for (size_t i = 0; i < m_num_decoder_layers; i++) { ov::Tensor store(ov::element::i32, rotation_deltas_store_shape); std::memset(store.data(), 0, store.get_byte_size()); m_rotation_deltas_stores.push_back(store); } - size_t max_sequence_cache_occupation_length_in_blocks = scheduler_config.max_num_batched_tokens / m_scheduler->get_block_size() + 1; + size_t max_sequence_cache_occupation_length_in_blocks = normalized_config.max_num_batched_tokens / m_scheduler->get_block_size() + 1; size_t embedding_size = device_config.get_k_head_size(0); m_cache_rotation_calculator = std::make_shared( m_scheduler->get_block_size(), diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp index f64657bc7a..9fa6c9c660 100644 --- a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -59,9 +59,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc void initialize_pipeline(std::shared_ptr model, const SchedulerConfig& scheduler_config, const ov::AnyMap& plugin_config, - const DeviceConfig& device_config, - ov::Core& core); - + const DeviceConfig& device_config); /** * Pulls requests from awaiting queue to running queue diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp index 3d41960c5e..09020da9a8 100644 --- a/src/cpp/src/device_config.hpp +++ 
b/src/cpp/src/device_config.hpp @@ -20,11 +20,8 @@ struct KVHeadConfig { }; class DeviceConfig { - ov::element::Type m_kv_cache_type; std::vector m_key_cache_shape, m_value_cache_shape; std::vector m_kv_heads_config; - size_t m_num_decoder_layers = 0; - size_t m_num_kv_blocks = 0, m_cache_size = 0; // KV cache sizes in either blocks or GBs size_t m_block_size = 0; // block size is per inference device std::string m_device; @@ -35,90 +32,17 @@ class DeviceConfig { } public: - DeviceConfig(ov::Core& core, const SchedulerConfig& scheduling_config, const std::string& device, const ov::AnyMap& plugin_config = {}) { + explicit DeviceConfig(const std::string& device) { m_device = device; - - // keep information about blocsk m_block_size = get_block_size_by_device(device); - - if (m_device == "CPU") { - auto inference_precision = core.get_property(device, ov::hint::inference_precision); - m_kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16; - - // if user sets precision hint, kv cache type should be changed - const auto inference_precision_it = plugin_config.find(ov::hint::inference_precision.name()); - if (inference_precision_it != plugin_config.end()) { - const auto inference_precision = inference_precision_it->second.as(); - if (inference_precision == ov::element::f32) { - m_kv_cache_type = ov::element::f32; - } else if (inference_precision == ov::element::f16) { - m_kv_cache_type = ov::element::f16; - } else if (inference_precision == ov::element::bf16) { - m_kv_cache_type = ov::element::bf16; - } else { - // use default f32 - m_kv_cache_type = ov::element::f32; - } - } - - // if user sets ov::kv_cache_precision hint - const auto kv_cache_precision_it = plugin_config.find(ov::hint::kv_cache_precision.name()); - if (kv_cache_precision_it != plugin_config.end()) { - const auto kv_cache_precision = kv_cache_precision_it->second.as(); - m_kv_cache_type = kv_cache_precision; - } - } else if (m_device.find("GPU") != std::string::npos) { - auto inference_precision = core.get_property(device, ov::hint::inference_precision); - m_kv_cache_type = inference_precision == ov::element::f16 ? ov::element::f16 : ov::element::f32; - - // if user sets precision hint, kv cache type should be changed - const auto inference_precision_it = plugin_config.find(ov::hint::inference_precision.name()); - if (inference_precision_it != plugin_config.end()) { - const auto inference_precision = inference_precision_it->second.as(); - if (inference_precision == ov::element::f16) { - m_kv_cache_type = ov::element::f16; - } else { - // use default f32 - m_kv_cache_type = ov::element::f32; - } - } - } else { - OPENVINO_THROW(m_device, " is not supported by OpenVINO Continuous Batching"); - } - - if (scheduling_config.num_kv_blocks > 0) { - m_num_kv_blocks = scheduling_config.num_kv_blocks; - } else if (scheduling_config.cache_size > 0) { - m_cache_size = scheduling_config.cache_size; - } } - void set_kv_head_configs(std::vector kv_heads_config) { + void set_kv_head_configs(const std::vector& kv_heads_config) { m_kv_heads_config = kv_heads_config; - m_num_decoder_layers = m_kv_heads_config.size(); - m_key_cache_shape.reserve(m_num_decoder_layers); - m_value_cache_shape.reserve(m_num_decoder_layers); - - if (m_device == "CPU") { - // Scale, zero point and quantized data will be stored together. 
- // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)| - // so, we have to extend head_size by 8, which is sizeof(float) - // for scale and sizeof(float) for zeropoint - if (m_kv_cache_type == ov::element::u8) { - for (size_t layer_id = 0; layer_id < m_num_decoder_layers; ++layer_id) { - m_kv_heads_config[layer_id].k_head_size += 8; - m_kv_heads_config[layer_id].v_head_size += 8; - } - } - } + m_key_cache_shape.reserve(m_kv_heads_config.size()); + m_value_cache_shape.reserve(m_kv_heads_config.size()); - if (m_num_kv_blocks == 0 && m_cache_size > 0) { - size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; // convert GBs to bytes - m_num_kv_blocks = size_in_bytes / get_block_size_in_bytes(); - } - - for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) { + for (size_t layer_id = 0; layer_id < kv_heads_config.size(); layer_id++) { const KVHeadConfig& config = m_kv_heads_config[layer_id]; m_value_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(), @@ -126,7 +50,7 @@ class DeviceConfig { ov::Dimension(m_block_size), ov::Dimension(config.v_head_size)}); - if (m_device.find("GPU") == std::string::npos) { + if (m_device.find("CPU") != std::string::npos) { m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(), ov::Dimension(config.num_k_heads), ov::Dimension(m_block_size), @@ -145,44 +69,23 @@ class DeviceConfig { return m_device; } - ov::element::Type get_cache_precision() const { - return m_kv_cache_type; - } - - size_t get_num_layers() const { - return m_num_decoder_layers; - } - ov::PartialShape get_key_cache_shape(size_t id) const { OPENVINO_ASSERT(m_key_cache_shape.size()); return m_key_cache_shape[id]; } - size_t get_k_head_size(size_t layer_id) const { - return m_kv_heads_config[layer_id].k_head_size; - } - ov::PartialShape get_value_cache_shape(size_t id) const { OPENVINO_ASSERT(m_value_cache_shape.size()); return m_value_cache_shape[id]; } - size_t get_num_kv_blocks() const { - return m_num_kv_blocks; + size_t get_k_head_size(size_t layer_id) const { + return m_kv_heads_config[layer_id].k_head_size; } size_t get_block_size() const { return m_block_size; } - - size_t get_block_size_in_bytes() const { - size_t block_size_in_bytes = 0; - for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) { - const KVHeadConfig& config = m_kv_heads_config[layer_id]; - block_size_in_bytes += config.k_head_size * config.num_k_heads + config.v_head_size * config.num_v_heads; - } - block_size_in_bytes *= get_block_size() * get_cache_precision().size(); - return block_size_in_bytes; - } }; + } diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/paged_attention_transformations.cpp similarity index 80% rename from src/cpp/src/utils/paged_attention_transformations.cpp rename to src/cpp/src/paged_attention_transformations.cpp index 17a3fdddbe..6d337136dc 100644 --- a/src/cpp/src/utils/paged_attention_transformations.cpp +++ b/src/cpp/src/paged_attention_transformations.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "utils/paged_attention_transformations.hpp" +#include "paged_attention_transformations.hpp" #include "openvino/pass/manager.hpp" #include "openvino/pass/sdpa_to_paged_attention.hpp" @@ -10,7 +10,6 @@ namespace ov { namespace genai { namespace utils { - size_t get_hidden_size(const std::shared_ptr model) { const auto& parameters = 
model->get_parameters(); // extract num_kv_heads and head_size @@ -50,23 +49,32 @@ void set_kv_cache_type_and_shape(std::shared_ptr model, DeviceConfig& for (size_t idx = 0; idx < num_decoder_layers; idx++) { KVHeadConfig& config = kv_heads_config[idx]; - auto key_shape = key_cache_params[std::string("key_cache.") + std::to_string(idx)]->get_partial_shape(); + auto k = key_cache_params[std::string("key_cache.") + std::to_string(idx)]; + auto key_shape = k->get_partial_shape(); config.num_k_heads = key_shape[1].get_length(); config.k_head_size = key_shape[2].get_length(); - auto value_shape = value_cache_params[std::string("value_cache.") + std::to_string(idx)]->get_partial_shape(); + auto v = value_cache_params[std::string("value_cache.") + std::to_string(idx)]; + auto value_shape = v->get_partial_shape(); config.num_v_heads = value_shape[1].get_length(); config.v_head_size = value_shape[2].get_length(); } + + // save information about KV caches in device_config + // and create device dependent KV cache shapes device_config.set_kv_head_configs(kv_heads_config); for (size_t idx = 0; idx < num_decoder_layers; idx++) { auto k = key_cache_params[std::string("key_cache.") + std::to_string(idx)]; auto v = value_cache_params[std::string("value_cache.") + std::to_string(idx)]; - k->set_element_type(device_config.get_cache_precision()); - v->set_element_type(device_config.get_cache_precision()); - k->set_partial_shape(device_config.get_key_cache_shape(idx)); - v->set_partial_shape(device_config.get_value_cache_shape(idx)); + + // allow a plugin to automatically set KV cache precisions + k->set_element_type(ov::element::dynamic); + v->set_element_type(ov::element::dynamic); + + // set device specific KV cache shapes back to a PA model + k->set_partial_shape(ov::PartialShape::dynamic(4)); + v->set_partial_shape(ov::PartialShape::dynamic(4)); } model->validate_nodes_and_infer_types(); diff --git a/src/cpp/src/utils/paged_attention_transformations.hpp b/src/cpp/src/paged_attention_transformations.hpp similarity index 100% rename from src/cpp/src/utils/paged_attention_transformations.hpp rename to src/cpp/src/paged_attention_transformations.hpp diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index ba6fe44cff..23db68deab 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -14,6 +14,7 @@ #include "sequence_group.hpp" #include "cache_manager.hpp" #include "timer.hpp" +#include "utils.hpp" namespace ov::genai { class Scheduler { @@ -462,12 +463,12 @@ class Scheduler { } size_t _get_available_gpu_memory() { - auto device_config = m_cache_manager->get_device_config(); - auto core = m_cache_manager->get_core(); - auto device = device_config->get_device(); + auto device = m_cache_manager->get_device(); OPENVINO_ASSERT(device.find("GPU") != std::string::npos, "_get_available_gpu_memory() is applicable for GPU only."); - auto memory_statistics = core->get_property(device, ov::intel_gpu::memory_statistics); - auto device_type = core->get_property(device, ov::device::type); + + ov::Core core = utils::singleton_core(); + auto memory_statistics = core.get_property(device, ov::intel_gpu::memory_statistics); + auto device_type = core.get_property(device, ov::device::type); // sum up all used device memory std::vector device_memory_types = {"cl_mem", "usm_device"}; @@ -487,7 +488,7 @@ class Scheduler { used_device_mem *= used_memory_threshold; // total device memory in bytes - auto total_device_memory = core->get_property(device, ov::intel_gpu::device_total_mem_size); + auto 
total_device_memory = core.get_property(device, ov::intel_gpu::device_total_mem_size); return total_device_memory - used_device_mem; } @@ -514,32 +515,29 @@ class Scheduler { if (!m_dynamic_memory_allocation) { return false; } - auto device_config = m_cache_manager->get_device_config(); - auto device = device_config->get_device(); + auto device = m_cache_manager->get_device(); size_t current_num_of_kv_blocks = m_block_manager->get_total_number_of_kv_blocks(); size_t new_blocks_num = current_num_of_kv_blocks * m_cache_growth_factor; if (device.find("GPU") == std::string::npos) { m_block_manager->increase_kv_blocks_number(new_blocks_num); - } - else { - size_t available_gpu_memory = _get_available_gpu_memory(); - size_t required_memory = (new_blocks_num - current_num_of_kv_blocks) * device_config->get_block_size_in_bytes(); + } else { + const size_t available_gpu_memory = _get_available_gpu_memory(); + const size_t block_size_in_bytes = m_cache_manager->get_block_size_in_bytes(); + size_t required_memory = (new_blocks_num - current_num_of_kv_blocks) * block_size_in_bytes; if (required_memory <= available_gpu_memory) { m_block_manager->increase_kv_blocks_number(new_blocks_num); } else { - size_t possible_blocks_to_add = available_gpu_memory / device_config->get_block_size_in_bytes(); + size_t possible_blocks_to_add = available_gpu_memory / block_size_in_bytes; if (possible_blocks_to_add > 0) { m_block_manager->increase_kv_blocks_number(current_num_of_kv_blocks + possible_blocks_to_add); - } - else { + } else { return false; } } } return true; } - }; } diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp index bec2b75e0d..2ecdbd66f3 100644 --- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp @@ -5,7 +5,6 @@ namespace ov::genai { ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::ContinuousBatchingForSpeculativeDecodingImpl( - ov::Core& core, const std::shared_ptr& model, const Tokenizer& tokenizer, const GenerationConfig& generation_config, @@ -17,7 +16,7 @@ ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::Contin m_tokenizer = tokenizer; m_generation_config = generation_config; m_is_validation_mode_enabled = is_validation_mode_enabled; - initialize_pipeline(model, scheduler_config, plugin_config, device_config, core); + initialize_pipeline(model, scheduler_config, plugin_config, device_config); } void diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp index e4e4be63d8..b714316e75 100644 --- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp +++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp @@ -13,8 +13,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl : public: ContinuousBatchingForSpeculativeDecodingImpl() = default; - ContinuousBatchingForSpeculativeDecodingImpl(ov::Core& core, - const std::shared_ptr& model, + ContinuousBatchingForSpeculativeDecodingImpl(const std::shared_ptr& model, const Tokenizer& tokenizer, const GenerationConfig& generation_config, const DeviceConfig& device_config, diff --git 
a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index ddb3d0ae10..32d13feed1 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -5,8 +5,8 @@ #include "text_callback_streamer.hpp" #include "speculative_decoding_impl.hpp" +#include "paged_attention_transformations.hpp" #include "utils.hpp" -#include "utils/paged_attention_transformations.hpp" namespace ov::genai { @@ -35,6 +35,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con utils::apply_paged_attention_transformations(main_model, main_model_desc.scheduler_config.use_cache_eviction); utils::apply_paged_attention_transformations(draft_model, main_model_desc.scheduler_config.use_cache_eviction); + utils::apply_gather_before_matmul_transformation(main_model); utils::apply_gather_before_matmul_transformation(draft_model); @@ -63,9 +64,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con ov::AnyMap draft_properties = draft_model_desc.properties.empty() ? main_model_desc.properties : draft_model_desc.properties; - ov::Core core = utils::singleton_core(); - DeviceConfig main_device_config(core, main_scheduler_config_updated, main_device, main_model_desc.properties), - draft_device_config(core, draft_scheduler_config, draft_device, draft_properties); + DeviceConfig main_device_config(main_device), draft_device_config(draft_device); utils::set_kv_cache_type_and_shape(main_model, main_device_config); utils::set_kv_cache_type_and_shape(draft_model, draft_device_config); @@ -81,10 +80,10 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con m_tokenizer = main_model_tokenizer; // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode - m_main_pipeline = std::make_shared(core, + m_main_pipeline = std::make_shared( main_model, main_model_tokenizer, main_model_desc.generation_config, main_device_config, main_scheduler_config_updated, main_device, main_model_desc.properties, true); - m_draft_pipeline = std::make_shared(core, + m_draft_pipeline = std::make_shared( draft_model, draft_model_tokenizer, draft_model_desc.generation_config, draft_device_config, draft_scheduler_config, draft_device, draft_properties, false); } diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index d63ae17dcf..29e481cec3 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -20,15 +20,15 @@ file(GLOB src_files "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sequence_group.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sampler.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/speculative_decoding/*.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/prompt_lookup/*.cpp" - "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/utils/*.cpp" + "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/paged_attention_transformations.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/utils.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/continuous_batching*.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/icontinuous_batching.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/lora_helper.cpp" "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/text_callback_streamer.cpp") -add_executable(${TEST_TARGET_NAME} ${tests_src} - block_allocator.cpp) +add_executable(${TEST_TARGET_NAME} ${tests_src}) + target_link_libraries(${TEST_TARGET_NAME} PRIVATE openvino::genai gtest_main gmock_main) 
target_include_directories(${TEST_TARGET_NAME} PRIVATE "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src") target_sources(${TEST_TARGET_NAME} PRIVATE ${src_files}) diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp index 0c483f0ec1..864a7b43af 100644 --- a/tests/cpp/cache_manager.cpp +++ b/tests/cpp/cache_manager.cpp @@ -7,37 +7,13 @@ #include "scheduler.hpp" #include "device_config.hpp" #include "cache_manager.hpp" -#include "openvino/op/concat.hpp" +#include "helper.hpp" using namespace ov::genai; -std::shared_ptr get_dummy_model(ov::Core core, size_t num_layers) { - ov::NodeVector keys; - ov::NodeVector values; - ov::ParameterVector params; - ov::element::Type inference_precision = core.get_property("CPU", ov::hint::inference_precision); - ov::element::Type kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16; - - auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()}); - for (size_t i = 0; i < num_layers; i++) { - auto key = std::make_shared(kv_cache_type, shape); - auto value = std::make_shared(kv_cache_type, shape); - key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)}); - value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)}); - keys.push_back(key); - values.push_back(value); - params.push_back(key); - params.push_back(value); - } - const auto& concat1 = std::make_shared(keys, 1); - const auto& concat2 = std::make_shared(values, 1); - auto model = std::make_shared(ov::NodeVector{concat1, concat2}, params); - return std::make_shared(ov::NodeVector{concat1, concat2}, params); -} - -size_t get_total_allocated_bytes(std::shared_ptr cache_manager, size_t num_decoder_layers) { +size_t get_total_allocated_bytes(std::shared_ptr cache_manager) { size_t allocated_bytes = 0; - for (size_t i = 0; i < num_decoder_layers; i++) { + for (size_t i = 0; i < cache_manager->get_num_decoder_layers(); i++) { auto key_cache = cache_manager->get_key_cache(i); auto value_cache = cache_manager->get_value_cache(i); allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); @@ -45,93 +21,98 @@ size_t get_total_allocated_bytes(std::shared_ptr cache_ return allocated_bytes; } +size_t get_num_kv_blocks(size_t cache_size, size_t block_size_bytes) { + size_t kv_cache_size_in_bytes = cache_size * 1024 * 1024 * 1024; // convert GBs to bytes + return kv_cache_size_in_bytes / block_size_bytes; +} TEST(TestCacheManager, test_cache_size_param) { ov::Core core; - ov::genai::SchedulerConfig scheduler_config; + SchedulerConfig scheduler_config; scheduler_config.max_num_batched_tokens = 32; scheduler_config.num_kv_blocks = 0; scheduler_config.cache_size = 2; scheduler_config.max_num_seqs = 2; const std::string device = "CPU"; - ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); + DeviceConfig device_config("CPU"); const size_t num_decoder_layers = 12; const std::vector kv_heads_config(num_decoder_layers, KVHeadConfig { 12, 12, 64, 64 }); device_config.set_kv_head_configs(kv_heads_config); ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request(); - auto cache_manager = std::make_shared(device_config, request, core); - auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); + auto cache_manager = std::make_shared(request, device_config); + ASSERT_EQ(num_decoder_layers, 
cache_manager->get_num_decoder_layers()); + const size_t num_kv_blocks = get_num_kv_blocks(scheduler_config.cache_size, cache_manager->get_block_size_in_bytes()); + + auto block_manager = BlockManager(num_kv_blocks, false, device_config.get_block_size(), cache_manager->get_num_decoder_layers()); cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); - - ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 2146959360); + + const size_t kv_cache_total_size = scheduler_config.cache_size * 1024 * 1024 * 1024; + const size_t cpu_block_size_total = cache_manager->get_block_size_in_bytes(); + size_t expected_size = kv_cache_total_size / cpu_block_size_total * cpu_block_size_total; + ASSERT_EQ(get_total_allocated_bytes(cache_manager), expected_size); } TEST(TestCacheManager, test_kv_blocks_param) { ov::Core core; - ov::genai::SchedulerConfig scheduler_config; + SchedulerConfig scheduler_config; scheduler_config.max_num_batched_tokens = 32; scheduler_config.num_kv_blocks = 150; scheduler_config.cache_size = 0; scheduler_config.max_num_seqs = 2; const std::string device = "CPU"; - ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); + DeviceConfig device_config("CPU"); const size_t num_decoder_layers = 12; const std::vector kv_heads_config(num_decoder_layers, KVHeadConfig { 12, 12, 64, 64 }); device_config.set_kv_head_configs(kv_heads_config); - ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request(); - auto cache_manager = std::make_shared(device_config, request, core); - auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); - OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), scheduler_config.num_kv_blocks); + auto block_manager = BlockManager(scheduler_config.num_kv_blocks, false, device_config.get_block_size(), num_decoder_layers); + ASSERT_EQ(block_manager.get_total_number_of_kv_blocks(), scheduler_config.num_kv_blocks); } TEST(TestCacheManager, test_dynamic_cache_increase) { ov::Core core; - ov::genai::SchedulerConfig scheduler_config; + SchedulerConfig scheduler_config; scheduler_config.max_num_batched_tokens = 32; scheduler_config.num_kv_blocks = 0; scheduler_config.cache_size = 0; scheduler_config.max_num_seqs = 2; const std::string device = "CPU"; - ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); + DeviceConfig device_config("CPU"); const size_t num_decoder_layers = 12; const std::vector kv_heads_config(num_decoder_layers, KVHeadConfig { 12, 12, 64, 64 }); device_config.set_kv_head_configs(kv_heads_config); - size_t block_size_in_bytes = 0; - for (size_t layer_id = 0; layer_id < num_decoder_layers; layer_id++) { - KVHeadConfig config = kv_heads_config[layer_id]; - block_size_in_bytes += config.k_head_size * config.num_k_heads + config.v_head_size * config.num_v_heads; - } - block_size_in_bytes *= device_config.get_block_size() * device_config.get_cache_precision().size(); - ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request(); - auto cache_manager = std::make_shared(device_config, request, core); - auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); + auto cache_manager = std::make_shared(request, device_config); + size_t block_size_in_bytes = cache_manager->get_block_size_in_bytes(); + const size_t 
num_kv_blocks = get_num_kv_blocks(scheduler_config.cache_size, block_size_in_bytes); + + auto block_manager = BlockManager(num_kv_blocks, false, device_config.get_block_size(), cache_manager->get_num_decoder_layers()); + ASSERT_EQ(num_decoder_layers, cache_manager->get_num_decoder_layers()); // check initial cache allocation block_manager.increase_kv_blocks_number(100); - OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), 100); + ASSERT_EQ(block_manager.get_total_number_of_kv_blocks(), 100); cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); - OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 100 * block_size_in_bytes); + ASSERT_EQ(get_total_allocated_bytes(cache_manager), 100 * block_size_in_bytes); // check cache increase block_manager.increase_kv_blocks_number(200); - OPENVINO_ASSERT(block_manager.get_total_number_of_kv_blocks(), 200); + ASSERT_EQ(block_manager.get_total_number_of_kv_blocks(), 200); cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); - OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes); + ASSERT_EQ(get_total_allocated_bytes(cache_manager), 200 * block_size_in_bytes); // check that cache does not increase if new blocks were not allocated cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); - OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes); + ASSERT_EQ(get_total_allocated_bytes(cache_manager), 200 * block_size_in_bytes); } \ No newline at end of file diff --git a/tests/cpp/device_config.cpp b/tests/cpp/device_config.cpp deleted file mode 100644 index a97037b1e8..0000000000 --- a/tests/cpp/device_config.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (C) 2018-2025 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include "openvino/runtime/core.hpp" -#include "scheduler.hpp" -#include "device_config.hpp" - -TEST(TestDeviceConfig, kv_cache_precision_u8) { - ov::Core core; - ov::genai::SchedulerConfig scheduler_config; - scheduler_config.max_num_batched_tokens = 32; - scheduler_config.num_kv_blocks = 0; - scheduler_config.cache_size = 2; - scheduler_config.max_num_seqs = 2; - - const std::string device = "CPU"; - size_t num_decoder_layers = 12; - size_t head_size = 64, head_size_u8 = head_size + 8; - - ov::genai::KVHeadConfig kv_head_config { 12, 12, head_size_u8, head_size_u8 }; - ov::genai::KVHeadConfig kv_head_config_u8 { 12, 12, head_size, head_size }; - - ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU"); - ov::genai::DeviceConfig device_config_u8(core, scheduler_config, "CPU", { ov::hint::kv_cache_precision(ov::element::u8) }); - - device_config_default.set_kv_head_configs(std::vector(num_decoder_layers, kv_head_config)); - device_config_u8.set_kv_head_configs(std::vector(num_decoder_layers, kv_head_config_u8)); - - const auto ratio = ov::element::f16.size() / ov::element::u8.size(); - ASSERT_EQ(device_config_default.get_num_kv_blocks() * ratio, device_config_u8.get_num_kv_blocks()); -} diff --git a/tests/cpp/helper.cpp b/tests/cpp/helper.cpp new file mode 100644 index 0000000000..da242da479 --- /dev/null +++ b/tests/cpp/helper.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "helper.hpp" +#include "openvino/op/concat.hpp" + +std::shared_ptr get_dummy_model(ov::Core core, size_t num_layers) { + 
ov::NodeVector keys, values; + ov::ParameterVector params; + ov::element::Type kv_cache_type = core.get_property("CPU", ov::hint::kv_cache_precision); + + auto shape = ov::PartialShape::dynamic(4); + for (size_t i = 0; i < num_layers; i++) { + auto key = std::make_shared(kv_cache_type, shape); + auto value = std::make_shared(kv_cache_type, shape); + key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)}); + value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)}); + keys.push_back(key); + values.push_back(value); + params.push_back(key); + params.push_back(value); + } + const auto& concat1 = std::make_shared(keys, 1); + const auto& concat2 = std::make_shared(values, 1); + auto model = std::make_shared(ov::NodeVector{concat1, concat2}, params); + return std::make_shared(ov::NodeVector{concat1, concat2}, params); +} diff --git a/tests/cpp/helper.hpp b/tests/cpp/helper.hpp new file mode 100644 index 0000000000..1fafe8bcf6 --- /dev/null +++ b/tests/cpp/helper.hpp @@ -0,0 +1,8 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/runtime/core.hpp" + +std::shared_ptr get_dummy_model(ov::Core core, size_t num_layers); \ No newline at end of file diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index 201318347a..b6aa5a9b53 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -9,6 +9,7 @@ #include "openvino/genai/generation_config.hpp" #include "sequence_group.hpp" #include "scheduler.hpp" +#include "helper.hpp" using namespace ov::genai; @@ -18,39 +19,16 @@ void clear_finished_sequences(std::vector& requests) { }); requests.erase(new_end, requests.end()); } -std::shared_ptr get_model(ov::Core core, size_t num_layers) { - ov::NodeVector keys; - ov::NodeVector values; - ov::ParameterVector params; - ov::element::Type inference_precision = core.get_property("CPU", ov::hint::inference_precision); - ov::element::Type kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16; - - auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()}); - for (size_t i = 0; i < num_layers; i++) { - auto key = std::make_shared(kv_cache_type, shape); - auto value = std::make_shared(kv_cache_type, shape); - key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)}); - value->get_output_tensor(0).set_names({"value_cache." 
+ std::to_string(i)}); - keys.push_back(key); - values.push_back(value); - params.push_back(key); - params.push_back(value); - } - const auto& concat1 = std::make_shared(keys, 1); - const auto& concat2 = std::make_shared(values, 1); - auto model = std::make_shared(ov::NodeVector{concat1, concat2}, params); - return std::make_shared(ov::NodeVector{concat1, concat2}, params); -} std::shared_ptr init_cache_manager(SchedulerConfig scheduler_config) { ov::Core core = ov::Core(); size_t num_decoder_layers = 12; - ov::InferRequest request = core.compile_model(get_model(core, num_decoder_layers)).create_infer_request(); - size_t head_size = 64, head_size_u8 = head_size + 8; - std::vector kv_head_configs(num_decoder_layers, KVHeadConfig { 12, 12, head_size_u8, head_size_u8 }); - ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); + ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request(); + const size_t head_size = 64; + std::vector kv_head_configs(num_decoder_layers, KVHeadConfig { 12, 12, head_size, head_size }); + ov::genai::DeviceConfig device_config("CPU"); device_config.set_kv_head_configs(kv_head_configs); - return std::make_shared(device_config, request, core); + return std::make_shared(request, device_config); } TEST(TestScheduler, general_test) { diff --git a/tests/cpp/speculative_decoding.cpp b/tests/cpp/speculative_decoding.cpp index 1cf8db0fab..114f16800b 100644 --- a/tests/cpp/speculative_decoding.cpp +++ b/tests/cpp/speculative_decoding.cpp @@ -13,8 +13,7 @@ class CBForSDTest : public testing::Test, public ov::genai::ContinuousBatchingPi m_sampler = std::make_shared(); }; - ov::genai::GenerationHandle - add_request(uint64_t request_id, const ov::Tensor& input_ids) { + ov::genai::GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids) { auto sampling_params = ov::genai::greedy(); sampling_params.num_assistant_tokens = 1;