CB: rely on KV cache precisions from plugins
ilya-lavrenov committed Jan 27, 2025
1 parent 48613d5 commit b380ccc
Showing 16 changed files with 302 additions and 292 deletions.
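In short, after this change the continuous batching (CB) pipeline trusts whatever KV cache precision the plugin reports through the compiled model, instead of deriving it from DeviceConfig. The stand-alone snippet below is a hedged illustration of that introspection, not code from the commit: the model path is a placeholder, and it assumes an LLM exported with the Paged Attention transformation so that its inputs are named key_cache.<idx> / value_cache.<idx>.

#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Placeholder path: any LLM exported with the Paged Attention transformation applied.
    ov::CompiledModel compiled = core.compile_model("/path/to/paged_attention_model.xml", "CPU");

    // The plugin decides the final KV cache precision; the pipeline now simply reads it back.
    for (const auto& input : compiled.inputs()) {
        for (const auto& name : input.get_names()) {
            if (name.find("key_cache.") == 0 || name.find("value_cache.") == 0) {
                std::cout << name << ": " << input.get_element_type()
                          << " " << input.get_partial_shape() << std::endl;
                break;
            }
        }
    }
    return 0;
}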
138 changes: 86 additions & 52 deletions src/cpp/src/cache_manager.hpp
@@ -45,19 +45,19 @@ class TensorMmapAllocator {
#endif

namespace ov::genai {

class CacheManager {
DeviceConfig m_device_config;
std::vector<ov::Tensor> m_key_cache;
std::vector<ov::Tensor> m_value_cache;
size_t m_num_allocated_kv_blocks = 0;
size_t m_num_decoder_layers = 0;
std::string m_device;
std::vector<ov::element::Type> m_key_precisions, m_value_precisions;
std::vector<ov::PartialShape> m_key_shapes, m_value_shapes;
std::vector<ov::Tensor> m_key_cache, m_value_cache;
size_t m_num_allocated_kv_blocks = 0, m_block_size_in_bytes = 0;
ov::InferRequest m_request;
ov::Core m_core;

ov::Shape set_first_dim_and_make_static(const ov::PartialShape& shape, size_t dim) {
ov::PartialShape res_shape = shape;
res_shape[0] = dim;
OPENVINO_ASSERT(res_shape.is_static());
return res_shape.to_shape();
static ov::Shape set_kv_blocks(ov::PartialShape pshape, size_t num_kv_blocks) {
pshape[0] = num_kv_blocks;
return pshape.get_shape();
}

void update_request_tensor(size_t decoder_layer_id) {
@@ -66,40 +66,83 @@ class CacheManager {
}

public:
explicit CacheManager(const DeviceConfig &device_config, ov::InferRequest request, ov::Core core) :
m_device_config(device_config),
m_request(request),
m_core(core) {
m_key_cache.reserve(m_device_config.get_num_layers());
m_value_cache.reserve(m_device_config.get_num_layers());
explicit CacheManager(ov::InferRequest request) :
m_request(request) {
// extract information about inference device
ov::CompiledModel compiled_model = request.get_compiled_model();
std::vector<std::string> execution_devices = compiled_model.get_property(ov::execution_devices);
OPENVINO_ASSERT(execution_devices.size() == 1, "Continuous batching: execution device is expected to be CPU or GPU, but got ", execution_devices.size(), " devices");
m_device = execution_devices[0];

// extract information about KV cache precisions and shapes
for (const auto& input : compiled_model.inputs()) {
for (auto & name : input.get_names()) {
const auto& pshape = input.get_partial_shape();
auto element_type = input.get_element_type();

if (name.find("key_cache.") == 0) {
m_key_shapes.push_back(pshape);
m_key_precisions.push_back(element_type);
m_block_size_in_bytes += pshape[1].get_length() * pshape[2].get_length() * pshape[3].get_length() * element_type.size();
break;
} else if (name.find("value_cache.") == 0) {
m_value_shapes.push_back(pshape);
m_value_precisions.push_back(element_type);
m_block_size_in_bytes += pshape[1].get_length() * pshape[2].get_length() * pshape[3].get_length() * element_type.size();
break;
}
}
}

m_num_decoder_layers = m_value_precisions.size();
OPENVINO_ASSERT(m_num_decoder_layers == m_key_precisions.size(), "Invalid case: a different number of K and V caches in an LLM model");
}

size_t get_num_decoder_layers() const {
return m_num_decoder_layers;
}

std::string get_device() const {
return m_device;
}

ov::element::Type get_key_cache_precision(size_t decoder_layer_id) const {
OPENVINO_ASSERT(decoder_layer_id < m_key_precisions.size());
return m_key_precisions[decoder_layer_id];
}

ov::element::Type get_value_cache_precision(size_t decoder_layer_id) const {
OPENVINO_ASSERT(decoder_layer_id < m_value_precisions.size());
return m_value_precisions[decoder_layer_id];
}

size_t get_block_size_in_bytes() const {
return m_block_size_in_bytes;
}

void allocate_cache_if_needed(size_t num_kv_blocks) {
if (m_num_allocated_kv_blocks >= num_kv_blocks) {
return;
}
OPENVINO_ASSERT(m_key_cache.size() == m_value_cache.size());
m_num_allocated_kv_blocks = num_kv_blocks;

const std::string device_name = m_device_config.get_device();
m_num_allocated_kv_blocks = num_kv_blocks;

ov::Coordinate start_key{0,0,0,0};
ov::Coordinate start_value{0,0,0,0};

if (device_name.find("GPU") == std::string::npos) { // Allocate KV caches
for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks);
ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks);
if (m_device.find("GPU") == std::string::npos) { // Allocate KV caches
for (size_t decoder_layer_id = 0; decoder_layer_id < m_num_decoder_layers; ++decoder_layer_id) {
ov::Shape value_cache_shape = set_kv_blocks(m_value_shapes[decoder_layer_id], num_kv_blocks);
ov::Shape key_cache_shape = set_kv_blocks(m_key_shapes[decoder_layer_id], num_kv_blocks);
#ifdef _WIN32
ov::Tensor key_cache(m_device_config.get_cache_precision(), key_cache_shape);
ov::Tensor value_cache(m_device_config.get_cache_precision(), value_cache_shape);
ov::Tensor key_cache(get_key_cache_precision(decoder_layer_id), key_cache_shape);
ov::Tensor value_cache(get_value_cache_precision(decoder_layer_id), value_cache_shape);
#else
auto key_size = ov::shape_size(key_cache_shape) * m_device_config.get_cache_precision().size();
auto value_size = ov::shape_size(value_cache_shape) * m_device_config.get_cache_precision().size();

ov::Tensor key_cache = ov::Tensor(m_device_config.get_cache_precision(), key_cache_shape, TensorMmapAllocator(key_size));
ov::Tensor value_cache = ov::Tensor(m_device_config.get_cache_precision(), value_cache_shape, TensorMmapAllocator(value_size));
auto key_size = ov::shape_size(key_cache_shape) * get_key_cache_precision(decoder_layer_id).size();
auto value_size = ov::shape_size(value_cache_shape) * get_value_cache_precision(decoder_layer_id).size();

ov::Tensor key_cache = ov::Tensor(get_key_cache_precision(decoder_layer_id), key_cache_shape, TensorMmapAllocator(key_size));
ov::Tensor value_cache = ov::Tensor(get_value_cache_precision(decoder_layer_id), value_cache_shape, TensorMmapAllocator(value_size));
#endif

auto key_cache_roi_end = static_cast<unsigned char*>(key_cache.data());
@@ -146,15 +189,13 @@
update_request_tensor(decoder_layer_id);
}
} else {
auto remote_context = m_core.get_default_context(device_name);
for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
ov::Shape value_cache_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), num_kv_blocks);
ov::Shape key_cache_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), num_kv_blocks);
ov::Tensor key_cache = remote_context.create_tensor(m_device_config.get_cache_precision(),
key_cache_shape);
ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(),
value_cache_shape);

auto remote_context = m_request.get_compiled_model().get_context();
for (size_t decoder_layer_id = 0; decoder_layer_id < m_num_decoder_layers; ++decoder_layer_id) {
ov::Shape value_cache_shape = set_kv_blocks(m_value_shapes[decoder_layer_id], num_kv_blocks);
ov::Shape key_cache_shape = set_kv_blocks(m_key_shapes[decoder_layer_id], num_kv_blocks);
ov::Tensor key_cache = remote_context.create_tensor(get_key_cache_precision(decoder_layer_id), key_cache_shape);
ov::Tensor value_cache = remote_context.create_tensor(get_value_cache_precision(decoder_layer_id), value_cache_shape);

if (m_key_cache.size() > decoder_layer_id) {
ov::Coordinate end_key = m_key_cache[decoder_layer_id].get_shape();
ov::Coordinate end_value = m_value_cache[decoder_layer_id].get_shape();
@@ -178,12 +219,12 @@
}

ov::Tensor get_key_cache(size_t decoder_layer_id) const {
OPENVINO_ASSERT(decoder_layer_id < m_key_cache.size());
OPENVINO_ASSERT(decoder_layer_id < m_key_cache.size(), "decoder_layer_id = ", decoder_layer_id, ", num_layers = ", m_key_cache.size());
return m_key_cache[decoder_layer_id];
}

ov::Tensor get_value_cache(size_t decoder_layer_id) const {
OPENVINO_ASSERT(decoder_layer_id < m_value_cache.size());
OPENVINO_ASSERT(decoder_layer_id < m_value_cache.size(), "decoder_layer_id = ", decoder_layer_id, ", num_layers = ", m_value_cache.size());
return m_value_cache[decoder_layer_id];
}

@@ -192,9 +233,9 @@
size_t src_block_id = blocks_pair.first;
const std::list<size_t>& dst_block_ids = blocks_pair.second;
for (size_t dst_block_id : dst_block_ids) {
for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) {
ov::Shape key_shape = set_first_dim_and_make_static(m_device_config.get_key_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks);
ov::Shape value_shape = set_first_dim_and_make_static(m_device_config.get_value_cache_shape(decoder_layer_id), m_num_allocated_kv_blocks);
for (size_t decoder_layer_id = 0; decoder_layer_id < m_num_decoder_layers; ++decoder_layer_id) {
ov::Shape key_shape = set_kv_blocks(m_key_shapes[decoder_layer_id], m_num_allocated_kv_blocks);
ov::Shape value_shape = set_kv_blocks(m_value_shapes[decoder_layer_id], m_num_allocated_kv_blocks);
ov::Coordinate key_src_start_roi(key_shape.size(), 0);
ov::Coordinate key_src_end_roi = key_shape;
ov::Coordinate key_dst_start_roi(key_shape.size(), 0);
@@ -221,13 +262,6 @@
}
}
}

std::shared_ptr<Core> get_core() {
return std::make_shared<Core>(m_core);
}

std::shared_ptr<DeviceConfig> get_device_config() {
return std::make_shared<DeviceConfig>(m_device_config);
}
};

}
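For orientation, here is a minimal, hypothetical usage sketch of the reworked CacheManager above; it is not part of this commit, the include path and helper name are illustrative, and it assumes the infer request was created from a model compiled with the Paged Attention transformation.

#include <cstddef>
#include <iostream>
#include <openvino/openvino.hpp>
#include "cache_manager.hpp" // illustrative include path for the header shown above

// Hypothetical helper: device, KV cache precisions and per-layer shapes are now all
// derived from request.get_compiled_model(), so no DeviceConfig is involved.
void sketch_kv_cache_setup(ov::InferRequest request, size_t num_kv_blocks) {
    ov::genai::CacheManager cache_manager(request);

    std::cout << "device: " << cache_manager.get_device() << std::endl;
    for (size_t layer = 0; layer < cache_manager.get_num_decoder_layers(); ++layer) {
        std::cout << "layer " << layer
                  << ", K precision: " << cache_manager.get_key_cache_precision(layer)
                  << ", V precision: " << cache_manager.get_value_cache_precision(layer) << std::endl;
    }

    // Allocates (or grows) the per-layer KV tensors and rebinds them to the request;
    // it returns early if num_kv_blocks does not exceed what is already allocated.
    cache_manager.allocate_cache_if_needed(num_kv_blocks);
}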
114 changes: 99 additions & 15 deletions src/cpp/src/continuous_batching_impl.cpp
@@ -10,6 +10,85 @@
#include "utils/paged_attention_transformations.hpp"
#include "lora_helper.hpp"
#include "cache_state_dumper.hpp"
#include "utils.hpp"

namespace {

ov::element::Type get_model_kv_cache_precision(std::shared_ptr<ov::Model> model) {
const std::vector<std::string> kv_cache_precision_path = { "runtime_options", ov::hint::kv_cache_precision.name() };
ov::element::Type ir_kv_cache_precision = ov::element::undefined;

if (model->has_rt_info(kv_cache_precision_path)) {
ir_kv_cache_precision = model->get_rt_info<ov::element::Type>(kv_cache_precision_path);
}

return ir_kv_cache_precision;
}

void apply_kv_cache_precision(const std::shared_ptr<ov::Model>& model, const std::string& device, const ov::AnyMap& plugin_config) {
ov::element::Type m_kv_cache_type = ov::element::undefined, ir_kv_cache_precision = get_model_kv_cache_precision(model);
ov::Core core = ov::genai::utils::singleton_core();

auto inference_precision = core.get_property(device, ov::hint::inference_precision);
// if user sets properties affecting KV cache precision
const auto inference_precision_it = plugin_config.find(ov::hint::inference_precision.name());
const auto kv_cache_precision_it = plugin_config.find(ov::hint::kv_cache_precision.name());
const auto execution_mode_it = plugin_config.find(ov::hint::execution_mode.name());
const bool accuracy_mode = execution_mode_it != plugin_config.end() &&
execution_mode_it->second.as<ov::hint::ExecutionMode>() == ov::hint::ExecutionMode::ACCURACY;

if (device == "CPU") {
if (kv_cache_precision_it != plugin_config.end()) {
const auto kv_cache_precision = kv_cache_precision_it->second.as<ov::element::Type>();
m_kv_cache_type = kv_cache_precision;
} else if (accuracy_mode) {
// ACCURACY mode will use f32 KV cache type
m_kv_cache_type = ov::element::f32;
} else if (ir_kv_cache_precision != ov::element::undefined) {
// check that kv_cache_precision is set in runtime_info section of OpenVINO IR
// but in case it's set to FP16, we need to patch it to be BF16 for Xeon platforms
m_kv_cache_type = ir_kv_cache_precision == ov::element::f16 && inference_precision == ov::element::bf16 ?
inference_precision : ir_kv_cache_precision;
} else {
// x86 and ARM have different default kv cache type, take this information from the plugin
m_kv_cache_type = core.get_property(device, ov::hint::kv_cache_precision);
}
} else if (device.find("GPU") != std::string::npos) {
if (accuracy_mode) {
inference_precision = ov::element::f32;
}
if (inference_precision_it != plugin_config.end()) {
inference_precision = inference_precision_it->second.as<ov::element::Type>();
}

m_kv_cache_type = inference_precision;
} else {
OPENVINO_THROW(device, " is not supported by OpenVINO Continuous Batching");
}

std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>> key_cache_params, value_cache_params;
for (const auto& param_ptr : model->get_parameters()) {
const auto& name = param_ptr->get_friendly_name();
if (name.find("key_cache.") == 0) {
key_cache_params[name] = param_ptr;
} else if (name.find("value_cache.") == 0) {
value_cache_params[name] = param_ptr;
}
}

OPENVINO_ASSERT(key_cache_params.size() == value_cache_params.size() && key_cache_params.size() > 0);

size_t num_decoder_layers = key_cache_params.size();
for (size_t idx = 0; idx < num_decoder_layers; idx++) {
auto k = key_cache_params[std::string("key_cache.") + std::to_string(idx)];
auto v = value_cache_params[std::string("value_cache.") + std::to_string(idx)];

k->set_element_type(m_kv_cache_type);
v->set_element_type(m_kv_cache_type);
}
}

} // namespace
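For context, here is a hedged sketch of how the properties consulted by apply_kv_cache_precision() might be supplied from user code; the ContinuousBatchingPipeline constructor signature and the model path are assumptions based on the public GenAI API rather than on this diff.

#include <openvino/openvino.hpp>
#include <openvino/genai/continuous_batching_pipeline.hpp> // assumed public header

int main() {
    ov::genai::SchedulerConfig scheduler_config;
    scheduler_config.cache_size = 2; // KV cache budget in GB, turned into blocks during pipeline init

    // Properties inspected by apply_kv_cache_precision(): on CPU an explicit
    // kv_cache_precision hint wins, and ExecutionMode::ACCURACY forces f32.
    ov::AnyMap properties = {
        { ov::hint::kv_cache_precision.name(), ov::element::u8 }
        // { ov::hint::execution_mode.name(), ov::hint::ExecutionMode::ACCURACY }
    };

    ov::genai::ContinuousBatchingPipeline pipeline(
        "/path/to/llm_model_dir", // hypothetical model location
        scheduler_config,
        "CPU",
        properties);
    return 0;
}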

namespace ov::genai {
template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;};
@@ -27,15 +106,14 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
m_generation_config = generation_config;
m_is_validation_mode_enabled = is_validation_mode_enabled;

ov::Core core = utils::singleton_core();
DeviceConfig device_config(core, scheduler_config, device, properties);
DeviceConfig device_config(scheduler_config, device, properties);

bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction;
bool allow_cache_rotation = scheduler_config.cache_eviction_config.apply_rotation;
utils::apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control, allow_cache_rotation);
utils::apply_gather_before_matmul_transformation(model);

initialize_pipeline(model, scheduler_config, properties, device_config, core);
initialize_pipeline(model, scheduler_config, properties, device_config);
}

ContinuousBatchingPipeline::ContinuousBatchingImpl::~ContinuousBatchingImpl() {
@@ -55,10 +133,13 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::initialize_pipeline(
std::shared_ptr<ov::Model> model,
const SchedulerConfig& scheduler_config,
const ov::AnyMap& properties,
const DeviceConfig& device_config,
ov::Core& core) {
const DeviceConfig& device_config) {
ov::Core core = utils::singleton_core();
ov::CompiledModel compiled_model;

// TODO: remove once plugins automatically set KV cache precisions
apply_kv_cache_precision(model, device_config.get_device(), properties);

// apply LoRA
if (auto filtered_properties = extract_adapters_from_properties(properties, &m_generation_config.adapters)) {
m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
@@ -71,24 +152,27 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::initialize_pipeline(
ov::genai::utils::print_compiled_model_properties(compiled_model, "LLM with Paged Attention");
ov::InferRequest infer_request = compiled_model.create_infer_request();

m_num_decoder_layers = device_config.get_num_layers();
// Cache manager
std::shared_ptr<CacheManager> cache_manager = std::make_shared<CacheManager>(infer_request);
m_num_decoder_layers = cache_manager->get_num_decoder_layers();

// setup KV caches
std::shared_ptr<CacheManager> cache_manager = std::make_shared<CacheManager>(device_config, infer_request, core);

SchedulerConfig updated_config = scheduler_config;
// update KV blocks number in scheduler config
if (scheduler_config.num_kv_blocks != device_config.get_num_kv_blocks()) {
updated_config.num_kv_blocks = device_config.get_num_kv_blocks();
// Scheduler
SchedulerConfig normalized_config = scheduler_config;
if (normalized_config.num_kv_blocks == 0 && normalized_config.cache_size > 0) {
size_t size_in_bytes = normalized_config.cache_size * 1024 * 1024 * 1024; // convert GBs to bytes
normalized_config.num_kv_blocks = size_in_bytes / cache_manager->get_block_size_in_bytes();
}

bool can_use_partial_preemption = true;
if (device_config.get_device().find("GPU") != std::string::npos && !updated_config.dynamic_split_fuse) {
if (device_config.get_device().find("GPU") != std::string::npos && !normalized_config.dynamic_split_fuse) {
// in case of executing a `vLLM-like` pipeline, it's better not to use partial eviction on the GPU,
// as it may lead to performance slowdown
can_use_partial_preemption = false;
}
m_scheduler = std::make_shared<Scheduler>(device_config.get_block_size(), cache_manager, updated_config, device_config.get_num_layers(), can_use_partial_preemption);

m_scheduler = std::make_shared<Scheduler>(device_config.get_block_size(), cache_manager, normalized_config, m_num_decoder_layers, can_use_partial_preemption);

// Model Runner
bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction;
if (is_use_cache_eviction) {
const auto& eviction_config = m_scheduler->get_config().cache_eviction_config;
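To make the scheduler normalization above concrete, here is a small worked example, with illustrative tensor dimensions not taken from any particular model, of how a cache_size budget in GB becomes num_kv_blocks via CacheManager::get_block_size_in_bytes().

#include <cstddef>
#include <cstdio>

int main() {
    // Illustrative dimensions only: 32 decoder layers, 8 KV heads, a block length of
    // 32 tokens, head size 128 and a u8 KV cache (1 byte per element).
    const size_t num_layers = 32, kv_heads = 8, block_len = 32, head_size = 128, elem_size = 1;

    // Mirrors the accumulation in the CacheManager constructor:
    // pshape[1] * pshape[2] * pshape[3] * element_type.size(), summed over K and V of every layer.
    const size_t block_size_in_bytes = 2 * num_layers * kv_heads * block_len * head_size * elem_size;

    const size_t cache_size_gb = 2;
    const size_t size_in_bytes = cache_size_gb * 1024 * 1024 * 1024;

    // This quotient is what ends up in normalized_config.num_kv_blocks.
    const size_t num_kv_blocks = size_in_bytes / block_size_in_bytes;

    std::printf("one block costs %zu bytes, the budget yields %zu KV blocks\n",
                block_size_in_bytes, num_kv_blocks);
    return 0;
}

With these assumed dimensions a single block costs 2 MiB across all layers, so a 2 GB budget yields 1024 KV blocks.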
4 changes: 1 addition & 3 deletions src/cpp/src/continuous_batching_impl.hpp
@@ -59,9 +59,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
void initialize_pipeline(std::shared_ptr<ov::Model> model,
const SchedulerConfig& scheduler_config,
const ov::AnyMap& plugin_config,
const DeviceConfig& device_config,
ov::Core& core);

const DeviceConfig& device_config);

/**
* Pulls requests from awaiting queue to running queue
