diff --git a/samples/cpp/text_generation/benchmark_genai.cpp b/samples/cpp/text_generation/benchmark_genai.cpp
index d389e94432..76f2cbef2c 100644
--- a/samples/cpp/text_generation/benchmark_genai.cpp
+++ b/samples/cpp/text_generation/benchmark_genai.cpp
@@ -8,7 +8,7 @@ int main(int argc, char* argv[]) try {
     cxxopts::Options options("benchmark_vanilla_genai", "Help command");

     options.add_options()
-    ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value("."))
+    ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>())
     ("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because"))
     ("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1)))
     ("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(3)))
@@ -35,15 +35,15 @@ int main(int argc, char* argv[]) try {
     std::string device = result["device"].as<std::string>();
     size_t num_warmup = result["num_warmup"].as<size_t>();
     size_t num_iter = result["num_iter"].as<size_t>();
-    
+
     ov::genai::GenerationConfig config;
     config.max_new_tokens = result["max_new_tokens"].as<size_t>();
     ov::genai::LLMPipeline pipe(models_path, device);
-    
+
     for (size_t i = 0; i < num_warmup; i++)
         pipe.generate(prompt, config);
-    
+
     ov::genai::DecodedResults res = pipe.generate(prompt, config);
     ov::genai::PerfMetrics metrics = res.perf_metrics;
     for (size_t i = 0; i < num_iter - 1; i++) {
@@ -60,7 +60,7 @@ int main(int argc, char* argv[]) try {
     std::cout << "TPOT: " << metrics.get_tpot().mean << " ± " << metrics.get_tpot().std << " ms/token " << std::endl;
     std::cout << "Throughput: " << metrics.get_throughput().mean << " ± " << metrics.get_throughput().std << " tokens/s" << std::endl;
-    return 0;
+    return EXIT_SUCCESS;
 } catch (const std::exception& error) {
     try {
         std::cerr << error.what() << '\n';
diff --git a/samples/python/text_generation/benchmark_genai.py b/samples/python/text_generation/benchmark_genai.py
index d2cc91e04d..a5ed4acfc1 100755
--- a/samples/python/text_generation/benchmark_genai.py
+++ b/samples/python/text_generation/benchmark_genai.py
@@ -6,7 +6,7 @@

 def main():
     parser = argparse.ArgumentParser(description="Help command")
-    parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
+    parser.add_argument("-m", "--model", type=str, required=True, help="Path to model and tokenizers base directory")
     parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt")
     parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
     parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations")
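The benchmark change above makes the model path a required argument and keeps the warm-up/measurement loop unchanged. For reference, a self-contained sketch of that measurement pattern with the API used in the sample; the model directory and iteration counts are placeholders, and the `metrics + res.perf_metrics` accumulation assumes PerfMetrics provides operator+ as the shipped sample relies on:

// Sketch only: mirrors the sample's warm-up + measurement loop.
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    const std::string models_path = "<path-to-model-dir>";  // placeholder, as required by -m/--model
    const std::string prompt = "The Sky is blue because";

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 20;

    ov::genai::LLMPipeline pipe(models_path, "CPU");
    pipe.generate(prompt, config);  // warm-up iteration

    ov::genai::DecodedResults res = pipe.generate(prompt, config);
    ov::genai::PerfMetrics metrics = res.perf_metrics;
    for (size_t i = 0; i < 2; ++i) {  // further iterations accumulated into one PerfMetrics
        res = pipe.generate(prompt, config);
        metrics = metrics + res.perf_metrics;  // assumes PerfMetrics::operator+, as in the sample
    }

    std::cout << "TTFT: " << metrics.get_ttft().mean << " ms" << std::endl;
    std::cout << "TPOT: " << metrics.get_tpot().mean << " ms/token" << std::endl;
    std::cout << "Throughput: " << metrics.get_throughput().mean << " tokens/s" << std::endl;
    return EXIT_SUCCESS;
}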
diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
index 706986deff..ed9fc3a30d 100644
--- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -48,11 +48,6 @@ struct PipelineMetrics {
      * Running average of the KV cache usage during the lifetime of the pipeline, with max window size of 1000 steps
      */
     float avg_cache_usage = 0.0;
-
-    /**
-     * Number of tokens scheduled for processing at the previous step of the pipeline.
-     */
-    size_t total_num_scheduled_tokens;
 };

 class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp
index 948baab6f4..e7a7c40f9b 100644
--- a/src/cpp/include/openvino/genai/llm_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -121,10 +121,6 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
         const ov::genai::GenerationConfig& generation_config = {}
     );

-    OPENVINO_DEPRECATED("Please, specify device explicitly when create LLMPipeline. This overload will be removed in 2025.0.0 release")
-    explicit LLMPipeline(const std::filesystem::path& path) :
-        LLMPipeline(path, "CPU") { }
-
    /**
    * @brief Constructs an LLMPipeline from xml/bin files, tokenizers and configuration in the same dir.
    * Accepts arbitrary list of optional properties.
@@ -153,7 +149,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     LLMPipeline(
         const ov::InferRequest& request,
         const ov::genai::Tokenizer& tokenizer,
-        OptionalGenerationConfig generation_config=std::nullopt
+        OptionalGenerationConfig generation_config = std::nullopt
     );

    /**
@@ -172,10 +168,6 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
         const ov::AnyMap& properties = {}
     );

-    OPENVINO_DEPRECATED("Please, specify device explicitly when create LLMPipeline. This overload will be removed in 2025.0.0 release")
-    LLMPipeline(const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer) :
-        LLMPipeline(models_path, tokenizer, "CPU") { }
-
     ~LLMPipeline();

    /**
@@ -211,7 +203,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     DecodedResults operator()(
         StringInputs inputs,
-        OptionalGenerationConfig generation_config=std::nullopt,
+        OptionalGenerationConfig generation_config = std::nullopt,
         StreamerVariant streamer=std::monostate()
     ) {
         return generate(inputs, generation_config, streamer);
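Since the deprecated path-only constructors are removed above, callers always name the device explicitly. A minimal construction sketch; the paths and the CACHE_DIR value are placeholders, and the ov::AnyMap overload shown is the one visible in the header context:

#include <filesystem>

#include "openvino/genai/llm_pipeline.hpp"

void construct_pipelines() {
    std::filesystem::path models_path = "<path-to-model-dir>";  // placeholder

    // The device is now always explicit; "CPU" is no longer an implicit default.
    ov::genai::LLMPipeline cpu_pipe(models_path, "CPU");

    // Optional plugin properties still follow the device, via the ov::AnyMap overload.
    ov::genai::LLMPipeline gpu_pipe(models_path, "GPU", ov::AnyMap{{"CACHE_DIR", "llm_cache"}});
}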
diff --git a/src/cpp/src/continuous_batching_adapter.hpp b/src/cpp/src/continuous_batching_adapter.hpp
index a7c657eecb..efcd2ec382 100644
--- a/src/cpp/src/continuous_batching_adapter.hpp
+++ b/src/cpp/src/continuous_batching_adapter.hpp
@@ -101,13 +101,14 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
             std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies));
             std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores));
         }
+        PerfMetrics perf_metrics;
         // For GenerationResults, all perf_metrics are the same except tokenization and detokenization durations.
         // Since we return here only one perf_metrics, we should accumulate all tokenization and detokenization times.
         if (generated.size() > 0) {
             perf_metrics = generated[0].perf_metrics;
         }
-        
+
         // Tokenizations and detokenization times are dispersed across GenerationResult vector.
         // Need to collect them into a single perf_metric for DecodedResult.
         auto& raw_metrics = perf_metrics.raw_metrics;
diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index bfc1085a12..e778e55b93 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -151,13 +151,20 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
         m_pipeline_metrics.max_cache_usage = std::max(m_pipeline_metrics.max_cache_usage, scheduler_output.m_cache_usage);
         _register_step_cache_usage(scheduler_output.m_cache_usage);
         m_pipeline_metrics.avg_cache_usage = _get_current_running_average_cache_usage();
-        m_pipeline_metrics.total_num_scheduled_tokens = scheduler_output.m_total_num_scheduled_tokens;
+
+        m_batch_size = 0; // total number of running sequences
+        for (size_t i = 0; i < scheduler_output.m_scheduled_sequence_groups_ids.size(); ++i) {
+            size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i];
+            SequenceGroup::CPtr sequence_group = m_requests[seq_group_id];
+            m_batch_size += sequence_group->num_running_seqs();
+        }

         static ManualTimer copy_blocks_timer("scheduling");
         copy_blocks_timer.start();
         m_cache_manager->copy_blocks(scheduler_output.m_block_copy_map);
         copy_blocks_timer.end();
     }

+    // if no tokens were scheduled, we are out of memory => free all requests and return
     if (scheduler_output.m_total_num_scheduled_tokens == 0) {
         for (size_t i = 0; i < m_requests.size(); ++i) {
@@ -297,14 +304,13 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector
-            if (num_generated_tokens > 0) {
+            if (m_batch_size > 0) {
                 const auto infer_end = std::chrono::steady_clock::now();
-                const auto infer_ms = PerfMetrics::get_microsec(infer_end - infer_start);
+                const auto infer_ms = PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
                 raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
                 raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms);
                 raw_perf_counters.m_new_token_times.emplace_back(infer_end);
-                raw_perf_counters.m_batch_sizes.emplace_back(num_generated_tokens);
+                raw_perf_counters.m_batch_sizes.emplace_back(m_batch_size);
             }
         } catch (...) {
             drop_requests(); // remove all requests from pipeline state in case of exception
@@ -360,10 +366,11 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector
             get_status();
-        
+
         // The same perf metrics for each sequence, only tokenization/detokenization will differ.
         perf_metrics.raw_metrics.generate_durations.clear();
         perf_metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(std::chrono::steady_clock::now() - start_time));
+        perf_metrics.num_input_tokens = request->get_prompt_len();
         perf_metrics.evaluate_statistics(start_time);
         result.perf_metrics = perf_metrics;
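The step() change replaces the scheduler's token count with the number of running sequences across the scheduled groups. Reduced to a stub (SequenceGroup stands in for the real class), the counting logic is:

#include <cstddef>
#include <memory>
#include <vector>

// Reduced stand-in for ov::genai's SequenceGroup, only for illustration.
struct SequenceGroupStub {
    size_t running_seqs;
    size_t num_running_seqs() const { return running_seqs; }
};

// Mirrors the loop added to ContinuousBatchingImpl::step(): the batch size of a step
// is the total number of running sequences across all scheduled sequence groups.
size_t count_scheduled_sequences(const std::vector<size_t>& scheduled_group_ids,
                                 const std::vector<std::shared_ptr<SequenceGroupStub>>& requests) {
    size_t batch_size = 0;
    for (size_t id : scheduled_group_ids) {
        batch_size += requests[id]->num_running_seqs();
    }
    return batch_size;
}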
diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp
index 99e999d19e..78e6638fbc 100644
--- a/src/cpp/src/continuous_batching_impl.hpp
+++ b/src/cpp/src/continuous_batching_impl.hpp
@@ -30,6 +30,10 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
     static const size_t AVG_CACHE_USAGE_WINDOW_SIZE_IN_STEPS = 1000;
     std::deque m_previous_step_cache_usages;

+    // for perf metrics
+    float m_load_time_ms = 0.0f;
+    size_t m_batch_size = 0; // stored number of scheduled sequences on last step
+
     // flag to enable validation mode for sampler
     bool m_is_validation_mode_enabled = false;

@@ -75,7 +79,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
     void _register_step_cache_usage(float step_cache_usage);
     float _get_current_running_average_cache_usage() const;

-    void drop_requests() override;
+    virtual void drop_requests();

 public:
     ContinuousBatchingImpl(const std::shared_ptr& model,
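The new members above sit next to the existing bounded cache-usage history (m_previous_step_cache_usages with AVG_CACHE_USAGE_WINDOW_SIZE_IN_STEPS). As a generic illustration of that kind of windowed running average, with no dependency on the pipeline classes:

#include <cstddef>
#include <deque>
#include <numeric>

// Running average over at most `window` most recent samples, as used for avg_cache_usage.
class WindowedAverage {
public:
    explicit WindowedAverage(size_t window) : m_window(window) {}

    void add(float sample) {
        m_samples.push_back(sample);
        if (m_samples.size() > m_window)
            m_samples.pop_front();
    }

    float average() const {
        if (m_samples.empty())
            return 0.0f;
        float sum = std::accumulate(m_samples.begin(), m_samples.end(), 0.0f);
        return sum / m_samples.size();
    }

private:
    size_t m_window;
    std::deque<float> m_samples;
};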
diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp
index c1c0677ff3..d380d1c6c4 100644
--- a/src/cpp/src/continuous_batching_pipeline.cpp
+++ b/src/cpp/src/continuous_batching_pipeline.cpp
@@ -39,11 +39,18 @@ extract_prompt_lookup_from_config(ov::AnyMap& config) {
     return res;
 }

+inline float get_load_time(std::chrono::steady_clock::time_point start_time) {
+    auto stop_time = std::chrono::steady_clock::now();
+    return std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
+}
+
 ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::path& models_path,
                                                         const SchedulerConfig& scheduler_config,
                                                         const std::string& device,
                                                         const ov::AnyMap& properties,
                                                         const ov::AnyMap& tokenizer_properties) {
+    auto start_time = std::chrono::steady_clock::now();
+
     auto properties_without_draft_model = properties;
     auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
     auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
@@ -61,6 +68,8 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
     } else {
         m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
     }
+
+    m_impl->m_load_time_ms = get_load_time(start_time);
 }

 ContinuousBatchingPipeline::ContinuousBatchingPipeline(
@@ -69,6 +78,8 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     const SchedulerConfig& scheduler_config,
     const std::string& device,
     const ov::AnyMap& properties) {
+    auto start_time = std::chrono::steady_clock::now();
+
     auto properties_without_draft_model = properties;
     auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
     auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
@@ -85,6 +96,8 @@
     } else {
         m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
     }
+
+    m_impl->m_load_time_ms = get_load_time(start_time);
 }

 ContinuousBatchingPipeline::ContinuousBatchingPipeline(
@@ -95,6 +108,8 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     const std::string& device,
     const ov::AnyMap& properties,
     const ov::genai::GenerationConfig& generation_config) {
+    auto start_time = std::chrono::steady_clock::now();
+
     auto properties_without_draft_model = properties;
     auto draft_model_desr = extract_draft_model_from_config(properties_without_draft_model);
     auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
@@ -109,6 +124,8 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     } else {
         m_impl = std::make_shared(model, tokenizer, scheduler_config, device, properties, generation_config);
     }
+
+    m_impl->m_load_time_ms = get_load_time(start_time);
 }

 ov::genai::Tokenizer ContinuousBatchingPipeline::get_tokenizer() {
@@ -140,11 +157,23 @@ bool ContinuousBatchingPipeline::has_non_finished_requests() {
 }

 std::vector ContinuousBatchingPipeline::generate(const std::vector& input_ids, const std::vector& sampling_params, const StreamerVariant& streamer) {
-    return m_impl->generate(input_ids, sampling_params, streamer);
+    auto encoded_results = m_impl->generate(input_ids, sampling_params, streamer);
+
+    for (auto& encoded_result : encoded_results) {
+        encoded_result.perf_metrics.load_time = m_impl->m_load_time_ms;
+    }
+
+    return encoded_results;
 }

 std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, const std::vector& sampling_params, const StreamerVariant& streamer) {
-    return m_impl->generate(prompts, sampling_params, streamer);
+    auto decoded_results = m_impl->generate(prompts, sampling_params, streamer);
+
+    for (auto& decoded_result : decoded_results) {
+        decoded_result.perf_metrics.load_time = m_impl->m_load_time_ms;
+    }
+
+    return decoded_results;
 }

 void ContinuousBatchingPipeline::start_chat(const std::string& system_message) {
diff --git a/src/cpp/src/icontinuous_batching.cpp b/src/cpp/src/icontinuous_batching.cpp
index 0c0d62886b..8fbb9619ea 100644
--- a/src/cpp/src/icontinuous_batching.cpp
+++ b/src/cpp/src/icontinuous_batching.cpp
@@ -62,6 +62,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     }

     std::vector encoded = generate(input_ids, sampling_params, streamer);
+
     std::vector decoded;
     decoded.reserve(encoded.size());
     for (size_t i = 0; i < encoded.size(); ++i) {
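The constructor changes time pipeline construction and later stamp that duration into each result's perf_metrics.load_time. The same pattern, with the pipeline types replaced by small stand-ins:

#include <chrono>
#include <vector>

struct StubMetrics { float load_time = 0.0f; };    // stand-in for ov::genai::PerfMetrics
struct StubResult  { StubMetrics perf_metrics; };  // stand-in for a generation result

// Same shape as the get_load_time() helper added above: milliseconds since start_time.
inline float elapsed_ms(std::chrono::steady_clock::time_point start_time) {
    auto stop_time = std::chrono::steady_clock::now();
    return std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count();
}

void stamp_load_time(std::vector<StubResult>& results) {
    auto start_time = std::chrono::steady_clock::now();
    // ... expensive construction happens here (model compilation, tokenizer loading, ...)
    const float load_time_ms = elapsed_ms(start_time);

    // generate() later copies the one-off construction cost into every result it returns.
    for (auto& result : results) {
        result.perf_metrics.load_time = load_time_ms;
    }
}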
diff --git a/src/cpp/src/icontinuous_batching.hpp b/src/cpp/src/icontinuous_batching.hpp
index 9def4ae532..81fff3f40c 100644
--- a/src/cpp/src/icontinuous_batching.hpp
+++ b/src/cpp/src/icontinuous_batching.hpp
@@ -21,7 +21,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline {
     // TODO (mzegla): GenerationConfig is request specific object
     // and pipeline only uses default rng_seed and some special tokens.
-    ov::genai::GenerationConfig m_generation_config;
+    GenerationConfig m_generation_config;

     PipelineMetrics m_pipeline_metrics;

@@ -42,19 +42,21 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline {
     bool m_is_chat_conversation = false;
     ChatHistory m_history;

-    virtual void drop_requests() = 0;
+    float m_load_time_ms = 0.0f;
+    // to access m_load_time_ms
+    friend class ContinuousBatchingPipeline;

 public:
-    ov::genai::GenerationConfig get_config() const;
+    GenerationConfig get_config() const;
     PipelineMetrics get_metrics() const;
-    ov::genai::Tokenizer get_tokenizer();
+    Tokenizer get_tokenizer();

    /**
    * Adds requests to awaiting queue using encoded inputs
    */
     virtual GenerationHandle add_request(uint64_t request_id,
                                          const ov::Tensor& input_ids,
-                                         ov::genai::GenerationConfig sampling_params) = 0;
+                                         GenerationConfig sampling_params) = 0;

    /**
    * Adds request to running queue based on string input
@@ -62,7 +64,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline {
    */
     virtual GenerationHandle add_request(uint64_t request_id,
                                          const std::string& prompt,
-                                         ov::genai::GenerationConfig sampling_params) = 0;

    /**
    * Checks whether server (pipeline) has non-finished requests and step() should be called within a loop
@@ -87,7 +89,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline {
    */
     std::vector generate(const std::vector& prompts,
-                         std::vector<ov::genai::GenerationConfig> sampling_params,
+                         std::vector<GenerationConfig> sampling_params,
                          const StreamerVariant& streamer);

    /**
diff --git a/src/cpp/src/llm_pipeline_base.hpp b/src/cpp/src/llm_pipeline_base.hpp
index 5573272d7e..ed1b0fcee4 100644
--- a/src/cpp/src/llm_pipeline_base.hpp
+++ b/src/cpp/src/llm_pipeline_base.hpp
@@ -62,7 +62,7 @@ class LLMPipelineImplBase {
     GenerationConfig m_generation_config;
     std::optional m_adapter_controller;

-    float m_load_time_ms = 0;
+    float m_load_time_ms = 0.0f;
 };

 }  // namespace genai
diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp
index 3cec7d74a8..8451709092 100644
--- a/src/cpp/src/llm_pipeline_stateful.cpp
+++ b/src/cpp/src/llm_pipeline_stateful.cpp
@@ -361,7 +361,7 @@ void StatefulLLMPipeline::start_chat(const std::string& system_message) {
     if (!m_tokenized_chat_history.empty()) {
         reset_kv_state();
         m_history = {};
-        m_templated_chat_history = "";
+        m_templated_chat_history.clear();
         m_tokenized_chat_history.clear();
     }
     if (system_message.empty())
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 10a32c4731..b29bec3b4a 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -475,17 +475,16 @@ std::optional extract_npu_descriptor(ov::Core& core) {
     }
     const auto arch = core.get_property("NPU", ov::device::architecture);
     const auto max_tiles = core.get_property("NPU", ov::intel_npu::max_tiles);
     bool compiler_dq = false;
-    const auto device_caps = core.get_property("NPU", ov::device::capabilities);
-    if (std::find(device_caps.begin(), device_caps.end(), "COMPILER_DYNAMIC_QUANTIZATION") != device_caps.end()) {
+    const auto supported_properties = core.get_property("NPU", ov::supported_properties);
+    if (std::find(supported_properties.begin(), supported_properties.end(), "NPU_COMPILER_DYNAMIC_QUANTIZATION") != supported_properties.end()) {
         compiler_dq = true;
     }
     return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq});
 }

-ov::AnyMap get_baseline_common_config() {
+ov::AnyMap get_baseline_common_config(const std::optional& npudesc) {
     ov::AnyMap config = {
         { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" },
         { "NPUW_DEVICES", "NPU" },
@@ -497,11 +496,20 @@ ov::AnyMap get_baseline_common_config() {
         { "NPUW_SLICE_OUT", "YES" },
         { "NPUW_FUNCALL_ASYNC", "YES" }
     };
+    // FIXME: this config logic is getting more and more complex
+    if (npudesc.has_value() && npudesc->compiler_dq) {
+        config.emplace("NPUW_DQ", "YES");
+        config.emplace("NPUW_DQ_FULL", "NO");
+        config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES");
+        config.erase("NPUW_DCOFF_TYPE");
+        config.erase("NPUW_DCOFF_SCALE");
+    }
     return config;
 }

-ov::AnyMap get_default_common_config(const std::shared_ptr& model) {
-    auto config = get_baseline_common_config();
+ov::AnyMap get_default_common_config(const std::shared_ptr& model,
+                                     const std::optional& npudesc) {
+    auto config = get_baseline_common_config(npudesc);
     const char* npu_l0 = std::getenv("DISABLE_OPENVINO_GENAI_NPU_L0");
     if (npu_l0 && std::atoi(npu_l0) == 1) {
         config.emplace("NPUW_WEIGHTS_BANK_ALLOC", "CPU");
@@ -513,19 +521,19 @@ ov::AnyMap get_default_common_config(const std::shared_ptr& model) {

 ov::AnyMap get_default_prefill_config(const std::shared_ptr& model,
                                       const std::optional& npudesc) {
-    auto config = get_default_common_config(model);
-    if (is_cw_compressed(model)) {
-        config.emplace("NPUW_DQ", "YES");
-    } else {
-        config.emplace("NPUW_PMM", "NO");
-    }
+    auto config = get_default_common_config(model, npudesc);
     if (npudesc.has_value() && npudesc->arch == "4000" && npudesc->max_tiles != -1) {
         config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles);
     }
-    if (npudesc.has_value() && npudesc->compiler_dq) {
-        config.emplace("NPUW_DQ_FULL", "NO");
+    // Specify NPUW DQ if Compiler DQ is not enabled
+    if (!npudesc.has_value() || !npudesc->compiler_dq) {
+        if (is_cw_compressed(model)) {
+            config.emplace("NPUW_DQ", "YES");
+        } else {
+            config.emplace("NPUW_PMM", "NO");
+        }
     }
     return config;
 }
@@ -533,20 +541,19 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr& model,

 ov::AnyMap get_default_generate_config(const std::shared_ptr& model,
                                        const std::optional& npudesc,
                                        const GenerateHint hint) {
-    auto config = get_default_common_config(model);
+    auto config = get_default_common_config(model, npudesc);
     if (hint == GenerateHint::BEST_PERF) {
         config.emplace("NPUW_ONLINE_PIPELINE", "NONE");
     }
-    // NB: Unconditionally set for generation model
-    config.emplace("NPUW_DQ", "YES");
     if (npudesc.has_value() && npudesc->arch == "4000") {
         config.emplace("NPU_DPU_GROUPS", 4);
     }
     if (hint == GenerateHint::FAST_COMPILE) {
         config.emplace("NPUW_UNFOLD_IREQS", "YES");
     }
-    if (npudesc.has_value() && npudesc->compiler_dq) {
-        config.emplace("NPUW_DQ_FULL", "NO");
+    // Specify NPUW DQ if Compiler DQ is not enabled
+    if (!npudesc.has_value() || !npudesc->compiler_dq) {
+        config.emplace("NPUW_DQ", "YES");
     }
     return config;
 }
diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index 152048fc47..071d49d2c4 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -152,6 +152,8 @@ std::pair> get_lm_encoded_results(
         int64_t * input_ids_data = new_input_ids.data();
         std::vector next_beams;

+        size_t current_batch_size = 0;
+
         for (auto& sequence_group : active_sequence_groups) {
             std::vector running_sequences = sequence_group->get_running_sequences();
             size_t num_running_sequences = running_sequences.size();
@@ -176,6 +178,8 @@ std::pair> get_lm_encoded_results(
                 // for different sequences iteration of beams started from 0, but we collect it to one input_ids
                 next_beams.push_back(beam_idxs[sequence->get_id()] + beam_offets.at(sequence_group->get_request_id()));
             }
+
+            current_batch_size += num_running_sequences;
         }

         for (size_t i = 0; i < active_sequence_groups.size(); i++) {
@@ -209,7 +213,7 @@ std::pair> get_lm_encoded_results(
             raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms);
             raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms);
             raw_perf_counters.m_new_token_times.emplace_back(infer_end);
-            raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
+            raw_perf_counters.m_batch_sizes.emplace_back(current_batch_size);

         sampler_output = sampler.sample(active_sequence_groups, m_llm.get_tensor("logits"));

         free_non_running_requests();
diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp
index c1833c8b9f..f4ec0eb49a 100644
--- a/src/cpp/src/perf_metrics.cpp
+++ b/src/cpp/src/perf_metrics.cpp
@@ -100,10 +100,10 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) {
         raw_metrics.m_durations.reserve(tok_times.size());

         auto ttft = tok_times[0] - start_time_val;
-        raw_metrics.m_times_to_first_token = std::vector();
+        raw_metrics.m_times_to_first_token.clear();
         raw_metrics.m_times_to_first_token.emplace_back(ttft);
         num_generated_tokens = batch_sizes[0];
-        
+
         // The very first infer request (prefill stage) is slower than subsequent ones since we process a sequence of tokens.
         // To have a clearer TPOT number, the time taken to generate the very first token at the prefill stage
         // must not be included in the TPOT calculation. The first duration used for TPOT is from the first token
         // to the second token, not from the start time to the first token.
@@ -114,7 +114,7 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) {
                 num_generated_tokens += batch_sizes[i];
             }
         }
-        
+
         // calc_mean_and_std will convert microsecond to milliseconds.
         tpot = calc_mean_and_std(raw_metrics.m_durations);
         ipot = calc_mean_and_std(raw_metrics.m_token_infer_durations);
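evaluate_statistics() above takes TTFT from the first token timestamp and excludes the prefill step from TPOT by only averaging the gaps between consecutive tokens. A simplified numeric sketch of those two statistics, using plain milliseconds instead of the microsecond counters kept in RawPerfMetrics:

#include <cmath>
#include <cstddef>
#include <vector>

struct MeanStd { float mean; float std; };

// Population mean/std over per-token durations, as calc_mean_and_std produces (units: ms here).
MeanStd mean_and_std(const std::vector<float>& durations) {
    if (durations.empty())
        return {0.0f, 0.0f};
    float sum = 0.0f;
    for (float d : durations) sum += d;
    const float mean = sum / durations.size();
    float sq_sum = 0.0f;
    for (float d : durations) sq_sum += (d - mean) * (d - mean);
    return {mean, std::sqrt(sq_sum / durations.size())};
}

// TTFT is the gap from start to the first token; TPOT averages only the gaps between
// consecutive tokens, so the slow prefill step does not inflate the per-token figure.
void ttft_and_tpot(float start_ms, const std::vector<float>& token_times_ms,
                   float& ttft_ms, MeanStd& tpot) {
    if (token_times_ms.empty())
        return;
    ttft_ms = token_times_ms.front() - start_ms;
    std::vector<float> gaps;
    for (size_t i = 1; i < token_times_ms.size(); ++i)
        gaps.push_back(token_times_ms[i] - token_times_ms[i - 1]);
    tpot = mean_and_std(gaps);
}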
diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp
index f52f3af9f8..e19042f44e 100644
--- a/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp
+++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp
@@ -16,7 +16,7 @@ class ContinuousBatchingPipeline::PromptLookupImpl : public ContinuousBatchingPi
     std::shared_ptr m_pipeline;
     SpeculativeDecodingMetrics m_sd_metrics;

-    void drop_requests() override;
+    void drop_requests();

 public:
     PromptLookupImpl(const std::shared_ptr& model,
diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp
index 54850f657b..827309724e 100644
--- a/src/cpp/src/sampler.cpp
+++ b/src/cpp/src/sampler.cpp
@@ -129,7 +129,7 @@ MatchStopStringResult match_stop_string(Tokenizer& tokenizer,
     }

     // find token cnt to be removed from sequence by decoding token by token
-    std::string decoded_partially_string = "";
+    std::string decoded_partially_string;
     for (size_t i = 0; i < buffer.size(); ++i) {
         decoded_partially_string += tokenizer.decode(TokenIds{buffer[i]});
         if (decoded_partially_string.find(decoded_buffer) != std::string::npos) {
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp
index 7796f93d1e..ca8937cb60 100644
--- a/src/cpp/src/sampler.hpp
+++ b/src/cpp/src/sampler.hpp
@@ -101,7 +101,7 @@ class Sampler::GroupBeamSearcher {
             return m_sequence->get_generated_len();
         }
     };
-    
+
     static bool greater(const Beam& left, const Beam& right) {
         return left.m_score > right.m_score;
     }
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
index 3d948ad2da..b84b5b8590 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
@@ -42,7 +42,7 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat
     std::mutex m_draft_generations_mutex;
     std::map m_draft_generations;

-    void drop_requests() override;
+    void drop_requests();
     bool is_requests_empty();
     std::vector get_awaiting_requests();
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index e53be4e1cd..9f8718f14c 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -149,7 +149,7 @@ class InputsEmbedder::IInputsEmbedder {
         ),
         m_tokenizer(tokenizer) { }

-    ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") {
+    ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}) {
         ov::Tensor encoded_input_ids;
         if (m_is_chat_conversation) {
             // KV cache in model already contains prompts and answers from previous iterations.
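The override keywords above are dropped because drop_requests() is no longer declared as a virtual on the base interface earlier in this patch. A tiny illustration of why `override` stops compiling in that situation (class names are hypothetical):

// `override` requires a matching virtual in a base class; once the interface stops
// declaring drop_requests(), only the plain declaration compiles.
struct InterfaceWithoutDropRequests {
    virtual ~InterfaceWithoutDropRequests() = default;
    // no virtual drop_requests() declared here any more
};

struct SomeImpl : InterfaceWithoutDropRequests {
    // void drop_requests() override {}  // error: marked 'override' but does not override
    void drop_requests() {}              // fine: an ordinary member function
};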
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 03707deb6e..bba366401e 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -1221,9 +1221,6 @@ class PipelineMetrics:

     :param avg_cache_usage: Running average of the KV cache usage (in %) during the lifetime of the pipeline, with max window size of 1000 steps
     :type avg_cache_usage: float
-
-    :param total_num_scheduled_tokens: Number of tokens scheduled for processing at the previous step of the pipeline.
-    :type total_num_scheduled_tokens: int
     """
     def __init__(self) -> None:
         ...
@@ -1242,9 +1239,6 @@ class PipelineMetrics:
     @property
     def scheduled_requests(self) -> int:
         ...
-    @property
-    def total_num_scheduled_tokens(self) -> int:
-        ...
 class RawPerfMetrics:
     """
diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp
index b2fb81fb5b..d6888c4547 100644
--- a/src/python/py_continuous_batching_pipeline.cpp
+++ b/src/python/py_continuous_batching_pipeline.cpp
@@ -104,9 +104,6 @@ auto pipeline_metrics_docstring = R"(

     :param avg_cache_usage: Running average of the KV cache usage (in %) during the lifetime of the pipeline, with max window size of 1000 steps
     :type avg_cache_usage: float
-
-    :param total_num_scheduled_tokens: Number of tokens scheduled for processing at the previous step of the pipeline.
-    :type total_num_scheduled_tokens: int
 )";

 std::ostream& operator << (std::ostream& stream, const GenerationResult& generation_result) {
@@ -217,8 +214,7 @@ void init_continuous_batching_pipeline(py::module_& m) {
         .def_readonly("scheduled_requests", &PipelineMetrics::scheduled_requests)
         .def_readonly("cache_usage", &PipelineMetrics::cache_usage)
         .def_readonly("avg_cache_usage", &PipelineMetrics::avg_cache_usage)
-        .def_readonly("max_cache_usage", &PipelineMetrics::max_cache_usage)
-        .def_readonly("total_num_scheduled_tokens", &PipelineMetrics::total_num_scheduled_tokens);
+        .def_readonly("max_cache_usage", &PipelineMetrics::max_cache_usage);

     py::class_(m, "ContinuousBatchingPipeline", "This class is used for generation with LLMs with continuous batchig")
         .def(py::init([](const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& llm_plugin_config, const std::map& tokenizer_plugin_config) {
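After the removal of total_num_scheduled_tokens, the remaining PipelineMetrics fields are still exposed. A short sketch reading them through get_metrics(), assuming the public ContinuousBatchingPipeline forwards the getter declared on the impl interface above:

#include <iostream>

#include "openvino/genai/continuous_batching_pipeline.hpp"

void report_pipeline_metrics(ov::genai::ContinuousBatchingPipeline& pipe) {
    ov::genai::PipelineMetrics metrics = pipe.get_metrics();
    std::cout << "scheduled requests: " << metrics.scheduled_requests
              << ", cache usage: " << metrics.cache_usage << "%"
              << ", max cache usage: " << metrics.max_cache_usage << "%"
              << ", avg cache usage: " << metrics.avg_cache_usage << "%" << std::endl;
}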
diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py
index b3e2f23f0b..d51bc8edc6 100644
--- a/tools/llm_bench/llm_bench_utils/model_utils.py
+++ b/tools/llm_bench/llm_bench_utils/model_utils.py
@@ -130,7 +130,12 @@ def analyze_args(args):
     model_args['output_dir'] = args.output_dir
     model_args['lora'] = args.lora
     model_args['lora_alphas'] = args.lora_alphas
-    model_args["use_cb"] = args.use_cb
+    use_cb = args.use_cb or args.draft_model
+    if args.device == "NPU" and use_cb:
+        log.warning("Continuous batching and Speculative Decoding are not supported for NPU device")
+        use_cb = False
+        args.draft_model = None
+    model_args["use_cb"] = use_cb
     model_args['devices'] = args.device
     model_args['prompt_index'] = [] if args.prompt_index is not None else None
     if model_args['prompt_index'] is not None:
@@ -163,7 +168,7 @@ def analyze_args(args):
     model_args['model_type'] = get_model_type(model_name, use_case, model_framework)
     model_args['model_name'] = model_name

-    if (args.use_cb or args.draft_model) and optimum:
+    if use_cb and optimum:
         raise RuntimeError("Continuous batching mode supported only via OpenVINO GenAI")
     cb_config = None
     if args.cb_config:
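Taken together, these changes let ContinuousBatchingPipeline results report load_time alongside the other perf metrics. A closing usage sketch; the model path, prompt and max_new_tokens are placeholders, and the call shape assumes the defaulted properties/streamer arguments of the public header:

#include <cstdlib>
#include <iostream>

#include "openvino/genai/continuous_batching_pipeline.hpp"

int main() {
    ov::genai::SchedulerConfig scheduler_config;  // default scheduler settings
    ov::genai::ContinuousBatchingPipeline pipe("<path-to-model-dir>", scheduler_config, "CPU");

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 20;

    auto results = pipe.generate({"The Sky is blue because"}, {config});
    for (const auto& result : results) {
        const auto& pm = result.perf_metrics;
        std::cout << "load time: " << pm.load_time << " ms, "
                  << "throughput: " << pm.get_throughput().mean << " tokens/s" << std::endl;
    }
    return EXIT_SUCCESS;
}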