From d6fbb642d025e6ebfc5eedd95b8692f31f78a484 Mon Sep 17 00:00:00 2001 From: sbalandi <sofya.balandina@intel.com> Date: Wed, 22 Jan 2025 15:39:12 +0000 Subject: [PATCH] apply_chat_template as config property --- .github/workflows/causal_lm_cpp.yml | 42 +++++++------- .../openvino/genai/generation_config.hpp | 2 + .../include/openvino/genai/llm_pipeline.hpp | 4 +- .../genai/visual_language/pipeline.hpp | 8 +-- src/cpp/src/generation_config.cpp | 1 + src/cpp/src/icontinuous_batching.cpp | 5 +- src/cpp/src/llm_pipeline_stateful.cpp | 17 +++--- src/cpp/src/llm_pipeline_static.cpp | 4 +- .../src/visual_language/inputs_embedder.cpp | 55 ++++++++++--------- .../src/visual_language/inputs_embedder.hpp | 2 +- src/cpp/src/visual_language/pipeline.cpp | 2 +- .../openvino_genai/py_openvino_genai.pyi | 1 + src/python/py_generation_config.cpp | 1 + tests/python_tests/common.py | 2 +- tools/llm_bench/task/text_generation.py | 2 + .../task/visual_language_generation.py | 1 + tools/who_what_benchmark/whowhatbench/wwb.py | 3 +- 17 files changed, 83 insertions(+), 69 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index c5e8ec0996..5dff0a58d3 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -122,8 +122,8 @@ jobs: tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') prompt = 'Why is the Sun yellow?' if tokenizer.chat_template: - prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(prompt, return_tensors='pt') + prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) @@ -141,8 +141,8 @@ jobs: tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') prompt = '69' if tokenizer.chat_template: - prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(prompt, return_tensors='pt') + prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) @@ -161,7 +161,7 @@ jobs: prompt = 'Hi' if tokenizer.chat_template: prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(prompt, return_tensors='pt') + tokenized = tokenizer(prompt, return_tensors='pt', 
add_special_tokens=False) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) @@ -179,8 +179,8 @@ jobs: tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') prompt = 'return 0' if tokenizer.chat_template: - prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(prompt, return_tensors='pt') + prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) @@ -198,8 +198,8 @@ jobs: tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') prompt = '你好! 你好嗎?' if tokenizer.chat_template: - prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(prompt, return_tensors='pt') + prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref.replace('�', '')) @@ -209,21 +209,21 @@ jobs: " echo "你好! 你好嗎?" passed - timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt + timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" "return 0" "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r', errors='ignore') as file: predictions = file.read() tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') prompts = [ - 'Alan Turing was a', + 'Why is the Sun yellow?', 'return 0', '你好! 你好嗎?' 
] for prompt in prompts: if tokenizer.chat_template: - prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(prompt, return_tensors='pt') + prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref.replace('�', '')) @@ -272,10 +272,10 @@ jobs: echo import transformers > ref.py echo predictions = open('cpp.txt', 'r').read() >> ref.py echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py - echo prompt = '69' - echo if tokenizer.chat_template: - echo prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) - echo tokenized = tokenizer(prompt, return_tensors='pt') >> ref.py + echo prompt = '69' >> ref.py + echo if tokenizer.chat_template: >> ref.py + echo prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) >> ref.py + echo tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) >> ref.py echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py echo ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py echo idx = predictions.find(ref) >> ref.py @@ -584,8 +584,8 @@ jobs: tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5') prompt = 'Alan Turing was a' if tokenizer.chat_template: - prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(prompt, return_tensors='pt') + prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False): ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) @@ -642,8 +642,8 @@ jobs: tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat') prompt = 'Alan Turing was a' if tokenizer.chat_template: - prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(prompt, return_tensors='pt') + prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False): ref = 
tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 3a75fc02ea..cd372e635d 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -128,6 +128,8 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { std::optional<AdapterConfig> adapters; + bool apply_chat_template = true; + /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. * Otherwise verifies eos_token_id == tokenizer_eos_token_id. */ diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index fa1e51c7d9..26232574dc 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -178,7 +178,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param streamer optional streamer * @return DecodedResults decoded resulting text * chat_template will be applied to the prompt, run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it. - * Use custom_chat_template = "" to disable it for non-chat mode. + * To disable it for non-chat mode, set custom_chat_template to "" or set generation_config.apply_chat_template to false. */ DecodedResults generate( StringInputs inputs, @@ -194,7 +194,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param properties properties * @return DecodedResults decoded resulting text * chat_template will be applied to the prompt, run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it. - * Use custom_chat_template = "" to disable it for non-chat mode. + * To disable it for non-chat mode, set custom_chat_template to "" or set generation_config.apply_chat_template to false. */ template <typename... Properties> util::EnableIfAllStringAny<DecodedResults, Properties...> generate( diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index 48535722e9..b6b1d5c7f6 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -99,7 +99,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// @param streamer A streamer to acquire intermediate result. /// @return A string generated by a model. /// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it. - /// Use custom_chat_template="" to disable it for non-chat mode. + /// To disable it for non-chat mode, set custom_chat_template to "" or set generation_config.apply_chat_template to false. VLMDecodedResults generate( const std::string& prompt, const std::vector<ov::Tensor>& rgbs, @@ -114,7 +114,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// @param streamer A streamer to acquire intermediate result. /// @return A string generated by a model. /// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it. - /// Use custom_chat_template="" to disable it for non-chat mode. + /// To disable it for non-chat mode, set custom_chat_template to "" or set generation_config.apply_chat_template to false. VLMDecodedResults generate( const std::string& prompt, const ov::Tensor& rgb, @@ -129,7 +129,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// images. 
/// @return A string generated by a model. /// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it. - /// Use custom_chat_template="" to disable it for non-chat mode. + /// To disable it for non-chat mode, set custom_chat_template to "" or set generation_config.apply_chat_template to false. VLMDecodedResults generate( const std::string& prompt, const ov::AnyMap& config_map @@ -144,7 +144,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// ov::AnyMap. /// @return A string generated by a model. /// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it. - /// Use custom_chat_template="" to disable it for non-chat mode. + /// To disable it for non-chat mode, set custom_chat_template to "" or set generation_config.apply_chat_template to false. template <typename... Properties> util::EnableIfAllStringAny<VLMDecodedResults, Properties...> generate( const std::string& prompt, diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 67682be787..dba5aaa5bd 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -125,6 +125,7 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& properties) { read_anymap_param(properties, "logprobs", logprobs); read_anymap_param(properties, "num_return_sequences", num_return_sequences); read_anymap_param(properties, "adapters", adapters); + read_anymap_param(properties, "apply_chat_template", apply_chat_template); // penalties read_anymap_param(properties, "frequency_penalty", frequency_penalty); diff --git a/src/cpp/src/icontinuous_batching.cpp b/src/cpp/src/icontinuous_batching.cpp index d4fa7c3e5d..6b748d6665 100644 --- a/src/cpp/src/icontinuous_batching.cpp +++ b/src/cpp/src/icontinuous_batching.cpp @@ -53,10 +53,11 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( } else { input_ids.reserve(prompts.size()); timer.start(); - for (const std::string& prompt : prompts) { + for (size_t i = 0; i < prompts.size(); i++) { + const std::string& prompt = prompts.at(i); const auto encode_start = std::chrono::steady_clock::now(); ov::Tensor encoded_inputs; - if (!m_tokenizer.get_chat_template().empty()) { + if (sampling_params.at(i).apply_chat_template && !m_tokenizer.get_chat_template().empty()) { ChatHistory history({{{"role", "user"}, {"content", prompt}}}); constexpr bool add_generation_prompt = true; auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp index 18e9d30ebc..250468ef9f 100644 --- a/src/cpp/src/llm_pipeline_stateful.cpp +++ b/src/cpp/src/llm_pipeline_stateful.cpp @@ -9,6 +9,8 @@ #include "text_callback_streamer.hpp" #include "utils.hpp" +#include "debug_utils.hpp" + namespace ov::genai { StatefulLLMPipeline::StatefulLLMPipeline( @@ -88,19 +90,18 @@ DecodedResults StatefulLLMPipeline::generate( if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) { OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts"); - std::vector<std::string> templated_input_vector; - for (auto& input : *input_vector) { - if (!m_tokenizer.get_chat_template().empty()) { + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + std::vector<std::string> templated_input_vector; + for (auto& input : *input_vector) { ChatHistory history({{{"role", "user"}, {"content", input}}}); constexpr 
bool add_generation_prompt = true; auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); templated_input_vector.push_back(templated_prompt); - } else { - // in case when chat_template was not found in tokenizer_config.json or set - templated_input_vector.push_back(input); } + encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false)); + } else { + encoded_input = m_tokenizer.encode(*input_vector); } - encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false)); } else if (auto input_prompt = std::get_if<std::string>(&inputs)) { std::string& prompt = *input_prompt; @@ -170,7 +171,7 @@ DecodedResults StatefulLLMPipeline::generate( // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied } else { std::string& prompt = *input_prompt; - if (!m_tokenizer.get_chat_template().empty()) { + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { ChatHistory history({{{"role", "user"}, {"content", prompt}}}); constexpr bool add_generation_prompt = true; auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 3f0be25613..0d84ef4f3c 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -827,7 +827,7 @@ DecodedResults StatefulLLMPipeline::generate( // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false)); } else { - if (!m_tokenizer.get_chat_template().empty()) { + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { ChatHistory history({{{"role", "user"}, {"content", prompt}}}); constexpr bool add_generation_prompt = true; auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); @@ -1302,7 +1302,7 @@ DecodedResults StatelessLLMPipeline::generate( // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false)); } else { - if (!m_tokenizer.get_chat_template().empty()) { + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { ChatHistory history({{{"role", "user"}, {"content", prompt}}}); constexpr bool add_generation_prompt = true; auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 616a12bb6f..a47f44a778 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -50,7 +50,7 @@ class InputsEmbedder::IInputsEmbedder { ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0}; public: - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0; + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) = 0; virtual std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }}; @@ -155,7 +155,7 
@@ class InputsEmbedder::IInputsEmbedder { ), m_tokenizer(tokenizer) { } - ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}) { + ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}, bool apply_chat_template = true) { ov::Tensor encoded_input_ids; if (m_is_chat_conversation) { // KV cache in model already contains prompts and answers from previous iterations. @@ -169,9 +169,9 @@ class InputsEmbedder::IInputsEmbedder { m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; std::string new_templated_chat_history; - if (!m_tokenizer.get_chat_template().empty()) { + try { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); - } else { + } catch (const std::exception& error) { // Use fallback chat template if it was not found in tokenizer_config.json new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); } @@ -223,19 +223,22 @@ class InputsEmbedder::IInputsEmbedder { m_tokenized_history.clear(); std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); } else { - std::string templated_prompt; - ChatHistory history({{{"role", "user"}, {"content", prompt}}}); - constexpr bool add_generation_prompt = true; + auto start_tokenizer_time = std::chrono::steady_clock::now(); + if (apply_chat_template) { + std::string templated_prompt; + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; - if (!m_tokenizer.get_chat_template().empty()) { - templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + if (!m_tokenizer.get_chat_template().empty()) { + templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + } else { + // Use fallback chat template if it was not found in tokenizer_config.json + templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt, chat_template_fallback); + } + encoded_input_ids = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids; } else { - // Use fallback chat template if it was not found in tokenizer_config.json - templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt, chat_template_fallback); + encoded_input_ids = m_tokenizer.encode(prompt).input_ids; } - - auto start_tokenizer_time = std::chrono::steady_clock::now(); - encoded_input_ids = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_tokenized_history.clear(); @@ -331,7 +334,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); } - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override { + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override { std::string images_prompt; std::vector<EncodedImage> embeds; @@ 
-366,7 +369,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { } images_prompt += prompt; - ov::Tensor encoded_input = get_encoded_input_ids(images_prompt, metrics); + ov::Tensor encoded_input = get_encoded_input_ids(images_prompt, metrics, {}, apply_chat_template); ov::Tensor inputs_embeds = m_embedding.infer(encoded_input); OPENVINO_ASSERT( @@ -629,7 +632,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { const ov::AnyMap device_config) : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override { + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; @@ -647,7 +650,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { } formatted_prompt += prompt; - ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback); + ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback, apply_chat_template); ov::Tensor text_embeds = m_embedding.infer(input_ids); if (images.empty()) { @@ -742,7 +745,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { const ov::AnyMap device_config) : InputsEmbedderLLaVA(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override { + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; @@ -774,7 +777,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { } formatted_prompt += prompt; - ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback); + ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback, apply_chat_template); ov::Tensor text_embeds = m_embedding.infer(input_ids); if (images.empty()) { @@ -1069,7 +1072,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { const ov::AnyMap device_config) : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override { + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, 
ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override { std::string image_start_token = m_vlm_config.image_start_token; std::string image_context_token = m_vlm_config.image_context_token; std::string image_end_token = m_vlm_config.image_end_token; @@ -1097,7 +1100,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { } formatted_prompt += prompt; - ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics); + ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, {}, apply_chat_template); ov::Tensor text_embeds = m_embedding.infer(input_ids); if (images.empty()) { @@ -1214,7 +1217,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { ).create_infer_request(); } - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override { + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override { std::string formatted_prompt; std::vector<ov::Tensor> single_images = to_single_image_tensors(images); @@ -1246,7 +1249,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { // Adapted from Qwen/Qwen2-7B-Instruct std::string chat_template_fallback = "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"; - ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback); + ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback, apply_chat_template); ov::Tensor text_embeds = m_embedding.infer(input_ids); if (images.empty()) { @@ -1616,8 +1619,8 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, } } -ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) { - return m_impl->get_inputs_embeds(prompt, images, metrics); +ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) { + return m_impl->get_inputs_embeds(prompt, images, metrics, apply_chat_template); } std::pair<ov::Tensor, std::optional<int64_t>> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 223d090b22..b21d0198cc 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -32,7 +32,7 @@ class InputsEmbedder { const ov::AnyMap device_config); // compute input embedding for prompt and multiple images - ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics); + ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template); // compute position ids for language model input std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); 
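Note on the inputs_embedder changes above: outside of chat mode the embedder now branches on the new flag. When apply_chat_template is true and a chat template is available (or a model-specific chat_template_fallback is supplied), the raw prompt is wrapped into a single user turn, templated, and encoded with add_special_tokens(false); otherwise the prompt is encoded as-is. A minimal Python sketch of the same branching, written against Hugging Face transformers in the style of the updated CI checks (the TinyLlama model id is reused from the workflow purely for illustration, and the fallback-template path is omitted):

import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
prompt = 'Why is the Sun yellow?'
apply_chat_template = True  # mirrors GenerationConfig::apply_chat_template

if apply_chat_template and tokenizer.chat_template:
    # Wrap the prompt into a single user turn, render the chat template,
    # then tokenize without special tokens (the template already provides them).
    templated = tokenizer.apply_chat_template(
        [{'role': 'user', 'content': prompt}],
        tokenize=False, add_generation_prompt=True)
    tokenized = tokenizer(templated, return_tensors='pt', add_special_tokens=False)
else:
    # apply_chat_template is false or no template is available: encode the prompt as-is.
    tokenized = tokenizer(prompt, return_tensors='pt')
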
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 95e3064548..415e7fcae9 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -166,7 +166,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { generation_config.validate(); auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics); + ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics, generation_config.apply_chat_template); auto end_get_inputs_embeds = std::chrono::steady_clock::now(); auto to_remove_from_hist = m_inputs_embedder->get_num_tokens_to_remove_from_hist(); diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index bba366401e..5dbf595d24 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -572,6 +572,7 @@ class GenerationConfig: num_return_sequences: the number of sequences to generate from a single prompt. """ adapters: AdapterConfig | None + apply_chat_template: bool assistant_confidence_threshold: float diversity_penalty: float do_sample: bool diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index e2a6d7062c..a7d7789a55 100644 --- a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -115,6 +115,7 @@ void init_generation_config(py::module_& m) { .def_readwrite("include_stop_str_in_output", &GenerationConfig::include_stop_str_in_output) .def_readwrite("stop_token_ids", &GenerationConfig::stop_token_ids) .def_readwrite("adapters", &GenerationConfig::adapters) + .def_readwrite("apply_chat_template", &GenerationConfig::apply_chat_template) .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) .def("is_beam_search", &GenerationConfig::is_beam_search) .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding) diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index ed6263a284..8f2d2b05b9 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -274,7 +274,7 @@ def run_hugging_face( else: processed_prompts = prompts # process all prompts as a single batch as we have a single generation config for all prompts - inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True, padding_side='left') + inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=False, padding_side='left') input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs) hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer) diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index 372a034148..398c531a9a 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ -233,6 +233,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data gen_config.rng_seed = args["seed"] gen_config.num_beams = args["num_beams"] gen_config.do_sample = False + gen_config.apply_chat_template = False if args.get('draft_model', ''): config_info = "Speculative decoding 
config: " if args.get('num_assistant_tokens', None): @@ -379,6 +380,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] gen_config.do_sample = False + gen_config.apply_chat_template = False enable_prompt_permutations = not args.get("disable_prompt_permutation", False) if enable_prompt_permutations: log.warning( diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py index a02b16b2bb..9cc6702999 100644 --- a/tools/llm_bench/task/visual_language_generation.py +++ b/tools/llm_bench/task/visual_language_generation.py @@ -211,6 +211,7 @@ def run_visual_language_generation_genai( gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] gen_config.do_sample = False + gen_config.apply_chat_template = False kwargs = {} if len(images) >= 1: kwargs["images"] = images[0] diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py index 7d4354f846..fa7dc40401 100644 --- a/tools/who_what_benchmark/whowhatbench/wwb.py +++ b/tools/who_what_benchmark/whowhatbench/wwb.py @@ -267,7 +267,7 @@ def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question, us model.finish_chat() return result else: - return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens) + return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens, apply_chat_template=False) def llamacpp_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False): @@ -336,6 +336,7 @@ def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_to config = model.get_generation_config() config.max_new_tokens = max_new_tokens config.do_sample = False + config.apply_chat_template = False model.set_generation_config(config) if tokenizer.chat_template is not None: model.start_chat(tokenizer.chat_template)