From d6fbb642d025e6ebfc5eedd95b8692f31f78a484 Mon Sep 17 00:00:00 2001
From: sbalandi <sofya.balandina@intel.com>
Date: Wed, 22 Jan 2025 15:39:12 +0000
Subject: [PATCH] Add apply_chat_template flag to GenerationConfig

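Add an apply_chat_template flag to GenerationConfig (default: true) so that
automatic chat template application can be disabled for plain, non-chat
generate() calls without clearing the tokenizer's chat template. The flag is
respected by the stateful, static and continuous batching LLM pipelines and
by the VLM inputs embedder, is exposed through the Python bindings, and is
set to false in llm_bench and who_what_benchmark to preserve raw-prompt
behaviour. CI reference generation and the python tests now tokenize
templated prompts with add_special_tokens=False to match the pipelines.

A minimal sketch of the intended Python usage (the model directory and
device below are placeholders, not part of this patch):

    import openvino_genai

    # Placeholder: a directory containing an OpenVINO-converted model.
    model_dir = "./TinyLlama-1.1B-Chat-v1.0/"
    pipe = openvino_genai.LLMPipeline(model_dir, "CPU")

    # Default behaviour: the tokenizer's chat template is applied to the prompt.
    templated = pipe.generate("Why is the Sun yellow?", max_new_tokens=20)

    # Skip the chat template and pass the raw prompt to the model.
    raw = pipe.generate("Why is the Sun yellow?", max_new_tokens=20,
                        apply_chat_template=False)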
---
 .github/workflows/causal_lm_cpp.yml           | 42 +++++++-------
 .../openvino/genai/generation_config.hpp      |  2 +
 .../include/openvino/genai/llm_pipeline.hpp   |  4 +-
 .../genai/visual_language/pipeline.hpp        |  8 +--
 src/cpp/src/generation_config.cpp             |  1 +
 src/cpp/src/icontinuous_batching.cpp          |  5 +-
 src/cpp/src/llm_pipeline_stateful.cpp         | 17 +++---
 src/cpp/src/llm_pipeline_static.cpp           |  4 +-
 .../src/visual_language/inputs_embedder.cpp   | 55 ++++++++++---------
 .../src/visual_language/inputs_embedder.hpp   |  2 +-
 src/cpp/src/visual_language/pipeline.cpp      |  2 +-
 .../openvino_genai/py_openvino_genai.pyi      |  1 +
 src/python/py_generation_config.cpp           |  1 +
 tests/python_tests/common.py                  |  2 +-
 tools/llm_bench/task/text_generation.py       |  2 +
 .../task/visual_language_generation.py        |  1 +
 tools/who_what_benchmark/whowhatbench/wwb.py  |  3 +-
 17 files changed, 83 insertions(+), 69 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index c5e8ec0996..5dff0a58d3 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -122,8 +122,8 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           prompt = 'Why is the Sun yellow?'
           if tokenizer.chat_template:
-            prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -141,8 +141,8 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           prompt = '69'
           if tokenizer.chat_template:
-            prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -161,7 +161,7 @@ jobs:
           prompt = 'Hi'
           if tokenizer.chat_template:
             prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -179,8 +179,8 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           prompt = 'return 0'
           if tokenizer.chat_template:
-            prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -198,8 +198,8 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           prompt = '你好! 你好嗎?'
           if tokenizer.chat_template:
-            prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
               ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref.replace('�', ''))
@@ -209,21 +209,21 @@ jobs:
           "
           echo "你好! 你好嗎?" passed
 
-          timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt
+          timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" "return 0" "你好! 你好嗎?" > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r', errors='ignore') as file:
               predictions = file.read()
           tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
           prompts = [
-            'Alan Turing was a',
+            'Why is the Sun yellow?',
             'return 0',
             '你好! 你好嗎?'
           ]
           for prompt in prompts:
             if tokenizer.chat_template:
-              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-            tokenized = tokenizer(prompt, return_tensors='pt')
+                prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+            tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
             for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
                 ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
                 idx = predictions.find(ref.replace('�', ''))
@@ -272,10 +272,10 @@ jobs:
           echo import transformers > ref.py
           echo predictions = open('cpp.txt', 'r').read() >> ref.py
           echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py
-          echo prompt = '69'
-          echo  if tokenizer.chat_template:
-          echo    prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          echo tokenized = tokenizer(prompt, return_tensors='pt') >> ref.py
+          echo prompt = '69' >> ref.py
+          echo if tokenizer.chat_template: >> ref.py
+          echo     prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) >> ref.py
+          echo tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) >> ref.py
           echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py
           echo     ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py
           echo     idx = predictions.find(ref) >> ref.py
@@ -584,8 +584,8 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
           prompt = 'Alan Turing was a'
           if tokenizer.chat_template:
-            prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
               ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
@@ -642,8 +642,8 @@ jobs:
           tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat')
           prompt = 'Alan Turing was a'
           if tokenizer.chat_template:
-            prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
-          tokenized = tokenizer(prompt, return_tensors='pt')
+              prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
+          tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
           for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False):
               ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
               idx = predictions.find(ref)
diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp
index 3a75fc02ea..cd372e635d 100644
--- a/src/cpp/include/openvino/genai/generation_config.hpp
+++ b/src/cpp/include/openvino/genai/generation_config.hpp
@@ -128,6 +128,8 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
 
     std::optional<AdapterConfig> adapters;
 
+    bool apply_chat_template = true;
+
     /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0.
      * Otherwise verifies eos_token_id == tokenizer_eos_token_id.
      */
diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp
index fa1e51c7d9..26232574dc 100644
--- a/src/cpp/include/openvino/genai/llm_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -178,7 +178,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     * @param streamer optional streamer
     * @return DecodedResults decoded resulting text
     * chat_template will be applied to the prompt, run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it.
-    * Use custom_chat_template = "" to disable it for non-chat mode.
+    * To disable it for non-chat mode, set custom_chat_template to an empty string ("") or set generation_config.apply_chat_template to false.
     */
     DecodedResults generate(
         StringInputs inputs,
@@ -194,7 +194,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     * @param properties properties
     * @return DecodedResults decoded resulting text
     * chat_template will be applied to the prompt, run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it.
-    * Use custom_chat_template = "" to disable it for non-chat mode.
+    * To disable it for non-chat mode, set custom_chat_template to an empty string ("") or set generation_config.apply_chat_template to false.
     */
     template <typename... Properties>
     util::EnableIfAllStringAny<DecodedResults, Properties...> generate(
diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp
index 48535722e9..b6b1d5c7f6 100644
--- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp
+++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp
@@ -99,7 +99,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// @param streamer A streamer to acquire intermediate result.
     /// @return A string generated by a model.
     /// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
-    /// Use custom_chat_template="" to disable it for non-chat mode.
+    /// To disable it for non-chat mode, set custom_chat_template to an empty string ("") or set generation_config.apply_chat_template to false.
     VLMDecodedResults generate(
         const std::string& prompt,
         const std::vector<ov::Tensor>& rgbs,
@@ -114,7 +114,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// @param streamer A streamer to acquire intermediate result.
     /// @return A string generated by a model.
     /// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
-    /// Use custom_chat_template="" to disable it for non-chat mode.
+    /// To disable it for non-chat mode, set custom_chat_template to an empty string ("") or set generation_config.apply_chat_template to false.
     VLMDecodedResults generate(
         const std::string& prompt,
         const ov::Tensor& rgb,
@@ -129,7 +129,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// images.
     /// @return A string generated by a model.
     /// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
-    /// Use custom_chat_template="" to disable it for non-chat mode.
+    /// To disable it for non-chat mode, set custom_chat_template to an empty string ("") or set generation_config.apply_chat_template to false.
     VLMDecodedResults generate(
         const std::string& prompt,
         const ov::AnyMap& config_map
@@ -144,7 +144,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// ov::AnyMap.
     /// @return A string generated by a model.
     /// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it.
-    /// Use custom_chat_template="" to disable it for non-chat mode.
+    /// To disable it for non-chat mode, set custom_chat_template to an empty string ("") or set generation_config.apply_chat_template to false.
     template <typename... Properties>
     util::EnableIfAllStringAny<VLMDecodedResults, Properties...> generate(
         const std::string& prompt,
diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp
index 67682be787..dba5aaa5bd 100644
--- a/src/cpp/src/generation_config.cpp
+++ b/src/cpp/src/generation_config.cpp
@@ -125,6 +125,7 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& properties) {
     read_anymap_param(properties, "logprobs", logprobs);
     read_anymap_param(properties, "num_return_sequences", num_return_sequences);
     read_anymap_param(properties, "adapters", adapters);
+    read_anymap_param(properties, "apply_chat_template", apply_chat_template);
 
     // penalties
     read_anymap_param(properties, "frequency_penalty", frequency_penalty);
diff --git a/src/cpp/src/icontinuous_batching.cpp b/src/cpp/src/icontinuous_batching.cpp
index d4fa7c3e5d..6b748d6665 100644
--- a/src/cpp/src/icontinuous_batching.cpp
+++ b/src/cpp/src/icontinuous_batching.cpp
@@ -53,10 +53,11 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     } else {
         input_ids.reserve(prompts.size());
         timer.start();
-        for (const std::string& prompt : prompts) {
+        for (size_t i = 0; i < prompts.size(); i++) {
+            const std::string& prompt = prompts.at(i);
             const auto encode_start = std::chrono::steady_clock::now();
             ov::Tensor encoded_inputs;
-            if (!m_tokenizer.get_chat_template().empty()) {
+            if (sampling_params.at(i).apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
                 ChatHistory history({{{"role", "user"}, {"content", prompt}}});
                 constexpr bool add_generation_prompt = true;
                 auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp
index 18e9d30ebc..250468ef9f 100644
--- a/src/cpp/src/llm_pipeline_stateful.cpp
+++ b/src/cpp/src/llm_pipeline_stateful.cpp
@@ -9,6 +9,8 @@
 #include "text_callback_streamer.hpp"
 #include "utils.hpp"
 
+#include "debug_utils.hpp"
+
 namespace ov::genai {
 
 StatefulLLMPipeline::StatefulLLMPipeline(
@@ -88,19 +90,18 @@ DecodedResults StatefulLLMPipeline::generate(
 
     if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
         OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
-        std::vector<std::string> templated_input_vector;
-        for (auto& input : *input_vector) {
-            if (!m_tokenizer.get_chat_template().empty()) {
+        if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
+            std::vector<std::string> templated_input_vector;
+            for (auto& input : *input_vector) {
                 ChatHistory history({{{"role", "user"}, {"content", input}}});
                 constexpr bool add_generation_prompt = true;
                 auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
                 templated_input_vector.push_back(templated_prompt);
-            } else {
-                // in case when chat_template was not found in tokenizer_config.json or set
-                templated_input_vector.push_back(input);
             }
+            encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false));
+        } else {
+            encoded_input = m_tokenizer.encode(*input_vector);
         }
-        encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false));
     } else if (auto input_prompt = std::get_if<std::string>(&inputs)) {
         std::string& prompt = *input_prompt;
 
@@ -170,7 +171,7 @@ DecodedResults StatefulLLMPipeline::generate(
             // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
         } else {
             std::string& prompt = *input_prompt;
-            if (!m_tokenizer.get_chat_template().empty()) {
+            if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
                 ChatHistory history({{{"role", "user"}, {"content", prompt}}});
                 constexpr bool add_generation_prompt = true;
                 auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 3f0be25613..0d84ef4f3c 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -827,7 +827,7 @@ DecodedResults StatefulLLMPipeline::generate(
         // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF
         tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false));
     } else {
-        if (!m_tokenizer.get_chat_template().empty()) {
+        if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
             ChatHistory history({{{"role", "user"}, {"content", prompt}}});
             constexpr bool add_generation_prompt = true;
             auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
@@ -1302,7 +1302,7 @@ DecodedResults StatelessLLMPipeline::generate(
         // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF
         tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false));
     } else {
-        if (!m_tokenizer.get_chat_template().empty()) {
+        if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
             ChatHistory history({{{"role", "user"}, {"content", prompt}}});
             constexpr bool add_generation_prompt = true;
             auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index 616a12bb6f..a47f44a778 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -50,7 +50,7 @@ class InputsEmbedder::IInputsEmbedder {
     ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0};
 
 public:
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) = 0;
 
     virtual std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size) {
         ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }};
@@ -155,7 +155,7 @@ class InputsEmbedder::IInputsEmbedder {
         ),
         m_tokenizer(tokenizer) { }
 
-    ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}) {
+    ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}, bool apply_chat_template = true) {
         ov::Tensor encoded_input_ids;
         if (m_is_chat_conversation) {
             // KV cache in model already contains prompts and answers from previous iterations.
@@ -169,9 +169,9 @@ class InputsEmbedder::IInputsEmbedder {
             m_history.push_back({{"role", "user"}, {"content", prompt}});
             constexpr bool add_generation_prompt = true;
             std::string new_templated_chat_history;
-            if (!m_tokenizer.get_chat_template().empty()) {
+            try {
                 new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
-            } else {
+            } catch (const std::exception& error) {
                 // Use fallback chat template if it was not found in tokenizer_config.json
                 new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback);
             }
@@ -223,19 +223,22 @@ class InputsEmbedder::IInputsEmbedder {
             m_tokenized_history.clear();
             std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history));
         } else {
-            std::string templated_prompt;
-            ChatHistory history({{{"role", "user"}, {"content", prompt}}});
-            constexpr bool add_generation_prompt = true;
+            auto start_tokenizer_time = std::chrono::steady_clock::now();
+            if (apply_chat_template) {
+                std::string templated_prompt;
+                ChatHistory history({{{"role", "user"}, {"content", prompt}}});
+                constexpr bool add_generation_prompt = true;
 
-            if (!m_tokenizer.get_chat_template().empty()) {
-                templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+                if (!m_tokenizer.get_chat_template().empty()) {
+                    templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+                } else {
+                    // Use fallback chat template if it was not found in tokenizer_config.json
+                    templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt, chat_template_fallback);
+                }
+                encoded_input_ids = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids;
             } else {
-                // Use fallback chat template if it was not found in tokenizer_config.json
-                templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt, chat_template_fallback);
+                encoded_input_ids = m_tokenizer.encode(prompt).input_ids;
             }
-
-            auto start_tokenizer_time = std::chrono::steady_clock::now();
-            encoded_input_ids = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids;
             auto end_tokenizer_time = std::chrono::steady_clock::now();
             metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
             m_tokenized_history.clear();
@@ -331,7 +334,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
             m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
         }
 
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override {
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override {
         std::string images_prompt;
         std::vector<EncodedImage> embeds;
 
@@ -366,7 +369,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
         }
         images_prompt += prompt;
 
-        ov::Tensor encoded_input = get_encoded_input_ids(images_prompt, metrics);
+        ov::Tensor encoded_input = get_encoded_input_ids(images_prompt, metrics, {}, apply_chat_template);
 
         ov::Tensor inputs_embeds = m_embedding.infer(encoded_input);
         OPENVINO_ASSERT(
@@ -629,7 +632,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder {
         const ov::AnyMap device_config) :
         IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { }
 
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override {
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override {
         std::string image_token = m_vlm_config.im_start;
         // Adapted from llava-1.5-7b-hf chat_template.json
         std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}";
@@ -647,7 +650,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder {
         }
         formatted_prompt += prompt;
 
-        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback);
+        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback, apply_chat_template);
         ov::Tensor text_embeds = m_embedding.infer(input_ids);
 
         if (images.empty()) {
@@ -742,7 +745,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA {
         const ov::AnyMap device_config) :
         InputsEmbedderLLaVA(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { }
 
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override {
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override {
         std::string image_token = m_vlm_config.im_start;
         // Adapted from llava-1.5-7b-hf chat_template.json
         std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}";
@@ -774,7 +777,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA {
         }
         formatted_prompt += prompt;
 
-        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback);
+        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback, apply_chat_template);
         ov::Tensor text_embeds = m_embedding.infer(input_ids);
 
         if (images.empty()) {
@@ -1069,7 +1072,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder {
         const ov::AnyMap device_config) :
         IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { }
 
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override {
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override {
         std::string image_start_token = m_vlm_config.image_start_token;
         std::string image_context_token = m_vlm_config.image_context_token;
         std::string image_end_token = m_vlm_config.image_end_token;
@@ -1097,7 +1100,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder {
         }
         formatted_prompt += prompt;
 
-        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics);
+        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, {}, apply_chat_template);
         ov::Tensor text_embeds = m_embedding.infer(input_ids);
 
         if (images.empty()) {
@@ -1214,7 +1217,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder {
             ).create_infer_request();
         }
     
-    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) override {
+    virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) override {
         std::string formatted_prompt;
 
         std::vector<ov::Tensor> single_images = to_single_image_tensors(images);
@@ -1246,7 +1249,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder {
 
         // Adapted from Qwen/Qwen2-7B-Instruct
         std::string chat_template_fallback = "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}";
-        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback);
+        ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, metrics, chat_template_fallback, apply_chat_template);
         ov::Tensor text_embeds = m_embedding.infer(input_ids);
 
         if (images.empty()) {
@@ -1616,8 +1619,8 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config,
     }
 }
 
-ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) {
-    return m_impl->get_inputs_embeds(prompt, images, metrics);
+ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template) {
+    return m_impl->get_inputs_embeds(prompt, images, metrics, apply_chat_template);
 }
 
 std::pair<ov::Tensor, std::optional<int64_t>> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) {
diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp
index 223d090b22..b21d0198cc 100644
--- a/src/cpp/src/visual_language/inputs_embedder.hpp
+++ b/src/cpp/src/visual_language/inputs_embedder.hpp
@@ -32,7 +32,7 @@ class InputsEmbedder {
                    const ov::AnyMap device_config);
 
     // compute input embedding for prompt and multiple images
-    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics);
+    ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics, bool apply_chat_template);
 
     // compute position ids for language model input
     std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size);
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 95e3064548..415e7fcae9 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -166,7 +166,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         generation_config.validate();
 
         auto start_get_inputs_embeds = std::chrono::steady_clock::now();
-        ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics);
+        ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics, generation_config.apply_chat_template);
         auto end_get_inputs_embeds = std::chrono::steady_clock::now();
 
         auto to_remove_from_hist = m_inputs_embedder->get_num_tokens_to_remove_from_hist();
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index bba366401e..5dbf595d24 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -572,6 +572,7 @@ class GenerationConfig:
         num_return_sequences: the number of sequences to generate from a single prompt.
     """
     adapters: AdapterConfig | None
+    apply_chat_template: bool
     assistant_confidence_threshold: float
     diversity_penalty: float
     do_sample: bool
diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp
index e2a6d7062c..a7d7789a55 100644
--- a/src/python/py_generation_config.cpp
+++ b/src/python/py_generation_config.cpp
@@ -115,6 +115,7 @@ void init_generation_config(py::module_& m) {
         .def_readwrite("include_stop_str_in_output", &GenerationConfig::include_stop_str_in_output)
         .def_readwrite("stop_token_ids", &GenerationConfig::stop_token_ids)
         .def_readwrite("adapters", &GenerationConfig::adapters)
+        .def_readwrite("apply_chat_template", &GenerationConfig::apply_chat_template)
         .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id"))
         .def("is_beam_search", &GenerationConfig::is_beam_search)
         .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding)
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index ed6263a284..8f2d2b05b9 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -274,7 +274,7 @@ def run_hugging_face(
         else:
             processed_prompts = prompts
         # process all prompts as a single batch as we have a single generation config for all prompts
-        inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True, padding_side='left')
+        inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=False, padding_side='left')
         input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
         hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs)
         hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index 372a034148..398c531a9a 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -233,6 +233,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     gen_config.rng_seed = args["seed"]
     gen_config.num_beams = args["num_beams"]
     gen_config.do_sample = False
+    gen_config.apply_chat_template = False
     if args.get('draft_model', ''):
         config_info = "Speculative decoding config: "
         if args.get('num_assistant_tokens', None):
@@ -379,6 +380,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
     gen_config.max_new_tokens = max_gen_tokens
     gen_config.num_beams = args["num_beams"]
     gen_config.do_sample = False
+    gen_config.apply_chat_template = False
     enable_prompt_permutations = not args.get("disable_prompt_permutation", False)
     if enable_prompt_permutations:
         log.warning(
diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py
index a02b16b2bb..9cc6702999 100644
--- a/tools/llm_bench/task/visual_language_generation.py
+++ b/tools/llm_bench/task/visual_language_generation.py
@@ -211,6 +211,7 @@ def run_visual_language_generation_genai(
     gen_config.max_new_tokens = max_gen_tokens
     gen_config.num_beams = args["num_beams"]
     gen_config.do_sample = False
+    gen_config.apply_chat_template = False
     kwargs = {}
     if len(images) >= 1:
         kwargs["images"] = images[0]
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index 7d4354f846..fa7dc40401 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -267,7 +267,7 @@ def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question, us
         model.finish_chat()
         return result
     else:
-        return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens)
+        return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens, apply_chat_template=False)
 
 
 def llamacpp_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False):
@@ -336,6 +336,7 @@ def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_to
     config = model.get_generation_config()
     config.max_new_tokens = max_new_tokens
     config.do_sample = False
+    config.apply_chat_template = False
     model.set_generation_config(config)
     if tokenizer.chat_template is not None:
         model.start_chat(tokenizer.chat_template)