Move chat history updates to general function and fix last generated token logic for streaming cases #7544

Workflow file for this run

.github/workflows/causal_lm_cpp.yml at aa1ae76

	name: causal_lm_cpp
	on:
	workflow_dispatch:
	pull_request:
	merge_group:
	push:
	branches:
	- master
	- 'releases/**'

	permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions

	concurrency:
	# github.ref is not unique in post-commit
	group: ${{ github.event_name == 'push' && github.run_id \|\| github.ref }}-causal-lm-cpp
	cancel-in-progress: true

	env:
	l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-17911-83c047443de/l_openvino_toolkit_ubuntu20_2025.1.0.dev20250116_x86_64.tgz
	l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-17911-83c047443de/l_openvino_toolkit_ubuntu22_2025.1.0.dev20250116_x86_64.tgz
	m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-17911-83c047443de/m_openvino_toolkit_macos_12_6_2025.1.0.dev20250116_x86_64.tgz
	w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-17911-83c047443de/w_openvino_toolkit_windows_2025.1.0.dev20250116_x86_64.zip

	jobs:
	cpp-multinomial-greedy_causal_lm-ubuntu:
	runs-on: ubuntu-20.04-8-cores
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.9
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.l_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	sudo ./ov/install_dependencies/install_openvino_dependencies.sh
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2
	optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
	wget https://huggingface.co/smangrul/tinyllama_lora_sql/resolve/main/adapter_model.safetensors?download=true -O adapter_model.safetensors
	- run: >
	. ./ov/setupvars.sh
	&& timeout 35s ./build/samples/cpp/text_generation/multinomial_causal_lm ./open_llama_3b_v2/ a
	env:
	PYTHONPATH: "./build"
	- run: >
	. ./ov/setupvars.sh
	&& timeout 35s ./samples/python/text_generation/multinomial_causal_lm.py ./open_llama_3b_v2/ b
	env:
	PYTHONPATH: "./build"
	- run: >
	. ./ov/setupvars.sh
	&& timeout 35s ./build/samples/cpp/text_generation/greedy_causal_lm ./open_llama_3b_v2/ "return 0"
	\| diff <(timeout 25s samples/python/text_generation/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") -
	env:
	PYTHONPATH: "./build"
	- run: >
	. ./ov/setupvars.sh
	&& samples/python/text_generation/lora.py ./TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/ adapter_model.safetensors "How to create a table with two columns, one of them has type float, another one has type int?"
	env:
	PYTHONPATH: "./build"

	cpp-beam_search_causal_lm-ubuntu:
	strategy:
	matrix:
	executable:
	[
	./build/samples/cpp/text_generation/beam_search_causal_lm,
	python ./samples/python/text_generation/beam_search_causal_lm.py,
	]
	runs-on: ubuntu-20.04
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: '3.10'
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.l_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	sudo ./ov/install_dependencies/install_openvino_dependencies.sh
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
	- name: Compare
	env:
	PYTHONPATH: "./build/" # C++ ignores that
	run: \|
	source ./ov/setupvars.sh
	timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt
	python -c "
	import transformers
	with open('pred.txt', 'r') as file:
	predictions = file.read()
	tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
	prompt = 'Why is the Sun yellow?'
	if tokenizer.chat_template:
	prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
	tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
	for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=99, do_sample=False):
	ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
	idx = predictions.find(ref)
	if -1 == idx:
	raise RuntimeError(f'Missing "{ref=}" from predictions')
	predictions = predictions[:idx] + predictions[idx + len(ref):]
	"
	echo "Why is the Sun yellow?" passed

	timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt
	python -c "
	import transformers
	with open('pred.txt', 'r') as file:
	predictions = file.read()
	tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
	prompt = '69'
	if tokenizer.chat_template:
	prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
	tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
	for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=99, do_sample=False):
	ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
	idx = predictions.find(ref)
	if -1 == idx:
	raise RuntimeError(f'Missing "{ref=}" from predictions')
	predictions = predictions[:idx] + predictions[idx + len(ref):]
	"
	echo 69 passed

	timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt
	python -c "
	import transformers
	with open('pred.txt', 'r') as file:
	predictions = file.read()
	tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
	prompt = 'Hi'
	if tokenizer.chat_template:
	prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
	tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
	for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=99, do_sample=False):
	ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
	idx = predictions.find(ref)
	if -1 == idx:
	raise RuntimeError(f'Missing "{ref=}" from predictions')
	predictions = predictions[:idx] + predictions[idx + len(ref):]
	"
	echo "Hi" passed

	timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt
	python -c "
	import transformers
	with open('pred.txt', 'r') as file:
	predictions = file.read()
	tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
	prompt = 'return 0'
	if tokenizer.chat_template:
	prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
	tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
	for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=99, do_sample=False):
	ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
	idx = predictions.find(ref)
	if -1 == idx:
	raise RuntimeError(f'Missing "{ref=}" from predictions')
	predictions = predictions[:idx] + predictions[idx + len(ref):]
	"
	echo "return 0" passed

	timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "你好！你好嗎？" > ./pred.txt
	python -c "
	import transformers
	with open('pred.txt', 'r', errors='ignore') as file:
	predictions = file.read()
	tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
	prompt = '你好！你好嗎？'
	if tokenizer.chat_template:
	prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
	tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
	for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=99, do_sample=False):
	ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
	idx = predictions.find(ref.replace('�', ''))
	if -1 == idx:
	raise RuntimeError(f'Missing "{ref=}" from predictions')
	predictions = predictions[:idx] + predictions[idx + len(ref):]
	"
	echo "你好！你好嗎？" passed

	timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" "return 0" "你好！你好嗎？" > ./pred.txt
	python -c "
	import transformers
	with open('pred.txt', 'r', errors='ignore') as file:
	predictions = file.read()
	tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
	prompts = [
	'Why is the Sun yellow?',
	'return 0',
	'你好！你好嗎？'
	]
	for prompt in prompts:
	if tokenizer.chat_template:
	prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
	tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
	for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=99, do_sample=False):
	ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
	idx = predictions.find(ref.replace('�', ''))
	if -1 == idx:
	raise RuntimeError(f'Missing "{ref=}" from predictions')
	predictions = predictions[:idx] + predictions[idx + len(ref):]
	"
	echo "Multi prompt" passed

	cpp-greedy_causal_lm-windows:
	runs-on: windows-latest
	env:
	PYTHONIOENCODING: "utf8"
	defaults:
	run:
	shell: cmd
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.9
	- run: curl --output ov.zip ${{ env.w_ov_link }}
	- run: unzip -d ov ov.zip
	- run: dirs=(ov/) && mv ov//* ov && rmdir "${dirs[@]}"
	shell: bash
	- name: Build app
	run: \|
	call .\ov\setupvars.bat
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert model
	run: \|
	call .\ov\setupvars.bat
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
	optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
	curl -o adapter_model.safetensors -s -L https://huggingface.co/smangrul/tinyllama_lora_sql/resolve/main/adapter_model.safetensors?download=true
	- run: >
	set PATH=.\build\openvino_genai\;%PATH%
	&& call .\ov\setupvars.bat
	&& .\build\samples\cpp\text_generation\Release\greedy_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\cpp.txt
	- run: \|
	echo import transformers > ref.py
	echo predictions = open('cpp.txt', 'r').read() >> ref.py
	echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py
	echo prompt = '69' >> ref.py
	echo if tokenizer.chat_template: >> ref.py
	echo prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) >> ref.py
	echo tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) >> ref.py
	echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py
	echo ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py
	echo idx = predictions.find(ref) >> ref.py
	echo if -1 == idx: >> ref.py
	echo raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py
	echo predictions = predictions[:idx] + predictions[idx + len(ref):] >> ref.py
	- run: python ref.py
	- run: >
	set PATH=.\build\openvino_genai\;%PATH%
	&& set "PYTHONPATH=./build/"
	&& call .\ov\setupvars.bat
	&& python samples\python\text_generation\greedy_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\py.txt
	- run: fc .\cpp.txt .\py.txt
	- run: >
	set PATH=.\build\openvino_genai\;%PATH%
	&& set "PYTHONPATH=./build/"
	&& call .\ov\setupvars.bat
	&& python samples\python\text_generation\lora.py .\TinyLlama\TinyLlama-1.1B-intermediate-step-1431k-3T\ adapter_model.safetensors "How to create a table with two columns, one of them has type float, another one has type int?"

	cpp-greedy_causal_lm-Qwen-7B-Chat:
	runs-on: ubuntu-20.04-16-cores
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.11
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.l_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	sudo ./ov/install_dependencies/install_openvino_dependencies.sh
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat
	- run: >
	. ./ov/setupvars.sh
	&& timeout 2m ./build/samples/cpp/text_generation/greedy_causal_lm ./Qwen-7B-Chat/ 69 \| diff <(timeout 2m samples/python/text_generation/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) -
	env:
	PYTHONPATH: "./build"

	cpp-beam_search_causal_lm-Qwen1_5-7B-Chat:
	runs-on: ubuntu-20.04-16-cores
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.12
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.l_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	sudo ./ov/install_dependencies/install_openvino_dependencies.sh
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat
	- run: >
	. ./ov/setupvars.sh
	&& timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好！"
	\| diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好！") -
	env:
	PYTHONPATH: "./build"

	cpp-beam_search_causal_lm-Phi-2:
	runs-on: ubuntu-20.04-16-cores
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.9
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.l_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	sudo ./ov/install_dependencies/install_openvino_dependencies.sh
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2
	- run: >
	. ./ov/setupvars.sh
	&& timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./phi-2/ 69
	\| diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./phi-2/ 69) -
	env:
	PYTHONPATH: "./build"

	cpp-beam_search_causal_lm-notus-7b-v1:
	runs-on: ubuntu-20.04-16-cores
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: '3.10'
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.l_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	sudo ./ov/install_dependencies/install_openvino_dependencies.sh
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1
	- run: >
	. ./ov/setupvars.sh
	&& timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./notus-7b-v1/ 69
	\| diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./notus-7b-v1/ 69) -
	env:
	PYTHONPATH: "./build"

	cpp-speculative_decoding_lm-ubuntu:
	runs-on: ubuntu-20.04-16-cores
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.11
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.l_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	sudo ./ov/install_dependencies/install_openvino_dependencies.sh
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b
	- name: run and compare
	run: \|
	source ./ov/setupvars.sh
	./build/samples/cpp/text_generation/speculative_decoding_lm ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_speculative.txt
	./build/samples/cpp/text_generation/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt
	python ./samples/python/text_generation/speculative_decoding_lm.py ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_py.txt
	python -c "
	with open('predictions_greedy.txt', 'r') as f:
	predicted_greedy = f.readline()
	with open('predictions_speculative.txt', 'r') as f:
	predicted_speculative = f.readline()
	with open('predictions_py.txt', 'r') as f:
	predicted_py = f.readline()
	assert predicted_greedy == predicted_speculative
	assert predicted_greedy == predicted_py
	assert predicted_speculative == predicted_py
	"
	echo "Alan Turing was a" passed
	env:
	PYTHONPATH: "./build/:$PYTHONPATH"
	LD_LIBRARY_PATH: "./build/openvino_genai/:$LD_LIBRARY_PATH"

	cpp-prompt_lookup_decoding_lm-ubuntu:
	runs-on: ubuntu-20.04-16-cores
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.12
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.l_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	sudo ./ov/install_dependencies/install_openvino_dependencies.sh
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
	- name: run and compare
	run: \|
	source ./ov/setupvars.sh

	echo 'Code:```python
	def add(a, b):
	return a + b
	```
	Question: Can you please add 2 and 3
	A:' > ./prompt.txt

	./build/samples/cpp/text_generation/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_prompt_lookup.txt
	./build/samples/cpp/text_generation/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_greedy.txt
	python ./samples/python/text_generation/prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_py.txt
	python -c "
	with open('predictions_greedy.txt', 'r') as f:
	predicted_greedy = f.readline()
	with open('predictions_prompt_lookup.txt', 'r') as f:
	predicted_prompt_lookup = f.readline()
	with open('predictions_py.txt', 'r') as f:
	predicted_prompt_lookup_py = f.readline()
	assert predicted_greedy == predicted_prompt_lookup
	assert predicted_greedy == predicted_prompt_lookup_py
	assert predicted_prompt_lookup == predicted_prompt_lookup_py
	"
	echo "Prompt lookup" passed
	env:
	PYTHONPATH: "./build/:$PYTHONPATH"
	LD_LIBRARY_PATH: "./build/openvino_genai/:$LD_LIBRARY_PATH"

	cpp-Phi-1_5:
	runs-on: ubuntu-20.04-16-cores
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.9
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.l_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	sudo ./ov/install_dependencies/install_openvino_dependencies.sh
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5
	- name: Run Generation
	run: \|
	source ./ov/setupvars.sh
	timeout 50s ./build/samples/cpp/text_generation/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt
	- name: Compare
	run: \|
	python -c "
	import transformers
	with open('pred_greedy.txt', 'r') as file:
	predictions = file.read()
	tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
	prompt = 'Alan Turing was a'
	if tokenizer.chat_template:
	prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
	tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
	for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
	ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
	idx = predictions.find(ref)
	if -1 == idx:
	raise RuntimeError(f'Missing "{ref=}" from predictions')
	predictions = predictions[:idx] + predictions[idx + len(ref):]
	"
	echo Phi-1_5 passed
	- run: >
	. ./ov/setupvars.sh
	&& timeout 50s samples/python/text_generation/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a"
	\| diff ./pred_greedy.txt -
	env:
	PYTHONPATH: "./build"

	cpp-greedy_causal_lm-redpajama-3b-chat:
	runs-on: ubuntu-20.04-8-cores
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: '3.10'
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.l_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	sudo ./ov/install_dependencies/install_openvino_dependencies.sh
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat
	- name: Run Generation
	run: \|
	source ./ov/setupvars.sh
	timeout 50s ./build/samples/cpp/text_generation/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt
	- name: Compare
	run: \|
	python -c "
	import transformers
	with open('pred_greedy.txt', 'r') as file:
	predictions = file.read()
	tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat')
	prompt = 'Alan Turing was a'
	if tokenizer.chat_template:
	prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
	tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
	for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False):
	ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
	idx = predictions.find(ref)
	if -1 == idx:
	raise RuntimeError(f'Missing "{ref}" from predictions')
	predictions = predictions[:idx] + predictions[idx + len(ref):]
	"
	echo "Alan Turing was a" passed
	- run: >
	. ./ov/setupvars.sh
	&& timeout 50s samples/python/text_generation/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a"
	\| diff ./pred_greedy.txt -
	env:
	PYTHONPATH: "./build"

	cpp-chat_sample-ubuntu:
	runs-on: ubuntu-24.04
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.11
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.l_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	sudo ./ov/install_dependencies/install_openvino_dependencies.sh
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
	- name: Compare
	env:
	PYTHONPATH: "./build"
	run: \|
	source ./ov/setupvars.sh
	printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\n' > ./input.txt
	timeout 30s ./build/samples/cpp/text_generation/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt
	python -c "
	from transformers import AutoTokenizer, AutoModelForCausalLM
	model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	model = AutoModelForCausalLM.from_pretrained(model_id)
	prompts = ['What is 2 + 2?', 'What is the previous answer?', 'Add 1 to it.', 'Subtract 5 from it.', 'Why is the sun yellow?', 'What was my first question?']
	def gen_prompt(prompt):
	return {'role': 'user', 'content': prompt}
	def gen_answer(answer):
	return {'role': 'assistant', 'content': answer}
	chat_history = []
	chat_prompt = ''
	output = open('ref.txt', 'w')
	for prompt in prompts:
	output.write('question:\n')
	chat_history.append(gen_prompt(prompt))
	chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
	tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
	answer = model.generate(**tokenized, max_length=1000, do_sample=False)
	answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
	chat_history.append(gen_answer(answer_str))
	output.write(answer_str)
	output.write('\n----------\n')
	output.write('question:\n')
	output.close()
	"
	diff pred.txt ref.txt
	echo "Chat sample cpp" passed
	timeout 30s ./samples/python/text_generation/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt
	diff pred2.txt ref.txt
	echo "Chat sample python" passed

	visual_language_chat_sample-ubuntu-minicpm_v2_6:
	runs-on: ubuntu-22.04-16-cores
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.11
	- uses: ./.github/actions/install_openvino
	with:
	ov_link: ${{ env.l_u22_ov_link }}
	- uses: ./.github/actions/build_app
	with:
	build_target: 'visual_language_chat benchmark_vlm py_openvino_genai'
	- uses: ./.github/actions/install_python_deps
	- name: Download and convert tiny-random-minicpmv-2_6 model and an image
	run: \|
	source ./ov/setupvars.sh
	optimum-cli export openvino -m katuni4ka/tiny-random-minicpmv-2_6 tiny-random-minicpmv-2_6 --trust-remote-code --task image-text-to-text
	mkdir images
	- name: Generate images - tiny-random-minicpmv-2_6
	shell: python
	run: \|
	from PIL import Image
	import numpy as np
	import requests
	res = 28, 28
	lines = np.arange(res[0] * res[1] * 3, dtype=np.uint8) % 255
	lines = lines.reshape([*res, 3])
	lines_image = Image.fromarray(lines)
	lines_image.save("images/lines.png")
	cat = Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw).convert('RGB')
	cat.save("images/cat.png")
	- name: Run visual_language_chat C++ sample - tiny-random-minicpmv-2_6
	run: >
	set -o pipefail
	&& source ./ov/setupvars.sh
	&& ./build/samples/cpp/visual_language_chat/visual_language_chat ./tiny-random-minicpmv-2_6/ ./images/
	<<< $'Describe the images?' \| tee cpp.txt
	timeout-minutes: 2
	- name: Run benchmark_vlm C++ sample - tiny-random-minicpmv-2_6
	run: >
	set -o pipefail
	&& source ./ov/setupvars.sh
	&& ./build/samples/cpp/visual_language_chat/benchmark_vlm -m ./tiny-random-minicpmv-2_6/ -i ./images/cat.png -n 3
	timeout-minutes: 2
	- name: Run visual_language_chat Python sample - tiny-random-minicpmv-2_6
	run: >
	set -o pipefail
	&& source ./ov/setupvars.sh
	&& ./samples/python/visual_language_chat/visual_language_chat.py ./tiny-random-minicpmv-2_6/ ./images/
	<<< $'Describe the images?' \| tee py.txt
	env:
	PYTHONPATH: "./build/"
	- name: Run benchmark_vlm Python sample - tiny-random-minicpmv-2_6
	run: >
	set -o pipefail
	&& source ./ov/setupvars.sh
	&& ./samples/python/visual_language_chat/benchmark_vlm.py -m ./tiny-random-minicpmv-2_6/ -i ./images/cat.png -n 3
	env:
	PYTHONPATH: "./build/"
	- name: Encode cpp.txt with Python encoding instead of terminal one
	shell: python
	run: \|
	with open("cpp.txt", "rb") as f:
	content = f.read().decode("utf-8", "replace")
	with open("cpp.txt", "wb") as f:
	f.write(content.encode("utf-8"))
	- run: diff cpp.txt py.txt
	- name: Run visual_language_chat C++ sample with 2 prompts - tiny-random-minicpmv-2_6
	run: >
	set -o pipefail
	&& source ./ov/setupvars.sh
	&& ./build/samples/cpp/visual_language_chat/visual_language_chat ./tiny-random-minicpmv-2_6/ ./images/cat.png
	<<< $'What is unusual on this image?\nGo on.' \| tee cpp2.txt
	timeout-minutes: 2
	- name: Run visual_language_chat Python sample with 2 prompts - tiny-random-minicpmv-2_6
	run: >
	set -o pipefail
	&& source ./ov/setupvars.sh
	&& ./samples/python/visual_language_chat/visual_language_chat.py ./tiny-random-minicpmv-2_6/ ./images/cat.png
	<<< $'What is unusual on this image?\nGo on.' \| tee py2.txt
	env:
	PYTHONPATH: "./build/"
	- name: Encode cpp2.txt with Python encoding instead of terminal one
	shell: python
	run: \|
	with open("cpp2.txt", "rb") as f:
	content = f.read().decode("utf-8", "replace")
	with open("cpp2.txt", "wb") as f:
	f.write(content.encode("utf-8"))
	- run: diff cpp2.txt py2.txt

	visual_language_chat_sample-ubuntu-llava_1_5:
	uses: ./.github/workflows/job_vlm_sample_llava.yml
	with:
	model_id: llava-hf/llava-1.5-7b-hf
	model_dir: llava_1_5_7b_ov

	visual_language_chat_sample-ubuntu-llava_next:
	uses: ./.github/workflows/job_vlm_sample_llava.yml
	with:
	model_id: llava-hf/llava-v1.6-mistral-7b-hf
	model_dir: llava_v1_6_mistral_7b_ov

	visual_language_chat_sample-ubuntu-internvl2:
	runs-on: ubuntu-22.04-16-cores
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.11
	- uses: ./.github/actions/install_openvino
	with:
	ov_link: ${{ env.l_u22_ov_link }}
	- uses: ./.github/actions/build_app
	with:
	build_target: 'visual_language_chat py_openvino_genai'
	- uses: ./.github/actions/install_python_deps
	- name: Download and convert InternVL2 model
	run: \|
	# Lowering transformers version, workaround for https://huggingface.co/OpenGVLab/InternVL2-1B/discussions/7
	python -m pip install -U "transformers<4.45.0"
	source ./ov/setupvars.sh
	optimum-cli export openvino --model OpenGVLab/InternVL2-4B ./internvl2_4b_ov/ --trust-remote-code
	- name: Download images
	run: \|
	wget https://llava-vl.github.io/static/images/monalisa.jpg
	- name: Run visual_language_chat C++ sample - InternVL2
	run: >
	source ./ov/setupvars.sh
	&& ./build/samples/cpp/visual_language_chat/visual_language_chat ./internvl2_4b_ov/ monalisa.jpg
	<<< $'Who drew this painting?\nWhen did the painter live?'
	timeout-minutes: 4

	visual_language_chat_sample-ubuntu-qwen2vl:
	runs-on: ubuntu-22.04-16-cores
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.11
	- uses: ./.github/actions/install_openvino
	with:
	ov_link: ${{ env.l_u22_ov_link }}
	- uses: ./.github/actions/build_app
	with:
	build_target: 'visual_language_chat py_openvino_genai'
	- uses: ./.github/actions/install_python_deps
	- name: Download and convert Qwen2VL model
	run: \|
	source ./ov/setupvars.sh
	optimum-cli export openvino --model Qwen/Qwen2-VL-2B-Instruct ./qwen2_vl_2b_ov/ --trust-remote-code
	- name: Download images
	run: \|
	wget https://llava-vl.github.io/static/images/monalisa.jpg
	- name: Run visual_language_chat C++ sample - Qwen2VL
	run: >
	source ./ov/setupvars.sh
	&& ./build/samples/cpp/visual_language_chat/visual_language_chat ./qwen2_vl_2b_ov/ monalisa.jpg
	<<< $'Who drew this painting?\nWhen did the painter live?'
	timeout-minutes: 4

	cpp-continuous-batching-ubuntu:
	runs-on: ubuntu-20.04-8-cores
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.12
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.l_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	sudo ./ov/install_dependencies/install_openvino_dependencies.sh
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
	- name: Run gtests
	run: \|
	source ./ov/setupvars.sh
	./build/tests/cpp/tests_continuous_batching
	- name: Run accuracy_sample
	run: \|
	source ./ov/setupvars.sh
	timeout --verbose 50s ./build/tools/continuous_batching/accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5
	- name: Run throughput_benchmark
	run: \|
	wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
	source ./ov/setupvars.sh
	timeout --verbose 200s ./build/tools/continuous_batching/benchmark/continuous_batching_benchmark -n 10 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1
	timeout --verbose 200s ./build/tools/continuous_batching/benchmark/continuous_batching_benchmark -n 10 --dynamic_split_fuse --max_batch_size 256 --max_input_len 256 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1

	cpp-continuous-batching-windows:
	runs-on: windows-latest
	env:
	PYTHONIOENCODING: "utf8"
	defaults:
	run:
	shell: cmd
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.9
	- name: Install OpenVINO
	run: \|
	curl --output ov.zip ${{ env.w_ov_link }}
	unzip -d ov ov.zip
	dirs=(ov/) && mv ov//* ov && rmdir "${dirs[@]}"
	shell: bash
	- name: Build app
	run: \|
	call .\ov\setupvars.bat
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	call .\ov\setupvars.bat
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
	- name: Run gtests
	run: \|
	set PATH=.\build\openvino_genai\;%PATH%
	call .\ov\setupvars.bat
	.\build\tests\cpp\Release\tests_continuous_batching.exe
	- name: Run accuracy_sample
	run: \|
	set PATH=.\build\openvino_genai\;%PATH%
	call .\ov\setupvars.bat
	.\build\tools\continuous_batching\accuracy\Release\continuous_batching_accuracy.exe -m .\TinyLlama-1.1B-Chat-v1.0\ -n 5
	- name: Run throughput_benchmark
	run: \|
	curl -o .\ShareGPT_V3_unfiltered_cleaned_split.json -s -L "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
	set PATH=.\build\openvino_genai\;%PATH%
	call .\ov\setupvars.bat
	.\build\tools\continuous_batching\benchmark\Release\continuous_batching_benchmark.exe -n 2 -m .\TinyLlama-1.1B-Chat-v1.0\ --dataset .\ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1

	cpp-continuous-batching-macos:
	runs-on: macos-13
	defaults:
	run:
	shell: bash
	steps:
	- uses: actions/checkout@v4
	with:
	submodules: recursive
	- uses: actions/setup-python@v4
	with:
	python-version: 3.9
	- name: Install OpenVINO
	run: \|
	mkdir ./ov/
	curl ${{ env.m_ov_link }} \| tar --directory ./ov/ --strip-components 1 -xz
	brew install coreutils scons
	- name: Build app
	run: \|
	source ./ov/setupvars.sh
	cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
	cmake --build ./build/ --config Release -j
	- name: Download and convert and model
	run: \|
	source ./ov/setupvars.sh
	python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
	python -m pip install -r ./samples/requirements.txt
	optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
	- name: Run gtests
	run: \|
	source ./ov/setupvars.sh
	./build/tests/cpp/tests_continuous_batching
	- name: Run accuracy_sample
	run: \|
	source ./ov/setupvars.sh
	timeout --verbose 120s ./build/tools/continuous_batching/accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5
	- name: Run throughput_benchmark
	run: \|
	wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
	source ./ov/setupvars.sh
	./build/tools/continuous_batching/benchmark/continuous_batching_benchmark -n 5 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1

	Overall_Status:
	name: ci/gha_overall_status_causal_lm
	needs: [cpp-multinomial-greedy_causal_lm-ubuntu, cpp-beam_search_causal_lm-ubuntu, cpp-greedy_causal_lm-windows,
	cpp-greedy_causal_lm-Qwen-7B-Chat, cpp-beam_search_causal_lm-Qwen1_5-7B-Chat, cpp-beam_search_causal_lm-Phi-2,
	cpp-beam_search_causal_lm-notus-7b-v1, cpp-speculative_decoding_lm-ubuntu, cpp-prompt_lookup_decoding_lm-ubuntu,
	cpp-Phi-1_5, cpp-greedy_causal_lm-redpajama-3b-chat, cpp-chat_sample-ubuntu, cpp-continuous-batching-ubuntu,
	visual_language_chat_sample-ubuntu-minicpm_v2_6, visual_language_chat_sample-ubuntu-llava_1_5, visual_language_chat_sample-ubuntu-llava_next, visual_language_chat_sample-ubuntu-internvl2,
	cpp-continuous-batching-windows, cpp-continuous-batching-macos]
	if: ${{ always() }}
	runs-on: ubuntu-latest
	steps:
	- name: Check status of all jobs
	if: >-
	${{
	contains(needs.*.result, 'failure') \|\|
	contains(needs.*.result, 'cancelled')
	}}
	run: exit 1

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Move chat history updates to general function and fix last generated token logic for streaming cases #7544

Workflow file

Move chat history updates to general function and fix last generated token logic for streaming cases #7544

Jobs

Run details

Workflow file for this run