Skip to content

Move chat history updates to general function and fix last generated token logic for streaming cases #7544

Move chat history updates to general function and fix last generated token logic for streaming cases

Move chat history updates to general function and fix last generated token logic for streaming cases #7544

Workflow file for this run

name: causal_lm_cpp
on:
workflow_dispatch:
pull_request:
merge_group:
push:
branches:
- master
- 'releases/**'
permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions
concurrency:
# github.ref is not unique in post-commit
group: ${{ github.event_name == 'push' && github.run_id || github.ref }}-causal-lm-cpp
cancel-in-progress: true
env:
l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-17911-83c047443de/l_openvino_toolkit_ubuntu20_2025.1.0.dev20250116_x86_64.tgz
l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-17911-83c047443de/l_openvino_toolkit_ubuntu22_2025.1.0.dev20250116_x86_64.tgz
m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-17911-83c047443de/m_openvino_toolkit_macos_12_6_2025.1.0.dev20250116_x86_64.tgz
w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-17911-83c047443de/w_openvino_toolkit_windows_2025.1.0.dev20250116_x86_64.zip
jobs:
cpp-multinomial-greedy_causal_lm-ubuntu:
runs-on: ubuntu-20.04-8-cores
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2
optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
wget https://huggingface.co/smangrul/tinyllama_lora_sql/resolve/main/adapter_model.safetensors?download=true -O adapter_model.safetensors
- run: >
. ./ov/setupvars.sh
&& timeout 35s ./build/samples/cpp/text_generation/multinomial_causal_lm ./open_llama_3b_v2/ a
env:
PYTHONPATH: "./build"
- run: >
. ./ov/setupvars.sh
&& timeout 35s ./samples/python/text_generation/multinomial_causal_lm.py ./open_llama_3b_v2/ b
env:
PYTHONPATH: "./build"
- run: >
. ./ov/setupvars.sh
&& timeout 35s ./build/samples/cpp/text_generation/greedy_causal_lm ./open_llama_3b_v2/ "return 0"
| diff <(timeout 25s samples/python/text_generation/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") -
env:
PYTHONPATH: "./build"
- run: >
. ./ov/setupvars.sh
&& samples/python/text_generation/lora.py ./TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/ adapter_model.safetensors "How to create a table with two columns, one of them has type float, another one has type int?"
env:
PYTHONPATH: "./build"
cpp-beam_search_causal_lm-ubuntu:
strategy:
matrix:
executable:
[
./build/samples/cpp/text_generation/beam_search_causal_lm,
python ./samples/python/text_generation/beam_search_causal_lm.py,
]
runs-on: ubuntu-20.04
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
- name: Compare
env:
PYTHONPATH: "./build/" # C++ ignores that
run: |
source ./ov/setupvars.sh
timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt
python -c "
import transformers
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
prompt = 'Why is the Sun yellow?'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
if -1 == idx:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo "Why is the Sun yellow?" passed
timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt
python -c "
import transformers
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
prompt = '69'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
if -1 == idx:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo 69 passed
timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt
python -c "
import transformers
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
prompt = 'Hi'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
if -1 == idx:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo "Hi" passed
timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt
python -c "
import transformers
with open('pred.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
prompt = 'return 0'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
if -1 == idx:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo "return 0" passed
timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt
python -c "
import transformers
with open('pred.txt', 'r', errors='ignore') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
prompt = '你好! 你好嗎?'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref.replace('�', ''))
if -1 == idx:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo "你好! 你好嗎?" passed
timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" "return 0" "你好! 你好嗎?" > ./pred.txt
python -c "
import transformers
with open('pred.txt', 'r', errors='ignore') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
prompts = [
'Why is the Sun yellow?',
'return 0',
'你好! 你好嗎?'
]
for prompt in prompts:
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref.replace('�', ''))
if -1 == idx:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo "Multi prompt" passed
cpp-greedy_causal_lm-windows:
runs-on: windows-latest
env:
PYTHONIOENCODING: "utf8"
defaults:
run:
shell: cmd
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.9
- run: curl --output ov.zip ${{ env.w_ov_link }}
- run: unzip -d ov ov.zip
- run: dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}"
shell: bash
- name: Build app
run: |
call .\ov\setupvars.bat
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert model
run: |
call .\ov\setupvars.bat
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
curl -o adapter_model.safetensors -s -L https://huggingface.co/smangrul/tinyllama_lora_sql/resolve/main/adapter_model.safetensors?download=true
- run: >
set PATH=.\build\openvino_genai\;%PATH%
&& call .\ov\setupvars.bat
&& .\build\samples\cpp\text_generation\Release\greedy_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\cpp.txt
- run: |
echo import transformers > ref.py
echo predictions = open('cpp.txt', 'r').read() >> ref.py
echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py
echo prompt = '69' >> ref.py
echo if tokenizer.chat_template: >> ref.py
echo prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) >> ref.py
echo tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) >> ref.py
echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py
echo ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py
echo idx = predictions.find(ref) >> ref.py
echo if -1 == idx: >> ref.py
echo raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py
echo predictions = predictions[:idx] + predictions[idx + len(ref):] >> ref.py
- run: python ref.py
- run: >
set PATH=.\build\openvino_genai\;%PATH%
&& set "PYTHONPATH=./build/"
&& call .\ov\setupvars.bat
&& python samples\python\text_generation\greedy_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\py.txt
- run: fc .\cpp.txt .\py.txt
- run: >
set PATH=.\build\openvino_genai\;%PATH%
&& set "PYTHONPATH=./build/"
&& call .\ov\setupvars.bat
&& python samples\python\text_generation\lora.py .\TinyLlama\TinyLlama-1.1B-intermediate-step-1431k-3T\ adapter_model.safetensors "How to create a table with two columns, one of them has type float, another one has type int?"
cpp-greedy_causal_lm-Qwen-7B-Chat:
runs-on: ubuntu-20.04-16-cores
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat
- run: >
. ./ov/setupvars.sh
&& timeout 2m ./build/samples/cpp/text_generation/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 2m samples/python/text_generation/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) -
env:
PYTHONPATH: "./build"
cpp-beam_search_causal_lm-Qwen1_5-7B-Chat:
runs-on: ubuntu-20.04-16-cores
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.12
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat
- run: >
. ./ov/setupvars.sh
&& timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!"
| diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") -
env:
PYTHONPATH: "./build"
cpp-beam_search_causal_lm-Phi-2:
runs-on: ubuntu-20.04-16-cores
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2
- run: >
. ./ov/setupvars.sh
&& timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./phi-2/ 69
| diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./phi-2/ 69) -
env:
PYTHONPATH: "./build"
cpp-beam_search_causal_lm-notus-7b-v1:
runs-on: ubuntu-20.04-16-cores
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1
- run: >
. ./ov/setupvars.sh
&& timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./notus-7b-v1/ 69
| diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./notus-7b-v1/ 69) -
env:
PYTHONPATH: "./build"
cpp-speculative_decoding_lm-ubuntu:
runs-on: ubuntu-20.04-16-cores
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b
- name: run and compare
run: |
source ./ov/setupvars.sh
./build/samples/cpp/text_generation/speculative_decoding_lm ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_speculative.txt
./build/samples/cpp/text_generation/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt
python ./samples/python/text_generation/speculative_decoding_lm.py ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_py.txt
python -c "
with open('predictions_greedy.txt', 'r') as f:
predicted_greedy = f.readline()
with open('predictions_speculative.txt', 'r') as f:
predicted_speculative = f.readline()
with open('predictions_py.txt', 'r') as f:
predicted_py = f.readline()
assert predicted_greedy == predicted_speculative
assert predicted_greedy == predicted_py
assert predicted_speculative == predicted_py
"
echo "Alan Turing was a" passed
env:
PYTHONPATH: "./build/:$PYTHONPATH"
LD_LIBRARY_PATH: "./build/openvino_genai/:$LD_LIBRARY_PATH"
cpp-prompt_lookup_decoding_lm-ubuntu:
runs-on: ubuntu-20.04-16-cores
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.12
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
- name: run and compare
run: |
source ./ov/setupvars.sh
echo 'Code:```python
def add(a, b):
return a + b
```
Question: Can you please add 2 and 3
A:' > ./prompt.txt
./build/samples/cpp/text_generation/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_prompt_lookup.txt
./build/samples/cpp/text_generation/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_greedy.txt
python ./samples/python/text_generation/prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_py.txt
python -c "
with open('predictions_greedy.txt', 'r') as f:
predicted_greedy = f.readline()
with open('predictions_prompt_lookup.txt', 'r') as f:
predicted_prompt_lookup = f.readline()
with open('predictions_py.txt', 'r') as f:
predicted_prompt_lookup_py = f.readline()
assert predicted_greedy == predicted_prompt_lookup
assert predicted_greedy == predicted_prompt_lookup_py
assert predicted_prompt_lookup == predicted_prompt_lookup_py
"
echo "Prompt lookup" passed
env:
PYTHONPATH: "./build/:$PYTHONPATH"
LD_LIBRARY_PATH: "./build/openvino_genai/:$LD_LIBRARY_PATH"
cpp-Phi-1_5:
runs-on: ubuntu-20.04-16-cores
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5
- name: Run Generation
run: |
source ./ov/setupvars.sh
timeout 50s ./build/samples/cpp/text_generation/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt
- name: Compare
run: |
python -c "
import transformers
with open('pred_greedy.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
prompt = 'Alan Turing was a'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
if -1 == idx:
raise RuntimeError(f'Missing "{ref=}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo Phi-1_5 passed
- run: >
. ./ov/setupvars.sh
&& timeout 50s samples/python/text_generation/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a"
| diff ./pred_greedy.txt -
env:
PYTHONPATH: "./build"
cpp-greedy_causal_lm-redpajama-3b-chat:
runs-on: ubuntu-20.04-8-cores
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat
- name: Run Generation
run: |
source ./ov/setupvars.sh
timeout 50s ./build/samples/cpp/text_generation/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt
- name: Compare
run: |
python -c "
import transformers
with open('pred_greedy.txt', 'r') as file:
predictions = file.read()
tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat')
prompt = 'Alan Turing was a'
if tokenizer.chat_template:
prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False):
ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
idx = predictions.find(ref)
if -1 == idx:
raise RuntimeError(f'Missing "{ref}" from predictions')
predictions = predictions[:idx] + predictions[idx + len(ref):]
"
echo "Alan Turing was a" passed
- run: >
. ./ov/setupvars.sh
&& timeout 50s samples/python/text_generation/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a"
| diff ./pred_greedy.txt -
env:
PYTHONPATH: "./build"
cpp-chat_sample-ubuntu:
runs-on: ubuntu-24.04
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
- name: Compare
env:
PYTHONPATH: "./build"
run: |
source ./ov/setupvars.sh
printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\n' > ./input.txt
timeout 30s ./build/samples/cpp/text_generation/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt
python -c "
from transformers import AutoTokenizer, AutoModelForCausalLM
model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
prompts = ['What is 2 + 2?', 'What is the previous answer?', 'Add 1 to it.', 'Subtract 5 from it.', 'Why is the sun yellow?', 'What was my first question?']
def gen_prompt(prompt):
return {'role': 'user', 'content': prompt}
def gen_answer(answer):
return {'role': 'assistant', 'content': answer}
chat_history = []
chat_prompt = ''
output = open('ref.txt', 'w')
for prompt in prompts:
output.write('question:\n')
chat_history.append(gen_prompt(prompt))
chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
answer = model.generate(**tokenized, max_length=1000, do_sample=False)
answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
chat_history.append(gen_answer(answer_str))
output.write(answer_str)
output.write('\n----------\n')
output.write('question:\n')
output.close()
"
diff pred.txt ref.txt
echo "Chat sample cpp" passed
timeout 30s ./samples/python/text_generation/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt
diff pred2.txt ref.txt
echo "Chat sample python" passed
visual_language_chat_sample-ubuntu-minicpm_v2_6:
runs-on: ubuntu-22.04-16-cores
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.11
- uses: ./.github/actions/install_openvino
with:
ov_link: ${{ env.l_u22_ov_link }}
- uses: ./.github/actions/build_app
with:
build_target: 'visual_language_chat benchmark_vlm py_openvino_genai'
- uses: ./.github/actions/install_python_deps
- name: Download and convert tiny-random-minicpmv-2_6 model and an image
run: |
source ./ov/setupvars.sh
optimum-cli export openvino -m katuni4ka/tiny-random-minicpmv-2_6 tiny-random-minicpmv-2_6 --trust-remote-code --task image-text-to-text
mkdir images
- name: Generate images - tiny-random-minicpmv-2_6
shell: python
run: |
from PIL import Image
import numpy as np
import requests
res = 28, 28
lines = np.arange(res[0] * res[1] * 3, dtype=np.uint8) % 255
lines = lines.reshape([*res, 3])
lines_image = Image.fromarray(lines)
lines_image.save("images/lines.png")
cat = Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw).convert('RGB')
cat.save("images/cat.png")
- name: Run visual_language_chat C++ sample - tiny-random-minicpmv-2_6
run: >
set -o pipefail
&& source ./ov/setupvars.sh
&& ./build/samples/cpp/visual_language_chat/visual_language_chat ./tiny-random-minicpmv-2_6/ ./images/
<<< $'Describe the images?' | tee cpp.txt
timeout-minutes: 2
- name: Run benchmark_vlm C++ sample - tiny-random-minicpmv-2_6
run: >
set -o pipefail
&& source ./ov/setupvars.sh
&& ./build/samples/cpp/visual_language_chat/benchmark_vlm -m ./tiny-random-minicpmv-2_6/ -i ./images/cat.png -n 3
timeout-minutes: 2
- name: Run visual_language_chat Python sample - tiny-random-minicpmv-2_6
run: >
set -o pipefail
&& source ./ov/setupvars.sh
&& ./samples/python/visual_language_chat/visual_language_chat.py ./tiny-random-minicpmv-2_6/ ./images/
<<< $'Describe the images?' | tee py.txt
env:
PYTHONPATH: "./build/"
- name: Run benchmark_vlm Python sample - tiny-random-minicpmv-2_6
run: >
set -o pipefail
&& source ./ov/setupvars.sh
&& ./samples/python/visual_language_chat/benchmark_vlm.py -m ./tiny-random-minicpmv-2_6/ -i ./images/cat.png -n 3
env:
PYTHONPATH: "./build/"
- name: Encode cpp.txt with Python encoding instead of terminal one
shell: python
run: |
with open("cpp.txt", "rb") as f:
content = f.read().decode("utf-8", "replace")
with open("cpp.txt", "wb") as f:
f.write(content.encode("utf-8"))
- run: diff cpp.txt py.txt
- name: Run visual_language_chat C++ sample with 2 prompts - tiny-random-minicpmv-2_6
run: >
set -o pipefail
&& source ./ov/setupvars.sh
&& ./build/samples/cpp/visual_language_chat/visual_language_chat ./tiny-random-minicpmv-2_6/ ./images/cat.png
<<< $'What is unusual on this image?\nGo on.' | tee cpp2.txt
timeout-minutes: 2
- name: Run visual_language_chat Python sample with 2 prompts - tiny-random-minicpmv-2_6
run: >
set -o pipefail
&& source ./ov/setupvars.sh
&& ./samples/python/visual_language_chat/visual_language_chat.py ./tiny-random-minicpmv-2_6/ ./images/cat.png
<<< $'What is unusual on this image?\nGo on.' | tee py2.txt
env:
PYTHONPATH: "./build/"
- name: Encode cpp2.txt with Python encoding instead of terminal one
shell: python
run: |
with open("cpp2.txt", "rb") as f:
content = f.read().decode("utf-8", "replace")
with open("cpp2.txt", "wb") as f:
f.write(content.encode("utf-8"))
- run: diff cpp2.txt py2.txt
visual_language_chat_sample-ubuntu-llava_1_5:
uses: ./.github/workflows/job_vlm_sample_llava.yml
with:
model_id: llava-hf/llava-1.5-7b-hf
model_dir: llava_1_5_7b_ov
visual_language_chat_sample-ubuntu-llava_next:
uses: ./.github/workflows/job_vlm_sample_llava.yml
with:
model_id: llava-hf/llava-v1.6-mistral-7b-hf
model_dir: llava_v1_6_mistral_7b_ov
visual_language_chat_sample-ubuntu-internvl2:
runs-on: ubuntu-22.04-16-cores
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.11
- uses: ./.github/actions/install_openvino
with:
ov_link: ${{ env.l_u22_ov_link }}
- uses: ./.github/actions/build_app
with:
build_target: 'visual_language_chat py_openvino_genai'
- uses: ./.github/actions/install_python_deps
- name: Download and convert InternVL2 model
run: |
# Lowering transformers version, workaround for https://huggingface.co/OpenGVLab/InternVL2-1B/discussions/7
python -m pip install -U "transformers<4.45.0"
source ./ov/setupvars.sh
optimum-cli export openvino --model OpenGVLab/InternVL2-4B ./internvl2_4b_ov/ --trust-remote-code
- name: Download images
run: |
wget https://llava-vl.github.io/static/images/monalisa.jpg
- name: Run visual_language_chat C++ sample - InternVL2
run: >
source ./ov/setupvars.sh
&& ./build/samples/cpp/visual_language_chat/visual_language_chat ./internvl2_4b_ov/ monalisa.jpg
<<< $'Who drew this painting?\nWhen did the painter live?'
timeout-minutes: 4
visual_language_chat_sample-ubuntu-qwen2vl:
runs-on: ubuntu-22.04-16-cores
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.11
- uses: ./.github/actions/install_openvino
with:
ov_link: ${{ env.l_u22_ov_link }}
- uses: ./.github/actions/build_app
with:
build_target: 'visual_language_chat py_openvino_genai'
- uses: ./.github/actions/install_python_deps
- name: Download and convert Qwen2VL model
run: |
source ./ov/setupvars.sh
optimum-cli export openvino --model Qwen/Qwen2-VL-2B-Instruct ./qwen2_vl_2b_ov/ --trust-remote-code
- name: Download images
run: |
wget https://llava-vl.github.io/static/images/monalisa.jpg
- name: Run visual_language_chat C++ sample - Qwen2VL
run: >
source ./ov/setupvars.sh
&& ./build/samples/cpp/visual_language_chat/visual_language_chat ./qwen2_vl_2b_ov/ monalisa.jpg
<<< $'Who drew this painting?\nWhen did the painter live?'
timeout-minutes: 4
cpp-continuous-batching-ubuntu:
runs-on: ubuntu-20.04-8-cores
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.12
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
- name: Run gtests
run: |
source ./ov/setupvars.sh
./build/tests/cpp/tests_continuous_batching
- name: Run accuracy_sample
run: |
source ./ov/setupvars.sh
timeout --verbose 50s ./build/tools/continuous_batching/accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5
- name: Run throughput_benchmark
run: |
wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
source ./ov/setupvars.sh
timeout --verbose 200s ./build/tools/continuous_batching/benchmark/continuous_batching_benchmark -n 10 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1
timeout --verbose 200s ./build/tools/continuous_batching/benchmark/continuous_batching_benchmark -n 10 --dynamic_split_fuse --max_batch_size 256 --max_input_len 256 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1
cpp-continuous-batching-windows:
runs-on: windows-latest
env:
PYTHONIOENCODING: "utf8"
defaults:
run:
shell: cmd
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install OpenVINO
run: |
curl --output ov.zip ${{ env.w_ov_link }}
unzip -d ov ov.zip
dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}"
shell: bash
- name: Build app
run: |
call .\ov\setupvars.bat
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
call .\ov\setupvars.bat
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
- name: Run gtests
run: |
set PATH=.\build\openvino_genai\;%PATH%
call .\ov\setupvars.bat
.\build\tests\cpp\Release\tests_continuous_batching.exe
- name: Run accuracy_sample
run: |
set PATH=.\build\openvino_genai\;%PATH%
call .\ov\setupvars.bat
.\build\tools\continuous_batching\accuracy\Release\continuous_batching_accuracy.exe -m .\TinyLlama-1.1B-Chat-v1.0\ -n 5
- name: Run throughput_benchmark
run: |
curl -o .\ShareGPT_V3_unfiltered_cleaned_split.json -s -L "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
set PATH=.\build\openvino_genai\;%PATH%
call .\ov\setupvars.bat
.\build\tools\continuous_batching\benchmark\Release\continuous_batching_benchmark.exe -n 2 -m .\TinyLlama-1.1B-Chat-v1.0\ --dataset .\ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1
cpp-continuous-batching-macos:
runs-on: macos-13
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
brew install coreutils scons
- name: Build app
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- name: Download and convert and model
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install -r ./samples/requirements.txt
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
- name: Run gtests
run: |
source ./ov/setupvars.sh
./build/tests/cpp/tests_continuous_batching
- name: Run accuracy_sample
run: |
source ./ov/setupvars.sh
timeout --verbose 120s ./build/tools/continuous_batching/accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5
- name: Run throughput_benchmark
run: |
wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
source ./ov/setupvars.sh
./build/tools/continuous_batching/benchmark/continuous_batching_benchmark -n 5 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1
Overall_Status:
name: ci/gha_overall_status_causal_lm
needs: [cpp-multinomial-greedy_causal_lm-ubuntu, cpp-beam_search_causal_lm-ubuntu, cpp-greedy_causal_lm-windows,
cpp-greedy_causal_lm-Qwen-7B-Chat, cpp-beam_search_causal_lm-Qwen1_5-7B-Chat, cpp-beam_search_causal_lm-Phi-2,
cpp-beam_search_causal_lm-notus-7b-v1, cpp-speculative_decoding_lm-ubuntu, cpp-prompt_lookup_decoding_lm-ubuntu,
cpp-Phi-1_5, cpp-greedy_causal_lm-redpajama-3b-chat, cpp-chat_sample-ubuntu, cpp-continuous-batching-ubuntu,
visual_language_chat_sample-ubuntu-minicpm_v2_6, visual_language_chat_sample-ubuntu-llava_1_5, visual_language_chat_sample-ubuntu-llava_next, visual_language_chat_sample-ubuntu-internvl2,
cpp-continuous-batching-windows, cpp-continuous-batching-macos]
if: ${{ always() }}
runs-on: ubuntu-latest
steps:
- name: Check status of all jobs
if: >-
${{
contains(needs.*.result, 'failure') ||
contains(needs.*.result, 'cancelled')
}}
run: exit 1