Initial support for AIU in vLLM. What is currently supported/tested:

- Single AIU
- Model: llama-7b-chat
- Offline inference (batch size 1)
- Online inference (with `max-num-seq=1`)

---------

Signed-off-by: Nikolaos Papandreou <npo@zurich.ibm.com>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Nikolaos Papandreou <npo@zurich.ibm.com>
Co-authored-by: TRAVIS JOHNSON <tsjohnso@us.ibm.com>
1 parent a43b10b · commit 44681e4 · 18 changed files with 1,549 additions and 11 deletions.
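For context, the offline path exercised by the example notebook below reduces to the following sketch. It is a minimal, untested condensation of the notebook's cells; the checkpoint path `/tmp/7B-F`, the AIU config file `/etc/aiu/senlib_config.json`, and the environment variables are copied from the example and will need to be adapted to your environment.

```python
# Minimal single-AIU offline-inference sketch, condensed from the notebook in this commit.
# Paths and environment variables are taken from the example; adjust them for your setup.
import json
import os

from vllm import LLM, SamplingParams

# Point the runtime at a single AIU device (rank 0), as the notebook does.
with open("/etc/aiu/senlib_config.json", "rb") as f:
    config = json.load(f)
os.environ["AIU_CONFIG_FILE_0"] = "/etc/aiu/senlib_config.json"
os.environ["FLEX_RDMA_PCI_BUS_ADDR_0"] = config["GENERAL"]["sen_bus_id"][0]
os.environ["AIU_WORLD_RANK_0"] = "0"

# Create the engine on the sendnn device; batch size 1 (a single prompt per call).
llm = LLM(
    model="/tmp/7B-F",
    tokenizer="/tmp/7B-F",
    max_model_len=2048,
    block_size=2048,
    device="sendnn",
)

# Greedy decoding of a handful of tokens, as in the notebook's GENERATE cell.
sampling_params = SamplingParams(max_tokens=10, temperature=0.0)
outputs = llm.generate(
    ["Provide a list of instructions for preparing chicken soup for a family of four."],
    sampling_params,
)
print(outputs[0].outputs[0].text)
```

Note that the first one or two `generate()` calls trigger compilation and are very slow (the notebook's two warm-up cells); steady-state decode settles at roughly 150-165 ms per token in the captured output. The online path is presumably exercised through vLLM's OpenAI-compatible API server started with `--max-num-seqs 1`, matching the `max-num-seq=1` constraint in the commit message; the notebook itself only covers the offline case.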
@@ -0,0 +1,367 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The wurlitzer extension is already loaded. To reload it, use:\n",
" %reload_ext wurlitzer\n"
]
}
],
"source": [
"from vllm import LLM, SamplingParams\n",
"import time\n",
"%load_ext wurlitzer"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"\n",
"with open(\"/etc/aiu/senlib_config.json\", 'rb') as f:\n",
" config = json.load(f)\n",
"\n",
"os.environ[\"AIU_CONFIG_FILE_0\"] = \"/etc/aiu/senlib_config.json\"\n",
"os.environ[\"FLEX_RDMA_PCI_BUS_ADDR_0\"] = config[\"GENERAL\"][\"sen_bus_id\"][0]\n",
"os.environ[\"AIU_WORLD_RANK_0\"] = \"0\""
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 06-27 12:28:18 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='/tmp/7B-F', speculative_config=None, tokenizer='/tmp/7B-F', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cpu, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=/tmp/7B-F)\n",
">> DEBUG SETUP\n",
"0 / 1 : Python Version : 3.9.18\n",
"0 / 1 : PyTorch Version: 2.2.2+cpu\n",
"0 / 1 : PCI Addr Rank 0 AIU_WORLD_RANK_0=0\n",
"0 / 1 : PCI Addr Rank 0 FLEX_RDMA_PCI_BUS_ADDR_0=0000:1d:00.0\n",
"0 / 1 : FLEX_COMPUTE=SENTIENT\n",
"0 / 1 : FLEX_DEVICE=VFIO\n",
"0 / 1 : DEEPRT_EXPORT_DIR=export/0\n",
"0 / 1 : DTCOMPILER_EXPORT_DIR=export/0\n",
"0 / 1 : AIU_CONFIG_FILE_0=/etc/aiu/senlib_config.json\n",
"0 / 1 : SENLIB_DEVEL_CONFIG_FILE=/etc/aiu/senlib_config.json\n",
"0 / 1 : FLEX_RDMA_PCI_BUS_ADDR_0=0000:1d:00.0\n",
"0 / 1 : FLEX_RDMA_LOCAL_RANK=0\n",
"0 / 1 : FLEX_RDMA_LOCAL_SIZE=1\n",
"0 / 1 : FLEX_RDMA_WORLD_RANK=0\n",
"0 / 1 : FLEX_RDMA_WORLD_SIZE=1\n",
"0 / 1 : Sentient AIU: Enabled (0) (offset=0)\n",
"0 / 1 : Dynamo Backend : sendnn_decoder\n",
"0 / 1 : CPU Cores : 56 x 2 HW threads\n",
"------------------------------------------------------------\n",
"NOTICE: Adjusting torch._dynamo.config.accumulated_cache_size_limit from 64 to 160 to accomidate prompt size of 64 and decode tokens of 20\n",
"NOTICE: Adjusting torch._dynamo.config.cache_size_limit from 8 to 160 to accomidate prompt size of 64 and decode tokens of 20\n"
]
}
],
"source": [
"# Create an LLM.\n",
"llm = LLM(\n",
" model=\"/tmp/7B-F\",\n",
" tokenizer=\"/tmp/7B-F\",\n",
" max_model_len=2048,\n",
" block_size=2048,\n",
" device=\"sendnn\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Below is an instruction that describes a task. Write a response that appropriately completes the request. Be polite in your response to the user.\\n\\n### Instruction:\\nProvide a list of instructions for preparing chicken soup for a family of four.\\n\\n### Response:']\n"
]
}
],
"source": [
"# Sample prompts.\n",
"template = \"Below is an instruction that describes a task. Write a response that appropriately completes the request. Be polite in your response to the user.\\n\\n### Instruction:\\n{}\\n\\n### Response:\"\n",
"prompt1 = template.format(\n",
" \"Provide a list of instructions for preparing chicken soup for a family of four.\"\n",
")\n",
"prompts = [\n",
" prompt1,\n",
"]\n",
"print(prompts)\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Create a sampling params object.\n",
"max_tokens = 10\n",
"sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=============== WARM UP 1\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 12782.44ms\n",
"[sendnn_model_runner:execute_model] t_token: 13177.26ms\n",
"[sendnn_model_runner:execute_model] t_token: 18823.05ms\n",
"[sendnn_model_runner:execute_model] t_token: 15443.51ms\n",
"[sendnn_model_runner:execute_model] t_token: 13228.93ms\n",
"[sendnn_model_runner:execute_model] t_token: 13462.87ms\n",
"[sendnn_model_runner:execute_model] t_token: 13535.42ms\n",
"[sendnn_model_runner:execute_model] t_token: 12860.41ms\n",
"[sendnn_model_runner:execute_model] t_token: 13707.36ms\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|██████████| 1/1 [02:19<00:00, 139.99s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 12949.60ms\n",
"Time elaspsed for 10 tokens is 139.99 sec\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(\"=============== WARM UP 1\")\n",
"t0 = time.time()\n",
"outputs = llm.generate(prompts, sampling_params)\n",
"print(\"Time elaspsed for %d tokens is %.2f sec\" % (max_tokens, time.time()-t0))\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=============== WARM UP 2\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 795800.62ms\n",
"[sendnn_model_runner:execute_model] t_token: 166.80ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.47ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.03ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.57ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.09ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.19ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.40ms\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|██████████| 1/1 [13:17<00:00, 797.29s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 164.07ms\n",
"[sendnn_model_runner:execute_model] t_token: 163.94ms\n",
"Time elaspsed for 10 tokens is 797.29 sec\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(\"=============== WARM UP 2\")\n",
"t0 = time.time()\n",
"outputs = llm.generate(prompts, sampling_params)\n",
"print(\"Time elaspsed for %d tokens is %.2f sec\" % (max_tokens, time.time()-t0))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=============== GENERATE\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 206.45ms\n",
"[sendnn_model_runner:execute_model] t_token: 154.32ms\n",
"[sendnn_model_runner:execute_model] t_token: 154.72ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.83ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.82ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.44ms\n",
"[sendnn_model_runner:execute_model] t_token: 154.03ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.05ms\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|██████████| 1/1 [00:01<00:00, 1.60s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 153.79ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.72ms\n",
"[sendnn_model_runner:execute_model] t_token: 3.13ms\n",
"Time elaspsed for 10 tokens is 1.60 sec\n",
"Prompt: 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Be polite in your response to the user.\\n\\n### Instruction:\\nProvide a list of instructions for preparing chicken soup for a family of four.\\n\\n### Response:', Generated text: '\\nOf course! Here are the steps to prepare prepare'\n",
"CompletionOutput(index=0, text='\\nOf course! Here are the steps to prepare prepare', token_ids=[13, 2776, 3236, 29991, 2266, 526, 278, 6576, 304, 19012, 19012], cumulative_logprob=-11.722966461020405, logprobs=None, finish_reason=length, stop_reason=None)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(\"=============== GENERATE\")\n",
"sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)\n",
"t0 = time.time()\n",
"outputs = llm.generate(prompts, sampling_params)\n",
"print(\"Time elaspsed for %d tokens is %.2f sec\" % (max_tokens, time.time()-t0))\n",
"for output in outputs:\n",
" prompt = output.prompt\n",
" generated_text = output.outputs[0].text\n",
" print(f\"Prompt: {prompt!r}, Generated text: {generated_text!r}\")\n",
"print(output.outputs[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}