Initial support for AIU (#17)
Initial support for AIU in vLLM.

What is currently supported and tested:
- Single AIU
- Model: llama-7b-chat
- Offline inference (batch size 1; a minimal usage sketch follows this list)
- Online inference (with `max-num-seq=1`)
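
For reference, a minimal offline-inference sketch against a single AIU, mirroring the bundled `examples/offline_inference_sendnn.ipynb` notebook (the model path, `block_size`, and prompt are simply the values used there):

```python
from vllm import LLM, SamplingParams

# device="sendnn" routes execution to the AIU backend added in this change.
llm = LLM(
    model="/tmp/7B-F",
    tokenizer="/tmp/7B-F",
    max_model_len=2048,
    block_size=2048,
    device="sendnn",
)

# Greedy decoding, batch size 1 (the configuration tested so far).
sampling_params = SamplingParams(max_tokens=10, temperature=0.0)
outputs = llm.generate(
    ["Provide a list of instructions for preparing chicken soup for a family of four."],
    sampling_params,
)
print(outputs[0].outputs[0].text)
```

The first couple of `generate()` calls act as warm-up while the graphs compile; steady-state per-token latency is reached afterwards (see the notebook below).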

---------

Signed-off-by: Nikolaos Papandreou <npo@zurich.ibm.com>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Nikolaos Papandreou <npo@zurich.ibm.com>
Co-authored-by: TRAVIS JOHNSON <tsjohnso@us.ibm.com>
3 people authored and GitHub Enterprise committed Jul 26, 2024
1 parent a43b10b commit 44681e4
Showing 18 changed files with 1,549 additions and 11 deletions.
367 changes: 367 additions & 0 deletions examples/offline_inference_sendnn.ipynb
@@ -0,0 +1,367 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The wurlitzer extension is already loaded. To reload it, use:\n",
" %reload_ext wurlitzer\n"
]
}
],
"source": [
"from vllm import LLM, SamplingParams\n",
"import time\n",
"%load_ext wurlitzer"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"\n",
"with open(\"/etc/aiu/senlib_config.json\", 'rb') as f:\n",
" config = json.load(f)\n",
"\n",
"os.environ[\"AIU_CONFIG_FILE_0\"] = \"/etc/aiu/senlib_config.json\"\n",
"os.environ[\"FLEX_RDMA_PCI_BUS_ADDR_0\"] = config[\"GENERAL\"][\"sen_bus_id\"][0]\n",
"os.environ[\"AIU_WORLD_RANK_0\"] = \"0\""
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 06-27 12:28:18 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='/tmp/7B-F', speculative_config=None, tokenizer='/tmp/7B-F', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cpu, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=/tmp/7B-F)\n",
">> DEBUG SETUP\n",
"0 / 1 : Python Version : 3.9.18\n",
"0 / 1 : PyTorch Version: 2.2.2+cpu\n",
"0 / 1 : PCI Addr Rank 0 AIU_WORLD_RANK_0=0\n",
"0 / 1 : PCI Addr Rank 0 FLEX_RDMA_PCI_BUS_ADDR_0=0000:1d:00.0\n",
"0 / 1 : FLEX_COMPUTE=SENTIENT\n",
"0 / 1 : FLEX_DEVICE=VFIO\n",
"0 / 1 : DEEPRT_EXPORT_DIR=export/0\n",
"0 / 1 : DTCOMPILER_EXPORT_DIR=export/0\n",
"0 / 1 : AIU_CONFIG_FILE_0=/etc/aiu/senlib_config.json\n",
"0 / 1 : SENLIB_DEVEL_CONFIG_FILE=/etc/aiu/senlib_config.json\n",
"0 / 1 : FLEX_RDMA_PCI_BUS_ADDR_0=0000:1d:00.0\n",
"0 / 1 : FLEX_RDMA_LOCAL_RANK=0\n",
"0 / 1 : FLEX_RDMA_LOCAL_SIZE=1\n",
"0 / 1 : FLEX_RDMA_WORLD_RANK=0\n",
"0 / 1 : FLEX_RDMA_WORLD_SIZE=1\n",
"0 / 1 : Sentient AIU: Enabled (0) (offset=0)\n",
"0 / 1 : Dynamo Backend : sendnn_decoder\n",
"0 / 1 : CPU Cores : 56 x 2 HW threads\n",
"------------------------------------------------------------\n",
"NOTICE: Adjusting torch._dynamo.config.accumulated_cache_size_limit from 64 to 160 to accomidate prompt size of 64 and decode tokens of 20\n",
"NOTICE: Adjusting torch._dynamo.config.cache_size_limit from 8 to 160 to accomidate prompt size of 64 and decode tokens of 20\n"
]
}
],
"source": [
"# Create an LLM.\n",
"llm = LLM(\n",
" model=\"/tmp/7B-F\",\n",
" tokenizer=\"/tmp/7B-F\",\n",
" max_model_len=2048,\n",
" block_size=2048,\n",
" device=\"sendnn\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Below is an instruction that describes a task. Write a response that appropriately completes the request. Be polite in your response to the user.\\n\\n### Instruction:\\nProvide a list of instructions for preparing chicken soup for a family of four.\\n\\n### Response:']\n"
]
}
],
"source": [
"# Sample prompts.\n",
"template = \"Below is an instruction that describes a task. Write a response that appropriately completes the request. Be polite in your response to the user.\\n\\n### Instruction:\\n{}\\n\\n### Response:\"\n",
"prompt1 = template.format(\n",
" \"Provide a list of instructions for preparing chicken soup for a family of four.\"\n",
")\n",
"prompts = [\n",
" prompt1,\n",
"]\n",
"print(prompts)\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Create a sampling params object.\n",
"max_tokens = 10\n",
"sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=============== WARM UP 1\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 12782.44ms\n",
"[sendnn_model_runner:execute_model] t_token: 13177.26ms\n",
"[sendnn_model_runner:execute_model] t_token: 18823.05ms\n",
"[sendnn_model_runner:execute_model] t_token: 15443.51ms\n",
"[sendnn_model_runner:execute_model] t_token: 13228.93ms\n",
"[sendnn_model_runner:execute_model] t_token: 13462.87ms\n",
"[sendnn_model_runner:execute_model] t_token: 13535.42ms\n",
"[sendnn_model_runner:execute_model] t_token: 12860.41ms\n",
"[sendnn_model_runner:execute_model] t_token: 13707.36ms\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|██████████| 1/1 [02:19<00:00, 139.99s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 12949.60ms\n",
"Time elaspsed for 10 tokens is 139.99 sec\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(\"=============== WARM UP 1\")\n",
"t0 = time.time()\n",
"outputs = llm.generate(prompts, sampling_params)\n",
"print(\"Time elaspsed for %d tokens is %.2f sec\" % (max_tokens, time.time()-t0))\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=============== WARM UP 2\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 795800.62ms\n",
"[sendnn_model_runner:execute_model] t_token: 166.80ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.47ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.03ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.57ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.09ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.19ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.40ms\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|██████████| 1/1 [13:17<00:00, 797.29s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 164.07ms\n",
"[sendnn_model_runner:execute_model] t_token: 163.94ms\n",
"Time elaspsed for 10 tokens is 797.29 sec\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(\"=============== WARM UP 2\")\n",
"t0 = time.time()\n",
"outputs = llm.generate(prompts, sampling_params)\n",
"print(\"Time elaspsed for %d tokens is %.2f sec\" % (max_tokens, time.time()-t0))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=============== GENERATE\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 206.45ms\n",
"[sendnn_model_runner:execute_model] t_token: 154.32ms\n",
"[sendnn_model_runner:execute_model] t_token: 154.72ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.83ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.82ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.44ms\n",
"[sendnn_model_runner:execute_model] t_token: 154.03ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.05ms\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|██████████| 1/1 [00:01<00:00, 1.60s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 153.79ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.72ms\n",
"[sendnn_model_runner:execute_model] t_token: 3.13ms\n",
"Time elaspsed for 10 tokens is 1.60 sec\n",
"Prompt: 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Be polite in your response to the user.\\n\\n### Instruction:\\nProvide a list of instructions for preparing chicken soup for a family of four.\\n\\n### Response:', Generated text: '\\nOf course! Here are the steps to prepare prepare'\n",
"CompletionOutput(index=0, text='\\nOf course! Here are the steps to prepare prepare', token_ids=[13, 2776, 3236, 29991, 2266, 526, 278, 6576, 304, 19012, 19012], cumulative_logprob=-11.722966461020405, logprobs=None, finish_reason=length, stop_reason=None)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(\"=============== GENERATE\")\n",
"sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)\n",
"t0 = time.time()\n",
"outputs = llm.generate(prompts, sampling_params)\n",
"print(\"Time elaspsed for %d tokens is %.2f sec\" % (max_tokens, time.time()-t0))\n",
"for output in outputs:\n",
" prompt = output.prompt\n",
" generated_text = output.outputs[0].text\n",
" print(f\"Prompt: {prompt!r}, Generated text: {generated_text!r}\")\n",
"print(output.outputs[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}