Initial support for AIU in vLLM. What is currently supported/tested:

- Single AIU
- Model: llama-7b-chat
- Offline inference (batch size 1)
- Online inference (with `max-num-seq=1`)

---------

Signed-off-by: Nikolaos Papandreou <npo@zurich.ibm.com>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Nikolaos Papandreou <npo@zurich.ibm.com>
Co-authored-by: TRAVIS JOHNSON <tsjohnso@us.ibm.com>
1 parent a43b10b · commit 44681e4 · 18 changed files with 1,549 additions and 11 deletions.
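For context, the offline path exercised by the example notebook below reduces to the following sketch. It is a minimal, untested condensation of the notebook's cells; the checkpoint path `/tmp/7B-F`, the AIU config file `/etc/aiu/senlib_config.json`, and the environment variables are copied from the example and will need to be adapted to your environment.

```python
# Minimal single-AIU offline-inference sketch, condensed from the notebook in this commit.
# Paths and environment variables are taken from the example; adjust them for your setup.
import json
import os

from vllm import LLM, SamplingParams

# Point the runtime at a single AIU device (rank 0), as the notebook does.
with open("/etc/aiu/senlib_config.json", "rb") as f:
    config = json.load(f)
os.environ["AIU_CONFIG_FILE_0"] = "/etc/aiu/senlib_config.json"
os.environ["FLEX_RDMA_PCI_BUS_ADDR_0"] = config["GENERAL"]["sen_bus_id"][0]
os.environ["AIU_WORLD_RANK_0"] = "0"

# Create the engine on the sendnn device; batch size 1 (a single prompt per call).
llm = LLM(
    model="/tmp/7B-F",
    tokenizer="/tmp/7B-F",
    max_model_len=2048,
    block_size=2048,
    device="sendnn",
)

# Greedy decoding of a handful of tokens, as in the notebook's GENERATE cell.
sampling_params = SamplingParams(max_tokens=10, temperature=0.0)
outputs = llm.generate(
    ["Provide a list of instructions for preparing chicken soup for a family of four."],
    sampling_params,
)
print(outputs[0].outputs[0].text)
```

Note that the first one or two `generate()` calls trigger compilation and are very slow (the notebook's two warm-up cells); steady-state decode settles at roughly 150-165 ms per token in the captured output. The online path is presumably exercised through vLLM's OpenAI-compatible API server started with `--max-num-seqs 1`, matching the `max-num-seq=1` constraint in the commit message; the notebook itself only covers the offline case.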
@@ -0,0 +1,367 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The wurlitzer extension is already loaded. To reload it, use:\n",
" %reload_ext wurlitzer\n"
]
}
],
"source": [
"from vllm import LLM, SamplingParams\n",
"import time\n",
"%load_ext wurlitzer"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"\n",
"with open(\"/etc/aiu/senlib_config.json\", 'rb') as f:\n",
" config = json.load(f)\n",
"\n",
"os.environ[\"AIU_CONFIG_FILE_0\"] = \"/etc/aiu/senlib_config.json\"\n",
"os.environ[\"FLEX_RDMA_PCI_BUS_ADDR_0\"] = config[\"GENERAL\"][\"sen_bus_id\"][0]\n",
"os.environ[\"AIU_WORLD_RANK_0\"] = \"0\""
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO 06-27 12:28:18 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='/tmp/7B-F', speculative_config=None, tokenizer='/tmp/7B-F', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cpu, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=/tmp/7B-F)\n",
">> DEBUG SETUP\n",
"0 / 1 : Python Version : 3.9.18\n",
"0 / 1 : PyTorch Version: 2.2.2+cpu\n",
"0 / 1 : PCI Addr Rank 0 AIU_WORLD_RANK_0=0\n",
"0 / 1 : PCI Addr Rank 0 FLEX_RDMA_PCI_BUS_ADDR_0=0000:1d:00.0\n",
"0 / 1 : FLEX_COMPUTE=SENTIENT\n",
"0 / 1 : FLEX_DEVICE=VFIO\n",
"0 / 1 : DEEPRT_EXPORT_DIR=export/0\n",
"0 / 1 : DTCOMPILER_EXPORT_DIR=export/0\n",
"0 / 1 : AIU_CONFIG_FILE_0=/etc/aiu/senlib_config.json\n",
"0 / 1 : SENLIB_DEVEL_CONFIG_FILE=/etc/aiu/senlib_config.json\n",
"0 / 1 : FLEX_RDMA_PCI_BUS_ADDR_0=0000:1d:00.0\n",
"0 / 1 : FLEX_RDMA_LOCAL_RANK=0\n",
"0 / 1 : FLEX_RDMA_LOCAL_SIZE=1\n",
"0 / 1 : FLEX_RDMA_WORLD_RANK=0\n",
"0 / 1 : FLEX_RDMA_WORLD_SIZE=1\n",
"0 / 1 : Sentient AIU: Enabled (0) (offset=0)\n",
"0 / 1 : Dynamo Backend : sendnn_decoder\n",
"0 / 1 : CPU Cores : 56 x 2 HW threads\n",
"------------------------------------------------------------\n",
"NOTICE: Adjusting torch._dynamo.config.accumulated_cache_size_limit from 64 to 160 to accomidate prompt size of 64 and decode tokens of 20\n",
"NOTICE: Adjusting torch._dynamo.config.cache_size_limit from 8 to 160 to accomidate prompt size of 64 and decode tokens of 20\n"
]
}
],
"source": [
"# Create an LLM.\n",
"llm = LLM(\n",
" model=\"/tmp/7B-F\",\n",
" tokenizer=\"/tmp/7B-F\",\n",
" max_model_len=2048,\n",
" block_size=2048,\n",
" device=\"sendnn\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Below is an instruction that describes a task. Write a response that appropriately completes the request. Be polite in your response to the user.\\n\\n### Instruction:\\nProvide a list of instructions for preparing chicken soup for a family of four.\\n\\n### Response:']\n"
]
}
],
"source": [
"# Sample prompts.\n",
"template = \"Below is an instruction that describes a task. Write a response that appropriately completes the request. Be polite in your response to the user.\\n\\n### Instruction:\\n{}\\n\\n### Response:\"\n",
"prompt1 = template.format(\n",
" \"Provide a list of instructions for preparing chicken soup for a family of four.\"\n",
")\n",
"prompts = [\n",
" prompt1,\n",
"]\n",
"print(prompts)\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Create a sampling params object.\n",
"max_tokens = 10\n",
"sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=============== WARM UP 1\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 12782.44ms\n",
"[sendnn_model_runner:execute_model] t_token: 13177.26ms\n",
"[sendnn_model_runner:execute_model] t_token: 18823.05ms\n",
"[sendnn_model_runner:execute_model] t_token: 15443.51ms\n",
"[sendnn_model_runner:execute_model] t_token: 13228.93ms\n",
"[sendnn_model_runner:execute_model] t_token: 13462.87ms\n",
"[sendnn_model_runner:execute_model] t_token: 13535.42ms\n",
"[sendnn_model_runner:execute_model] t_token: 12860.41ms\n",
"[sendnn_model_runner:execute_model] t_token: 13707.36ms\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|██████████| 1/1 [02:19<00:00, 139.99s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 12949.60ms\n",
"Time elaspsed for 10 tokens is 139.99 sec\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(\"=============== WARM UP 1\")\n",
"t0 = time.time()\n",
"outputs = llm.generate(prompts, sampling_params)\n",
"print(\"Time elaspsed for %d tokens is %.2f sec\" % (max_tokens, time.time()-t0))\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=============== WARM UP 2\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 795800.62ms\n",
"[sendnn_model_runner:execute_model] t_token: 166.80ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.47ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.03ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.57ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.09ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.19ms\n",
"[sendnn_model_runner:execute_model] t_token: 164.40ms\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|██████████| 1/1 [13:17<00:00, 797.29s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 164.07ms\n",
"[sendnn_model_runner:execute_model] t_token: 163.94ms\n",
"Time elaspsed for 10 tokens is 797.29 sec\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(\"=============== WARM UP 2\")\n",
"t0 = time.time()\n",
"outputs = llm.generate(prompts, sampling_params)\n",
"print(\"Time elaspsed for %d tokens is %.2f sec\" % (max_tokens, time.time()-t0))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=============== GENERATE\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 206.45ms\n",
"[sendnn_model_runner:execute_model] t_token: 154.32ms\n",
"[sendnn_model_runner:execute_model] t_token: 154.72ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.83ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.82ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.44ms\n",
"[sendnn_model_runner:execute_model] t_token: 154.03ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.05ms\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Processed prompts: 100%|██████████| 1/1 [00:01<00:00, 1.60s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[sendnn_model_runner:execute_model] t_token: 153.79ms\n",
"[sendnn_model_runner:execute_model] t_token: 153.72ms\n",
"[sendnn_model_runner:execute_model] t_token: 3.13ms\n",
"Time elaspsed for 10 tokens is 1.60 sec\n",
"Prompt: 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Be polite in your response to the user.\\n\\n### Instruction:\\nProvide a list of instructions for preparing chicken soup for a family of four.\\n\\n### Response:', Generated text: '\\nOf course! Here are the steps to prepare prepare'\n",
"CompletionOutput(index=0, text='\\nOf course! Here are the steps to prepare prepare', token_ids=[13, 2776, 3236, 29991, 2266, 526, 278, 6576, 304, 19012, 19012], cumulative_logprob=-11.722966461020405, logprobs=None, finish_reason=length, stop_reason=None)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(\"=============== GENERATE\")\n",
"sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)\n",
"t0 = time.time()\n",
"outputs = llm.generate(prompts, sampling_params)\n",
"print(\"Time elaspsed for %d tokens is %.2f sec\" % (max_tokens, time.time()-t0))\n",
"for output in outputs:\n",
" prompt = output.prompt\n",
" generated_text = output.outputs[0].text\n",
" print(f\"Prompt: {prompt!r}, Generated text: {generated_text!r}\")\n",
"print(output.outputs[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}