From a38d7ee71637947b16f615631c42848e944cb0c8 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Fri, 9 Aug 2024 08:27:38 +0300 Subject: [PATCH 01/10] Enable LoRA support Squashed commit of the following: commit 7beaeba26f39ab753e7f6c531f0aa0042802bccb commit 549bffbf62cd888ec7a8f5e763d2da4c727813ec commit 2769fd8e01b50bf0d419e869e79285f9a0e80688 commit 1911f44035fa5f33bad159b61558d93df2308450 commit e154e3c1baa777026e2cd84513815854b058bc0c commit 220460d7bb630d0bf57bd6f99331c35e2a6dfcdb commit 1256be5d867ef001609f7e37a943ebd9e05de90e commit 03d6bc3eae7fa7ab8f0a37309465544bee86ceca commit 4b7468cf66819d8eaefc0f009540b04ba09accc3 commit b7d2d86be9c791717b2c7dae6b73299fa18323fc commit 712a7ed43add6989e6a60a1421d62ba37b43e68a commit 1ee15b4aff73e9971178906da0ae9b31b519dd09 commit 5c6a3122d34580438780cf2d3e3a05578316e8b4 commit ccb056987eff6cb9efd18439fa840430b13fa175 commit c10afb44c879218dfa9ef7de9f573d97c61a43c6 commit 6b3a0393e0695813990b1a92e77c7efbbbb65f9e commit 4ef5a6d76033087216aa8366521340e498983790 commit 301579d7ff1850c200fb0a4eb5f170b29fdd9d83 commit ed98772979a8954c386fc59a63857f622b95ba1a commit 55c82ba9d51063fbaebd8bac84908f198e18c1a1 commit d7dddc9188869020e271296b751ff348425e8cf8 commit 7cc2b99cc242fc91509da96dfa16cd91bffeb1d8 commit e120246f9ae6d72a8a9404360693c2efe13991ff --- examples/lora_inference_hpu.py | 44 ++++++ tests/conftest.py | 9 ++ tests/lora/conftest.py | 9 +- tests/lora/test_llama_hpu.py | 104 ++++++++++++++ tests/lora/test_lora_hpu.py | 223 +++++++++++++++++++++++++++++ tests/lora/test_multilora_hpu.py | 137 ++++++++++++++++++ tests/lora/utils.py | 11 +- vllm/executor/habana_executor.py | 23 +-- vllm/lora/layers.py | 57 +++++++- vllm/lora/models.py | 30 ++-- vllm/lora/punica.py | 2 +- vllm/utils.py | 5 + vllm/worker/habana_model_runner.py | 149 ++++++++++++++----- vllm/worker/habana_worker.py | 25 ++-- 14 files changed, 747 insertions(+), 81 deletions(-) create mode 100644 examples/lora_inference_hpu.py create mode 100644 tests/lora/test_llama_hpu.py create mode 100644 tests/lora/test_lora_hpu.py create mode 100644 tests/lora/test_multilora_hpu.py diff --git a/examples/lora_inference_hpu.py b/examples/lora_inference_hpu.py new file mode 100644 index 0000000000000..8c50f42febc7c --- /dev/null +++ b/examples/lora_inference_hpu.py @@ -0,0 +1,44 @@ +from huggingface_hub import snapshot_download +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest + +sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") + +llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True, max_num_seqs=2, dtype='bfloat16') + +sampling_params = SamplingParams( + temperature=0, + max_tokens=1024, + stop=["[/assistant]"] +) + +prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? 
[/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 +] + +expected_output = [ + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 +] + +outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) +) + +for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + match = expected_output[i] == generated_text + if not match: + print(f"Comparison failed for request_id::{i}\n\t[PROMPT]{prompt!r}\n\t[GENERATED]{generated_text!r}\n\t[EXPECTED]{expected_output[i]!r}") diff --git a/tests/conftest.py b/tests/conftest.py index 59510075b0063..ecc418b381cb1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -589,10 +589,19 @@ def caplog_vllm(temporary_enable_log_propagate, caplog): # because caplog depends on logs propagated to the root logger. 
yield caplog +def is_hpu(): + try: + import habana_frameworks.torch as htorch + return True + except: + return False @pytest.fixture(scope="session") def num_gpus_available(): """Get number of GPUs without initializing the CUDA context in current process.""" + if is_hpu(): + return torch.hpu.device_count() + return cuda_device_count_stateless() diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 0bcae5b0c96dc..fcf0b82e9d380 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -47,6 +47,12 @@ class ContextInfo(TypedDict): "context_length": "32k", }] +def is_hpu(): + try: + import habana_frameworks.torch as htorch + return True + except: + return False def cleanup(): destroy_model_parallel() @@ -54,7 +60,8 @@ def cleanup(): with contextlib.suppress(AssertionError): torch.distributed.destroy_process_group() gc.collect() - torch.cuda.empty_cache() + if not is_hpu(): + torch.cuda.empty_cache() ray.shutdown() diff --git a/tests/lora/test_llama_hpu.py b/tests/lora/test_llama_hpu.py new file mode 100644 index 0000000000000..4095c18d5317c --- /dev/null +++ b/tests/lora/test_llama_hpu.py @@ -0,0 +1,104 @@ +from typing import List + +import os +import pytest +import ray + +from multiprocessing import Process + +import vllm +from vllm.lora.request import LoRARequest + +from conftest import cleanup + +MODEL_PATH = "meta-llama/Llama-2-7b-hf" + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + ] + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=256, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. 
+ generated_texts: List[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +def _test_llama_lora(sql_lora_files, tp_size): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=tp_size) + + expected_no_lora_output = [ + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", # noqa: E501 + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 + ] + expected_lora_output = [ + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 + ] + + print("lora adapter created") + assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output + + print("lora 1") + assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output + + print("no lora") + assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output + + print("lora 2") + assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output + + print("removing lora") + cleanup() + + +def test_llama_lora_1x(sql_lora_files): + p = Process(target=_test_llama_lora, args=(sql_lora_files, 1)) + p.start() + p.join() + assert p.exitcode == 0, f"Results don't match with the reference" + + +def test_llama_lora_2x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_lora, args=(sql_lora_files, 2)) + p.start() + p.join() + assert p.exitcode == 0, f"Results don't match with the reference" + + +def test_llama_lora_4x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_lora, args=(sql_lora_files, 4)) + p.start() + p.join() + assert p.exitcode == 0, f"Results don't match with the reference" diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py new file mode 100644 index 0000000000000..442303b087415 --- /dev/null +++ b/tests/lora/test_lora_hpu.py @@ -0,0 +1,223 @@ +import pytest +import torch + +from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice + +from .utils import DummyLoRAManager + +TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] +QKV_TENSOR_SIZES = [ + (8192, 1024, 1024), + (8192 // 8, 1024 // 8, 1024 // 8), + (4096, 4096, 4096), + (4096 // 2, 4096 // 2, 4096 // 2), +] +BATCH_SIZES = [8, 32, 256] +RANKS = [8] +DTYPES = [torch.bfloat16] +TOLERANCES = { + torch.float16: (5e-3, 5e-3), + torch.bfloat16: (3e-2, 2e-2), +} +MAX_LORAS = 8 + 
+@pytest.mark.parametrize("m", TENSOR_SIZES) +@pytest.mark.parametrize("n", TENSOR_SIZES) +@pytest.mark.parametrize("k", BATCH_SIZES) +@pytest.mark.parametrize("rank", RANKS) +@pytest.mark.parametrize("dtype", DTYPES) +def test_apply_lora(m, n, k, rank, dtype) -> None: + manager = DummyLoRAManager() + + module_name = "module" + weight = torch.rand([m, n], device="hpu", dtype=dtype) + + manager.init_random_lora(module_name, weight, rank=rank) + lora = manager.get_module_lora(module_name) + + input = torch.rand(k, n, device="hpu", dtype=dtype) + expected = input @ lora.lora_a @ lora.lora_b * lora.scaling + + lora_a_stack = torch.zeros(MAX_LORAS+1, + 1, + lora.lora_a.shape[1], + lora.lora_a.shape[0], + device="hpu", + dtype=dtype) + lora_b_stack = torch.zeros(MAX_LORAS+1, + 1, + lora.lora_b.shape[1], + lora.lora_b.shape[0], + device="hpu", + dtype=dtype) + for i in range(MAX_LORAS): + lora_a_stack[i][0] = lora.lora_a.T + lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T + + output = torch.zeros(k, m, device="hpu", dtype=dtype) + _apply_lora( + input, lora_a_stack, lora_b_stack, + torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), + output) + rtol, atol = TOLERANCES[dtype] + assert torch.allclose(expected, output, rtol=rtol, atol=atol) + + output[:] = 0 + _apply_lora(input, lora_a_stack, lora_b_stack, + torch.full((len(input), ), -1, device="hpu"), output) + assert torch.allclose(torch.zeros_like(output), output) + + manager.reset_lora() + + +@pytest.mark.parametrize("m", TENSOR_SIZES) +@pytest.mark.parametrize("n", TENSOR_SIZES) +@pytest.mark.parametrize("k", BATCH_SIZES) +@pytest.mark.parametrize("rank", RANKS) +@pytest.mark.parametrize("dtype", DTYPES) +def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: + if m % 2 != 0: + pytest.skip("m must be divisible by 2") + if m // 2 not in TENSOR_SIZES: + pytest.skip("m//2 must be in TENSOR_SIZES") + + manager = DummyLoRAManager() + + module_name = "module" + weight = torch.rand([m // 2, n], device="hpu", dtype=dtype) + + manager.init_random_lora(module_name + "1", weight, rank=rank) + lora_1 = manager.get_module_lora(module_name + "1") + manager.init_random_lora(module_name + "2", weight, rank=rank) + lora_2 = manager.get_module_lora(module_name + "2") + + input = torch.rand(k, n, device="hpu", dtype=dtype) + expected = torch.cat([ + input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling, + input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling + ], + dim=1) + + lora_a_stacks = [ + torch.zeros(MAX_LORAS+1, + 1, + lora_1.lora_a.shape[1], + lora_1.lora_a.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + lora_b_stacks = [ + torch.zeros(MAX_LORAS+1, + 1, + lora_1.lora_b.shape[1], + lora_1.lora_b.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + for i in range(MAX_LORAS): + lora_a_stacks[0][i][0] = lora_1.lora_a.T + lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T + lora_a_stacks[1][i][0] = lora_2.lora_a.T + lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T + + output = torch.zeros(k, m, device="hpu", dtype=dtype) + _apply_lora_packed_nslice( + input, lora_a_stacks, lora_b_stacks, + torch.randint(0, + MAX_LORAS, (len(input), ), + device="hpu"), output, (m // 2, m // 2)) + + rtol, atol = TOLERANCES[dtype] + assert torch.allclose(expected, output, rtol=rtol, atol=atol) + + output[:] = 0 + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, + torch.full((len(input), ), -1, device="hpu"), + output, (m // 2, m // 2)) + assert torch.allclose(torch.zeros_like(output), output) 
+ + manager.reset_lora() + + +@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES) +@pytest.mark.parametrize("n", TENSOR_SIZES) +@pytest.mark.parametrize("k", BATCH_SIZES) +@pytest.mark.parametrize("rank", RANKS) +@pytest.mark.parametrize("dtype", DTYPES) +def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: + manager = DummyLoRAManager() + + module_name = "module" + weight_q = torch.empty(qkv[0], n, device="hpu", dtype=dtype) + weight_kv = torch.empty(qkv[1], n, device="hpu", dtype=dtype) + + manager.init_random_lora(module_name + "q", weight_q, rank=rank) + lora_q = manager.get_module_lora(module_name + "q") + manager.init_random_lora(module_name + "k", weight_kv, rank=rank) + lora_k = manager.get_module_lora(module_name + "k") + manager.init_random_lora(module_name + "v", weight_kv, rank=rank) + lora_v = manager.get_module_lora(module_name + "v") + + input = torch.rand(k, n, device="hpu", dtype=dtype) + expected = torch.cat([ + input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling, + input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling, + input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling + ], + dim=1) + + lora_a_stacks = [ + torch.zeros(MAX_LORAS+1, + 1, + lora_q.lora_a.shape[1], + lora_q.lora_a.shape[0], + device="hpu", + dtype=dtype) + ] + [ + torch.zeros(MAX_LORAS+1, + 1, + lora_k.lora_a.shape[1], + lora_k.lora_a.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + lora_b_stacks = [ + torch.zeros(MAX_LORAS+1, + 1, + lora_q.lora_b.shape[1], + lora_q.lora_b.shape[0], + device="hpu", + dtype=dtype) + ] + [ + torch.zeros(MAX_LORAS+1, + 1, + lora_k.lora_b.shape[1], + lora_k.lora_b.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + for i in range(MAX_LORAS): + lora_a_stacks[0][i][0] = lora_q.lora_a.T + lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T + lora_a_stacks[1][i][0] = lora_k.lora_a.T + lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T + lora_a_stacks[2][i][0] = lora_v.lora_a.T + lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T + + output = torch.zeros(k, sum(qkv), device="hpu", dtype=dtype) + _apply_lora_packed_nslice( + input, lora_a_stacks, lora_b_stacks, + torch.randint(0, + MAX_LORAS, (len(input), ), + device="hpu"), output, (qkv[0], qkv[1], qkv[2])) + + rtol, atol = TOLERANCES[dtype] + assert torch.allclose(expected, output, rtol=rtol, atol=atol) + + output[:] = 0 + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, + torch.full((len(input), ), -1, device="hpu"), + output, (qkv[0], qkv[1], qkv[2])) + assert torch.allclose(torch.zeros_like(output), output) + + manager.reset_lora() diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py new file mode 100644 index 0000000000000..f002a544ca7a2 --- /dev/null +++ b/tests/lora/test_multilora_hpu.py @@ -0,0 +1,137 @@ +from typing import List, Optional, Tuple + +from huggingface_hub import snapshot_download + +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.lora.request import LoRARequest + +from multiprocessing import Process + +import os + +def create_test_prompts( + lora_path: str +) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: + """Create a list of test prompts with their sampling parameters. + + 2 requests for base model, 4 requests for the LoRA. We define 2 + different LoRA adapters (using the same model for demo purposes). 
+ Since we also set `max_loras=1`, the expectation is that the requests + with the second LoRA adapter will be ran after all requests with the + first adapter have finished. + """ + # TODO Fix issues when enabling paramerters [presence_penalty=0.2, + # (n=3, best_of=3, use_beam_search=True)] in SamplingParams. + + return [ + ("A robot may not injure a human being", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), None), + ("To be or not to be,", + SamplingParams(temperature=0.8, + top_k=5, + max_tokens=128), None), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora2", 2, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ] + + +def process_requests(engine: LLMEngine, + test_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]]): + """Continuously process a list of prompts and handle the outputs.""" + request_id = 0 + result = {} + + while test_prompts or engine.has_unfinished_requests(): + if test_prompts: + prompt, sampling_params, lora_request = test_prompts.pop(0) + engine.add_request(str(request_id), + prompt, + sampling_params, + lora_request=lora_request) + request_id += 1 + + request_outputs: List[RequestOutput] = engine.step() + + for request_output in request_outputs: + if request_output.finished: + result[request_output.request_id] = request_output.outputs[0].text + return result + +# References from GPU with dtype=bfloat16 +expected_output = [ +" or, through inaction, allow a human being to come to harm.\nA robot must obey the orders given it by human beings except where such orders would conflict with the First Law.\nA robot must protect its own existence as long as such protection does not conflict with the First or Second Law.\nThe Three Laws of Robotics were created by Isaac Asimov in 1942. They are the foundation of robotics and artificial intelligence.\nThe Three Laws of Robotics are the foundation of robotics and artificial intelligence. 
They were created by Isaac Asimov in 194", +" that is the question.\nIt is the most famous line in all of Shakespeare\'s plays and one of the most famous in all of English Literature. The quote is from Hamlet, Prince of Denmark, Act III, Scene I. In this scene, the ghost of Hamlet\'s father appears to his son and asks him to avenge his death. The ghost tells Hamlet of the murder of the king and how he was done in by his brother, Claudius. Hamlet is distraught and confused by the revelation and the ghost asks Hamlet to \"Revenge", +" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", +" SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' ", +" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", +" SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' " +] + +def _test_llama_multilora(sql_lora_files, tp_size): + """Main function that sets up and runs the prompt processing.""" + engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf", + enable_lora=True, + max_loras=6, + max_lora_rank=8, + max_num_seqs=16, + dtype='bfloat16', + tensor_parallel_size=tp_size) + engine = LLMEngine.from_engine_args(engine_args) + test_prompts = create_test_prompts(sql_lora_files) + results = process_requests(engine, test_prompts) + generated_texts = [results[key] for key in sorted(results)] + assert generated_texts == expected_output + + +def test_llama_multilora_1x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_multilora, args=(sql_lora_files, 1)) + p.start() + p.join() + assert p.exitcode == 0, f"Results don't match with the reference" + + +def test_llama_multilora_2x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_multilora, args=(sql_lora_files, 2)) + p.start() + p.join() + assert p.exitcode == 0, f"Results don't match with the reference" + + +def test_llama_multilora_4x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_multilora, args=(sql_lora_files, 4)) + p.start() + p.join() + assert p.exitcode == 0, f"Results don't match with the reference" diff --git a/tests/lora/utils.py b/tests/lora/utils.py index b73cf5bf55324..6ed985e72e6b3 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -3,6 +3,7 @@ import torch from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.utils import get_device class DummyLoRAManager: @@ -28,16 +29,16 @@ def init_random_lora(self, lora_alpha=1, lora_a=torch.rand([weight.shape[1], rank], dtype=weight.dtype, - device="cuda"), + device=get_device()), lora_b=torch.rand([rank, weight.shape[0]], dtype=weight.dtype, - device="cuda"), + device=get_device()), ) if generate_embeddings_tensor: lora.embeddings_tensor = torch.rand(5, generate_embeddings_tensor, dtype=weight.dtype, - device="cuda") + device=get_device()) self.set_module_lora(module_name, lora) return lora @@ -53,8 +54,8 @@ def init_lora(self, module_name, rank=rank, lora_alpha=1, - lora_a=torch.rand([input_dim, rank], device="cuda"), - lora_b=torch.rand([rank, output_dim], device="cuda"), + lora_a=torch.rand([input_dim, rank], device=get_device()), + lora_b=torch.rand([rank, output_dim], device=get_device()), embeddings_tensor=embeddings_tensor, ) self.set_module_lora(module_name, lora) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 
80f8037a2d043..fe336e57d45ae 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -154,29 +154,32 @@ def execute_model( return output def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self.driver_worker.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") - - def list_loras(self) -> Set[int]: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.remove_lora(lora_id) def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.driver_worker.list_loras() def add_prompt_adapter( self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") def check_health(self) -> None: # GPUExecutor will always be healthy as long as diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 87de285a373a2..dd40f2d1239b9 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -27,6 +27,7 @@ LinearScalingRotaryEmbedding, RotaryEmbedding) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) +from vllm.utils import is_hpu if TYPE_CHECKING: pass @@ -63,6 +64,40 @@ def dec(*args, **kwargs): return dec +def custom_bgmv(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, indices: torch.LongTensor, layer_idx: int, scale: float,): + """ + wa_t_all and wb_t_all contains all LoRA A and LoRA B weight matrices stacked into a single tensor assuming same rank. + The corresponding LoRA A and B for each sample is selected based on indices. The avoids a for loop as well as graph breaks. + """ + assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' + max_loras = wa_t_all.size(0) + # Wrap-around for negative indices + indices = indices % max_loras + wa = torch.index_select(wa_t_all, 0, indices)[:,0,:,:].transpose(-1, -2) + wb = torch.index_select(wb_t_all, 0, indices)[:,0,:,:].transpose(-1, -2) + + x = x.unsqueeze(1) + out = x @ wa + out = out @ wb + out = out.squeeze(1) + y += out * scale + +def custom_bgmv_embed(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, indices: torch.LongTensor, layer_idx: int, scale: float,): + """ + wa_t_all and wb_t_all contains all LoRA A and LoRA B weight matrices stacked into a single tensor assuming same rank. 
+ The corresponding LoRA A and B for each sample is selected based on indices. The avoids a for loop as well as graph breaks. + """ + assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' + max_loras = wa_t_all.size(0) + # Wrap-around for negative indices + indices = indices % max_loras + wa = torch.index_select(wa_t_all, 0, indices)[:,0,:,:].transpose(-1, -2) + + x = x.unsqueeze(1) + out = x @ wa + out = out.squeeze(1) + y += out * scale + def _apply_lora( x: torch.Tensor, lora_a_stacked: torch.Tensor, @@ -89,7 +124,10 @@ def _apply_lora( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) indices = indices.view(-1) - add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) + if is_hpu(): + custom_bgmv(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) + else: + add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) return output.view_as(org_output) @@ -127,9 +165,13 @@ def _apply_lora_packed_nslice( indices = indices.view(-1) offset_left = 0 for slice_idx in range(len(output_slices)): - add_lora_slice(output, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], indices, 0, 1.0, offset_left, - output_slices[slice_idx]) + if is_hpu(): + custom_bgmv(output[:, offset_left: offset_left+output_slices[slice_idx]], x, lora_a_stacked[slice_idx], + lora_b_stacked[slice_idx], indices, 0, 1.0) + else: + add_lora_slice(output, x, lora_a_stacked[slice_idx], + lora_b_stacked[slice_idx], indices, 0, 1.0, offset_left, + output_slices[slice_idx]) offset_left += output_slices[slice_idx] return output.view_as(org_output) @@ -330,8 +372,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings = full_lora_a_embeddings.view( full_lora_a_embeddings.shape[0] * full_lora_a_embeddings.shape[1], -1) - bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + if is_hpu(): + custom_bgmv_embed(full_output, full_lora_a_embeddings, self.lora_b_stacked, self.indices[:self.indices_len[0]], 0, 1.0) + else: + bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) return full_output.view_as(full_output_org) @classmethod diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e1ede7d4d710a..c308d058ec784 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -24,7 +24,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA -from vllm.utils import is_pin_memory_available +from vllm.utils import is_pin_memory_available, is_hpu, get_device logger = init_logger(__name__) @@ -93,7 +93,7 @@ def convert_mapping( long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), - device="cuda", + device=get_device(), dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 @@ -118,9 +118,9 @@ def convert_mapping( if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") + indices = torch.tensor(indices_list, dtype=torch.long, device=get_device()) prompt_mapping_tensor = torch.tensor(prompt_mapping, - device="cuda", + device=get_device(), dtype=torch.long) embeddings_indices = torch.stack([ indices[2] * extra_vocab_size, @@ -133,7 +133,7 @@ def convert_mapping( 
sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 sampler_indices_padded = ( torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + + 0, len(sampler_indices_padded), device=get_device(), dtype=torch.long) + (sampler_indices_padded * len(sampler_indices_padded))) long_lora_indices = None long_lora_indices_len: Optional[int] = None @@ -424,20 +424,20 @@ def __init__( self.long_lora_context: Optional[LongContextLoRAContext] = None self.base_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) self.sampler_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) self.sampler_indices_padded = torch.empty(self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) self.embeddings_indices = torch.empty(2, self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) self.long_lora_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} @@ -465,11 +465,17 @@ def __init__( @property def capacity(self) -> int: - return self.lora_config.max_cpu_loras + if is_hpu(): + return self.lora_config.max_cpu_loras + 1 + else: + return self.lora_config.max_cpu_loras @property def lora_slots(self) -> int: - return self.lora_config.max_loras + if is_hpu(): + return self.lora_config.max_loras + 1 + else: + return self.lora_config.max_loras @property def adapter_slots(self) -> int: diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 64f87a4b2c69d..d6ee5ded67624 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -204,4 +204,4 @@ def add_lora_slice(y: torch.Tensor, buffer.size(1), y_slice_size, y_offset, - ) + ) \ No newline at end of file diff --git a/vllm/utils.py b/vllm/utils.py index fe84253feb172..bee667d350f47 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -969,6 +969,11 @@ def cuda_device_count_stateless() -> int: # after https://github.com/pytorch/pytorch/pull/122815 is released. 
return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) +def get_device() -> str: + if is_hpu(): + return "hpu" + return "cuda" + def error_on_invalid_device_count_status(): cache_entries = 0 diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e52b61539b540..955f0f036edc2 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -196,6 +196,10 @@ def forward(self, *args, **kwargs): hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) + from vllm.lora.layers import VocabParallelEmbeddingWithLoRA + if isinstance(self.model.model.embed_tokens, VocabParallelEmbeddingWithLoRA): + for i in range(0,4): + self.model.model.embed_tokens.indices_len[i] = selected_token_indices.numel() return hidden_states def compute_logits(self, *args, **kwargs): @@ -435,6 +439,22 @@ def load_model(self) -> None: f"took {m_getmodel.get_summary_string()}") logger.info(msg) + if self.lora_config: + assert hasattr(self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, ( + "Model does not support LoRA") + assert hasattr( + self.model, + "embedding_modules"), "Model does not have embedding_modules" + assert hasattr(self.model, "embedding_padding_modules" + ), "Model does not have embedding_padding_modules" + self.lora_manager = LRUCacheWorkerLoRAManager( + self.scheduler_config.max_num_seqs, + self.scheduler_config.max_num_batched_tokens, self.vocab_size, + self.lora_config, self.device, self.model.embedding_modules, + self.model.embedding_padding_modules) + self.model = self.lora_manager.create_lora_manager(self.model) + if self.model_config.quantization == 'inc': logger.info("Preparing model with INC..") with HabanaMemoryProfiler() as m_inc: @@ -467,27 +487,14 @@ def load_model(self) -> None: msg = f"Loading model weights took in total {m.get_summary_string()}" logger.info(msg) - if self.lora_config: - assert hasattr(self.model, "supported_lora_modules" - ) and self.model.supported_lora_modules, ( - "Model does not support LoRA") - assert hasattr( - self.model, - "embedding_modules"), "Model does not have embedding_modules" - assert hasattr(self.model, "embedding_padding_modules" - ), "Model does not have embedding_padding_modules" - self.lora_manager = LRUCacheWorkerLoRAManager( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens, self.vocab_size, - self.lora_config, self.device, self.model.embedding_modules, - self.model.embedding_padding_modules) - self.model = self.lora_manager.create_lora_manager(self.model) - def _use_graphs(self, batch_size, seq_len, is_prompt): if self.enforce_eager: return False return (batch_size, seq_len, is_prompt) in self.graphed_buckets + def _is_valid_bucket(self, bucket): + return bucket[0] * bucket[1] <= self.max_num_batched_tokens + def _setup_buckets(self) -> None: self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', @@ -520,6 +527,9 @@ def _setup_buckets(self) -> None: self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg) + if self.lora_config: + self.prompt_buckets[:] = [bucket for bucket in self.prompt_buckets if self._is_valid_bucket(bucket)] + msg = (f"Generated {len(self.prompt_buckets)} " f"prompt buckets: {list(sorted(self.prompt_buckets))}") logger.info(msg) @@ -530,6 +540,8 @@ def _setup_buckets(self) -> None: logger.info(msg) self.decode_buckets = 
warmup_buckets(self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg) + if self.lora_config: + self.decode_buckets[:] = [bucket for bucket in self.decode_buckets if self._is_valid_bucket(bucket)] msg = (f"Generated {len(self.decode_buckets)} decode buckets: " f"{list(sorted(self.decode_buckets))}") logger.info(msg) @@ -606,16 +618,6 @@ def _prepare_prompt( # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. input_positions.append(list(range(context_len, seq_len))) - lora_id = seq_group_metadata.lora_int_id - - if lora_id > 0: - lora_requests.add(seq_group_metadata.lora_request) - - lora_index_mapping += [lora_id] * (seq_len - context_len) - lora_prompt_mapping.append( - [lora_id] * - (seq_len - context_len - if seq_group_metadata.sampling_params.prompt_logprobs else 1)) if seq_group_metadata.multi_modal_data: multi_modal_input_list.append( @@ -674,6 +676,19 @@ def _prepare_prompt( max_prompt_len = max( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) + + for seq_group_metadata, context_len in zip(seq_group_metadata_list, context_lens): + lora_id = seq_group_metadata.lora_int_id + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + lora_index_mapping += [lora_id] * (max_prompt_len - context_len) + lora_prompt_mapping.extend( + [lora_id] * + (max_prompt_len - context_len + if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, pad=0, @@ -1027,7 +1042,7 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: ]) return attention_metadata - def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): + def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt, lora_request=None): sampling_params = SamplingParams(temperature=0) num_blocks = math.ceil(seq_len / self.block_size) if is_prompt: @@ -1048,6 +1063,7 @@ def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): seq_data={group_id: seq_data}, sampling_params=sampling_params, block_tables=block_tables, + lora_request=lora_request ) def profile_run(self) -> None: @@ -1055,21 +1071,55 @@ def profile_run(self) -> None: kv_caches = [None] * num_layers max_batch_size = self.prompt_bs_bucket_cfg[-1] max_seq_len = self.prompt_seq_bucket_cfg[-1] + if self.lora_config: + max_seq_len = self.max_num_batched_tokens // max_batch_size - self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches) + self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches, is_profile_run=True) def warmup_scenario(self, batch_size, seq_len, is_prompt, - kv_caches) -> None: + kv_caches, is_profile_run = False) -> None: use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) scenario_name = ("warmup_" f"{'prompt' if is_prompt else 'decode'}_" f"bs{batch_size}_" f"seq{seq_len}_" f"graphs{'T' if use_graphs else 'F'}") + max_num_seqs = self.scheduler_config.max_num_seqs + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request + # passed in, which contains a lora from the lora warmup path. 
+ dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] + if self.lora_config and is_profile_run: + assert self.lora_manager is not None + with self.lora_manager.dummy_lora_cache(): + for idx in range(self.lora_config.max_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_local_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] self.profiler.start('internal', scenario_name) times = 3 if use_graphs else 1 + if self.lora_config and not is_profile_run: + lora_mapping = LoRAMapping( + [0] * batch_size * seq_len, + [0] * batch_size *seq_len, + ) + self.set_active_loras(set(), lora_mapping) seqs = [ - self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) + self.create_dummy_seq_group_metadata(i, seq_len, is_prompt, + lora_request=dummy_lora_requests_per_seq[i] + if dummy_lora_requests_per_seq else None) for i in range(batch_size) ] torch.hpu.synchronize() @@ -1080,6 +1130,37 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, self.profiler.end() gc.collect() + def remove_all_loras(self): + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.remove_all_adapters() + + def set_active_loras(self, lora_requests: Set[LoRARequest], + lora_mapping: LoRAMapping) -> None: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.set_active_adapters(lora_requests, lora_mapping) + + def add_lora(self, lora_request: LoRARequest) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.add_adapter(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_adapter(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.pin_adapter(lora_id) + + def list_loras(self) -> Set[int]: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.list_adapters() + def log_warmup(self, phase, i, max_i, batch_size, seq_len): free_mem = format_bytes( HabanaMemoryProfiler.current_free_device_memory()) @@ -1403,9 +1484,11 @@ def execute_model( raise ValueError( "num_steps > 1 is not supported in HabanaModelRunner") - # NOTE(kzawora): Need to restore this after adding LoRA - # if self.lora_config: - # self.set_active_loras(lora_requests, lora_mapping) + if self.lora_config: + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) input_tokens = model_input.input_tokens input_positions = model_input.input_positions attn_metadata = model_input.attn_metadata diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 87122c03d3c8f..bff5dc3109247 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -174,9 +174,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: num_hpu_blocks = max(num_hpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) - # NOTE(kzawora): Restore this once LoRA support is added - # if 
self.model_runner.lora_manager: - # self.model_runner.remove_all_loras() + if self.model_runner.lora_manager: + self.model_runner.remove_all_loras() gc.collect() return num_hpu_blocks, num_cpu_blocks @@ -279,29 +278,29 @@ def execute_worker(self, worker_input: WorkerInput) -> None: self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + return self.model_runner.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") - - def list_loras(self) -> Set[int]: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + return self.model_runner.remove_lora(lora_id) def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + return self.model_runner.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.model_runner.list_loras() def add_prompt_adapter( self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") def shutdown_inc(self): self.model_runner.shutdown_inc() From f039d7c7ad5f69cbfc7a611e1463190041b172fd Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Fri, 9 Aug 2024 10:56:50 +0300 Subject: [PATCH 02/10] Fix formatting --- examples/lora_inference_hpu.py | 39 +++++++------- tests/conftest.py | 9 ++-- tests/lora/conftest.py | 9 ++-- tests/lora/test_llama_hpu.py | 15 ++---- tests/lora/test_lora_hpu.py | 34 ++++++------ tests/lora/test_multilora_hpu.py | 35 ++++++------ vllm/executor/habana_executor.py | 12 +++-- vllm/lora/layers.py | 55 +++++++++++++------ vllm/lora/models.py | 10 ++-- vllm/lora/punica.py | 2 +- vllm/utils.py | 1 + vllm/worker/habana_model_runner.py | 85 +++++++++++++++++++----------- vllm/worker/habana_worker.py | 12 +++-- 13 files changed, 183 insertions(+), 135 deletions(-) diff --git a/examples/lora_inference_hpu.py b/examples/lora_inference_hpu.py index 8c50f42febc7c..b8154a29a82bb 100644 --- a/examples/lora_inference_hpu.py +++ b/examples/lora_inference_hpu.py @@ -1,24 +1,26 @@ from huggingface_hub import snapshot_download + from vllm import LLM, SamplingParams from vllm.lora.request import LoRARequest sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") -llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True, max_num_seqs=2, dtype='bfloat16') +llm = LLM(model="meta-llama/Llama-2-7b-hf", + enable_lora=True, + max_num_seqs=2, + dtype='bfloat16') -sampling_params = SamplingParams( - temperature=0, - max_tokens=1024, - stop=["[/assistant]"] -) +sampling_params = SamplingParams(temperature=0, + max_tokens=1024, + stop=["[/assistant]"]) prompts = [ - "[user] 
Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
[/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 ] expected_output = [ @@ -30,15 +32,16 @@ " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 ] -outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) -) +outputs = llm.generate(prompts, + sampling_params, + lora_request=LoRARequest("sql_adapter", 1, + sql_lora_path)) for i, output in enumerate(outputs): prompt = output.prompt generated_text = output.outputs[0].text match = expected_output[i] == generated_text if not match: - print(f"Comparison failed for request_id::{i}\n\t[PROMPT]{prompt!r}\n\t[GENERATED]{generated_text!r}\n\t[EXPECTED]{expected_output[i]!r}") + print( + f"Comparison failed for request_id::{i}\n\t[PROMPT]{prompt!r}\n\t[GENERATED]{generated_text!r}\n\t[EXPECTED]{expected_output[i]!r}" # noqa: E501 + ) diff --git a/tests/conftest.py b/tests/conftest.py index ecc418b381cb1..cfb7cf56b519a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -589,12 +589,11 @@ def caplog_vllm(temporary_enable_log_propagate, caplog): # because caplog depends on logs propagated to the root logger. yield caplog + def is_hpu(): - try: - import habana_frameworks.torch as htorch - return True - except: - return False + from importlib import util + return util.find_spec('habana_frameworks') is not None + @pytest.fixture(scope="session") def num_gpus_available(): diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index fcf0b82e9d380..3e4c8be6dbaa3 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -47,12 +47,11 @@ class ContextInfo(TypedDict): "context_length": "32k", }] + def is_hpu(): - try: - import habana_frameworks.torch as htorch - return True - except: - return False + from importlib import util + return util.find_spec('habana_frameworks') is not None + def cleanup(): destroy_model_parallel() diff --git a/tests/lora/test_llama_hpu.py b/tests/lora/test_llama_hpu.py index 4095c18d5317c..4ba8a48500728 100644 --- a/tests/lora/test_llama_hpu.py +++ b/tests/lora/test_llama_hpu.py @@ -1,16 +1,11 @@ +from multiprocessing import Process from typing import List -import os -import pytest -import ray - -from multiprocessing import Process +from conftest import cleanup import vllm from vllm.lora.request import LoRARequest -from conftest import cleanup - MODEL_PATH = "meta-llama/Llama-2-7b-hf" @@ -85,7 +80,7 @@ def test_llama_lora_1x(sql_lora_files): p = Process(target=_test_llama_lora, args=(sql_lora_files, 1)) p.start() p.join() - assert p.exitcode == 0, f"Results don't match with the reference" + assert p.exitcode == 0, "Results don't match with the reference" def test_llama_lora_2x(sql_lora_files): @@ -93,7 +88,7 @@ def test_llama_lora_2x(sql_lora_files): p = Process(target=_test_llama_lora, args=(sql_lora_files, 2)) p.start() p.join() - assert p.exitcode == 0, f"Results don't match with the reference" + assert p.exitcode == 0, "Results don't match with the reference" def test_llama_lora_4x(sql_lora_files): @@ -101,4 +96,4 @@ def test_llama_lora_4x(sql_lora_files): p = Process(target=_test_llama_lora, args=(sql_lora_files, 4)) p.start() p.join() - assert p.exitcode == 0, f"Results don't match with the reference" + assert p.exitcode == 0, "Results don't match with the 
reference" diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index 442303b087415..ddbab66e166b3 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -21,6 +21,7 @@ } MAX_LORAS = 8 + @pytest.mark.parametrize("m", TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) @@ -38,13 +39,13 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: input = torch.rand(k, n, device="hpu", dtype=dtype) expected = input @ lora.lora_a @ lora.lora_b * lora.scaling - lora_a_stack = torch.zeros(MAX_LORAS+1, + lora_a_stack = torch.zeros(MAX_LORAS + 1, 1, lora.lora_a.shape[1], lora.lora_a.shape[0], device="hpu", dtype=dtype) - lora_b_stack = torch.zeros(MAX_LORAS+1, + lora_b_stack = torch.zeros(MAX_LORAS + 1, 1, lora.lora_b.shape[1], lora.lora_b.shape[0], @@ -55,10 +56,9 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T output = torch.zeros(k, m, device="hpu", dtype=dtype) - _apply_lora( - input, lora_a_stack, lora_b_stack, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), - output) + _apply_lora(input, lora_a_stack, lora_b_stack, + torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), + output) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) @@ -99,7 +99,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: dim=1) lora_a_stacks = [ - torch.zeros(MAX_LORAS+1, + torch.zeros(MAX_LORAS + 1, 1, lora_1.lora_a.shape[1], lora_1.lora_a.shape[0], @@ -107,7 +107,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: dtype=dtype) for i in range(2) ] lora_b_stacks = [ - torch.zeros(MAX_LORAS+1, + torch.zeros(MAX_LORAS + 1, 1, lora_1.lora_b.shape[1], lora_1.lora_b.shape[0], @@ -123,9 +123,8 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: output = torch.zeros(k, m, device="hpu", dtype=dtype) _apply_lora_packed_nslice( input, lora_a_stacks, lora_b_stacks, - torch.randint(0, - MAX_LORAS, (len(input), ), - device="hpu"), output, (m // 2, m // 2)) + torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, + (m // 2, m // 2)) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) @@ -167,14 +166,14 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: dim=1) lora_a_stacks = [ - torch.zeros(MAX_LORAS+1, + torch.zeros(MAX_LORAS + 1, 1, lora_q.lora_a.shape[1], lora_q.lora_a.shape[0], device="hpu", dtype=dtype) ] + [ - torch.zeros(MAX_LORAS+1, + torch.zeros(MAX_LORAS + 1, 1, lora_k.lora_a.shape[1], lora_k.lora_a.shape[0], @@ -182,14 +181,14 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: dtype=dtype) for i in range(2) ] lora_b_stacks = [ - torch.zeros(MAX_LORAS+1, + torch.zeros(MAX_LORAS + 1, 1, lora_q.lora_b.shape[1], lora_q.lora_b.shape[0], device="hpu", dtype=dtype) ] + [ - torch.zeros(MAX_LORAS+1, + torch.zeros(MAX_LORAS + 1, 1, lora_k.lora_b.shape[1], lora_k.lora_b.shape[0], @@ -207,9 +206,8 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: output = torch.zeros(k, sum(qkv), device="hpu", dtype=dtype) _apply_lora_packed_nslice( input, lora_a_stacks, lora_b_stacks, - torch.randint(0, - MAX_LORAS, (len(input), ), - device="hpu"), output, (qkv[0], qkv[1], qkv[2])) + torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, + (qkv[0], qkv[1], qkv[2])) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) diff --git 
a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py index f002a544ca7a2..3d6c5e6ef76aa 100644 --- a/tests/lora/test_multilora_hpu.py +++ b/tests/lora/test_multilora_hpu.py @@ -1,13 +1,9 @@ +from multiprocessing import Process from typing import List, Optional, Tuple -from huggingface_hub import snapshot_download - from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams from vllm.lora.request import LoRARequest -from multiprocessing import Process - -import os def create_test_prompts( lora_path: str @@ -20,7 +16,7 @@ def create_test_prompts( with the second LoRA adapter will be ran after all requests with the first adapter have finished. """ - # TODO Fix issues when enabling paramerters [presence_penalty=0.2, + # TODO Fix issues when enabling parameters [presence_penalty=0.2, # (n=3, best_of=3, use_beam_search=True)] in SamplingParams. return [ @@ -30,9 +26,7 @@ def create_test_prompts( prompt_logprobs=1, max_tokens=128), None), ("To be or not to be,", - SamplingParams(temperature=0.8, - top_k=5, - max_tokens=128), None), + SamplingParams(temperature=0.8, top_k=5, max_tokens=128), None), ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 SamplingParams(temperature=0.0, @@ -84,19 +78,22 @@ def process_requests(engine: LLMEngine, for request_output in request_outputs: if request_output.finished: - result[request_output.request_id] = request_output.outputs[0].text + result[ + request_output.request_id] = request_output.outputs[0].text return result + # References from GPU with dtype=bfloat16 expected_output = [ -" or, through inaction, allow a human being to come to harm.\nA robot must obey the orders given it by human beings except where such orders would conflict with the First Law.\nA robot must protect its own existence as long as such protection does not conflict with the First or Second Law.\nThe Three Laws of Robotics were created by Isaac Asimov in 1942. They are the foundation of robotics and artificial intelligence.\nThe Three Laws of Robotics are the foundation of robotics and artificial intelligence. They were created by Isaac Asimov in 194", -" that is the question.\nIt is the most famous line in all of Shakespeare\'s plays and one of the most famous in all of English Literature. The quote is from Hamlet, Prince of Denmark, Act III, Scene I. In this scene, the ghost of Hamlet\'s father appears to his son and asks him to avenge his death. The ghost tells Hamlet of the murder of the king and how he was done in by his brother, Claudius. Hamlet is distraught and confused by the revelation and the ghost asks Hamlet to \"Revenge", -" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", -" SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' ", -" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", -" SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' " + " or, through inaction, allow a human being to come to harm.\nA robot must obey the orders given it by human beings except where such orders would conflict with the First Law.\nA robot must protect its own existence as long as such protection does not conflict with the First or Second Law.\nThe Three Laws of Robotics were created by Isaac Asimov in 1942. 
They are the foundation of robotics and artificial intelligence.\nThe Three Laws of Robotics are the foundation of robotics and artificial intelligence. They were created by Isaac Asimov in 194", # noqa: E501 + " that is the question.\nIt is the most famous line in all of Shakespeare\'s plays and one of the most famous in all of English Literature. The quote is from Hamlet, Prince of Denmark, Act III, Scene I. In this scene, the ghost of Hamlet\'s father appears to his son and asks him to avenge his death. The ghost tells Hamlet of the murder of the king and how he was done in by his brother, Claudius. Hamlet is distraught and confused by the revelation and the ghost asks Hamlet to \"Revenge", # noqa: E501 + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' ", # noqa: E501 + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' " # noqa: E501 ] + def _test_llama_multilora(sql_lora_files, tp_size): """Main function that sets up and runs the prompt processing.""" engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf", @@ -118,7 +115,7 @@ def test_llama_multilora_1x(sql_lora_files): p = Process(target=_test_llama_multilora, args=(sql_lora_files, 1)) p.start() p.join() - assert p.exitcode == 0, f"Results don't match with the reference" + assert p.exitcode == 0, "Results don't match with the reference" def test_llama_multilora_2x(sql_lora_files): @@ -126,7 +123,7 @@ def test_llama_multilora_2x(sql_lora_files): p = Process(target=_test_llama_multilora, args=(sql_lora_files, 2)) p.start() p.join() - assert p.exitcode == 0, f"Results don't match with the reference" + assert p.exitcode == 0, "Results don't match with the reference" def test_llama_multilora_4x(sql_lora_files): @@ -134,4 +131,4 @@ def test_llama_multilora_4x(sql_lora_files): p = Process(target=_test_llama_multilora, args=(sql_lora_files, 4)) p.start() p.join() - assert p.exitcode == 0, f"Results don't match with the reference" + assert p.exitcode == 0, "Results don't match with the reference" diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index fe336e57d45ae..baeaec5afa371 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -170,16 +170,20 @@ def list_loras(self) -> Set[int]: def add_prompt_adapter( self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def check_health(self) -> None: # GPUExecutor will always be healthy as long as diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py 
index dd40f2d1239b9..3ffd3f7da0dd1 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -64,17 +64,27 @@ def dec(*args, **kwargs): return dec -def custom_bgmv(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, indices: torch.LongTensor, layer_idx: int, scale: float,): +def custom_bgmv( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indices: torch.LongTensor, + layer_idx: int, + scale: float, +): """ - wa_t_all and wb_t_all contains all LoRA A and LoRA B weight matrices stacked into a single tensor assuming same rank. - The corresponding LoRA A and B for each sample is selected based on indices. The avoids a for loop as well as graph breaks. + `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices + stacked into a single tensor, assuming same rank. The corresponding LoRA + A and B for each sample is selected based on `indices`. This avoids a + for-loop as well as graph breaks. """ assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' max_loras = wa_t_all.size(0) # Wrap-around for negative indices indices = indices % max_loras - wa = torch.index_select(wa_t_all, 0, indices)[:,0,:,:].transpose(-1, -2) - wb = torch.index_select(wb_t_all, 0, indices)[:,0,:,:].transpose(-1, -2) + wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) x = x.unsqueeze(1) out = x @ wa @@ -82,22 +92,33 @@ def custom_bgmv(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, wb_t_a out = out.squeeze(1) y += out * scale -def custom_bgmv_embed(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, indices: torch.LongTensor, layer_idx: int, scale: float,): + +def custom_bgmv_embed( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + indices: torch.LongTensor, + layer_idx: int, + scale: float, +): """ - wa_t_all and wb_t_all contains all LoRA A and LoRA B weight matrices stacked into a single tensor assuming same rank. - The corresponding LoRA A and B for each sample is selected based on indices. The avoids a for loop as well as graph breaks. + `wa_t_all` contains all LoRA A weight matrices stacked into a single + tensor, assuming same rank. The corresponding LoRA A for each sample is + selected based on `indices`. This avoids a for-loop as well as + graph breaks. 
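As a minimal CPU-only illustration of the index-select BGMV described in these docstrings (toy shapes and invented names; a sketch of the idea rather than the kernels themselves), selecting the stacked, transposed A/B per sample and using batched matmuls reproduces a per-sample LoRA loop:

import torch

batch, hidden_in, hidden_out, rank, num_loras = 4, 8, 6, 2, 3

x = torch.randn(batch, hidden_in)
# Per-adapter LoRA weights: A is (hidden_in, rank), B is (rank, hidden_out).
lora_a = [torch.randn(hidden_in, rank) for _ in range(num_loras)]
lora_b = [torch.randn(rank, hidden_out) for _ in range(num_loras)]

# Stacked, transposed form analogous to wa_t_all / wb_t_all.
wa_t_all = torch.stack([a.T.unsqueeze(0) for a in lora_a])  # (num_loras, 1, rank, hidden_in)
wb_t_all = torch.stack([b.T.unsqueeze(0) for b in lora_b])  # (num_loras, 1, hidden_out, rank)

indices = torch.tensor([0, 2, 1, 0])  # adapter id chosen per sample
wa = wa_t_all.index_select(0, indices)[:, 0].transpose(-1, -2)  # (batch, hidden_in, rank)
wb = wb_t_all.index_select(0, indices)[:, 0].transpose(-1, -2)  # (batch, rank, hidden_out)
batched = (x.unsqueeze(1) @ wa @ wb).squeeze(1)                 # (batch, hidden_out)

# Reference: the same computation with an explicit per-sample loop.
ref = torch.stack([x[s] @ lora_a[int(indices[s])] @ lora_b[int(indices[s])]
                   for s in range(batch)])
assert torch.allclose(batched, ref, atol=1e-5)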
""" assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' max_loras = wa_t_all.size(0) # Wrap-around for negative indices indices = indices % max_loras - wa = torch.index_select(wa_t_all, 0, indices)[:,0,:,:].transpose(-1, -2) + wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) x = x.unsqueeze(1) out = x @ wa out = out.squeeze(1) y += out * scale + def _apply_lora( x: torch.Tensor, lora_a_stacked: torch.Tensor, @@ -166,12 +187,14 @@ def _apply_lora_packed_nslice( offset_left = 0 for slice_idx in range(len(output_slices)): if is_hpu(): - custom_bgmv(output[:, offset_left: offset_left+output_slices[slice_idx]], x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], indices, 0, 1.0) + custom_bgmv( + output[:, offset_left:offset_left + output_slices[slice_idx]], + x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], + indices, 0, 1.0) else: add_lora_slice(output, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], indices, 0, 1.0, offset_left, - output_slices[slice_idx]) + lora_b_stacked[slice_idx], indices, 0, 1.0, + offset_left, output_slices[slice_idx]) offset_left += output_slices[slice_idx] return output.view_as(org_output) @@ -373,10 +396,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[0] * full_lora_a_embeddings.shape[1], -1) if is_hpu(): - custom_bgmv_embed(full_output, full_lora_a_embeddings, self.lora_b_stacked, self.indices[:self.indices_len[0]], 0, 1.0) + custom_bgmv_embed(full_output, full_lora_a_embeddings, + self.lora_b_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) else: bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + self.indices[:self.indices_len[0]], 0, 1.0) return full_output.view_as(full_output_org) @classmethod diff --git a/vllm/lora/models.py b/vllm/lora/models.py index c308d058ec784..a2765e364599b 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -24,7 +24,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA -from vllm.utils import is_pin_memory_available, is_hpu, get_device +from vllm.utils import get_device, is_hpu, is_pin_memory_available logger = init_logger(__name__) @@ -131,10 +131,10 @@ def convert_mapping( sampler_indices = prompt_mapping_tensor sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 - sampler_indices_padded = ( - torch.arange( - 0, len(sampler_indices_padded), device=get_device(), dtype=torch.long) + - (sampler_indices_padded * len(sampler_indices_padded))) + sampler_indices_padded = (torch.arange( + 0, len(sampler_indices_padded), device=get_device(), dtype=torch.long) + + (sampler_indices_padded * + len(sampler_indices_padded))) long_lora_indices = None long_lora_indices_len: Optional[int] = None if long_lora_context: diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index d6ee5ded67624..64f87a4b2c69d 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -204,4 +204,4 @@ def add_lora_slice(y: torch.Tensor, buffer.size(1), y_slice_size, y_offset, - ) \ No newline at end of file + ) diff --git a/vllm/utils.py b/vllm/utils.py index bee667d350f47..fa6e132dd3522 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -969,6 +969,7 @@ def cuda_device_count_stateless() -> int: # after https://github.com/pytorch/pytorch/pull/122815 is released. 
return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) + def get_device() -> str: if is_hpu(): return "hpu" diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 955f0f036edc2..88ff721a0af5e 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -197,9 +197,11 @@ def forward(self, *args, **kwargs): hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) from vllm.lora.layers import VocabParallelEmbeddingWithLoRA - if isinstance(self.model.model.embed_tokens, VocabParallelEmbeddingWithLoRA): - for i in range(0,4): - self.model.model.embed_tokens.indices_len[i] = selected_token_indices.numel() + if isinstance(self.model.model.embed_tokens, + VocabParallelEmbeddingWithLoRA): + for i in range(0, 4): + self.model.model.embed_tokens.indices_len[ + i] = selected_token_indices.numel() return hidden_states def compute_logits(self, *args, **kwargs): @@ -441,17 +443,18 @@ def load_model(self) -> None: if self.lora_config: assert hasattr(self.model, "supported_lora_modules" - ) and self.model.supported_lora_modules, ( - "Model does not support LoRA") + ) and self.model.supported_lora_modules, ( + "Model does not support LoRA") + assert hasattr(self.model, "embedding_modules" + ), "Model does not have embedding_modules" assert hasattr( - self.model, - "embedding_modules"), "Model does not have embedding_modules" - assert hasattr(self.model, "embedding_padding_modules" - ), "Model does not have embedding_padding_modules" + self.model, "embedding_padding_modules" + ), "Model does not have embedding_padding_modules" self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens, self.vocab_size, - self.lora_config, self.device, self.model.embedding_modules, + self.scheduler_config.max_num_batched_tokens, + self.vocab_size, self.lora_config, self.device, + self.model.embedding_modules, self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) @@ -528,7 +531,10 @@ def _setup_buckets(self) -> None: self.prompt_seq_bucket_cfg) if self.lora_config: - self.prompt_buckets[:] = [bucket for bucket in self.prompt_buckets if self._is_valid_bucket(bucket)] + self.prompt_buckets[:] = [ + bucket for bucket in self.prompt_buckets + if self._is_valid_bucket(bucket) + ] msg = (f"Generated {len(self.prompt_buckets)} " f"prompt buckets: {list(sorted(self.prompt_buckets))}") @@ -541,7 +547,10 @@ def _setup_buckets(self) -> None: self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg) if self.lora_config: - self.decode_buckets[:] = [bucket for bucket in self.decode_buckets if self._is_valid_bucket(bucket)] + self.decode_buckets[:] = [ + bucket for bucket in self.decode_buckets + if self._is_valid_bucket(bucket) + ] msg = (f"Generated {len(self.decode_buckets)} decode buckets: " f"{list(sorted(self.decode_buckets))}") logger.info(msg) @@ -677,7 +686,8 @@ def _prepare_prompt( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) - for seq_group_metadata, context_len in zip(seq_group_metadata_list, context_lens): + for seq_group_metadata, context_len in zip(seq_group_metadata_list, + context_lens): lora_id = seq_group_metadata.lora_int_id if lora_id > 0: @@ -1042,7 +1052,11 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: ]) return attention_metadata - def 
create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt, lora_request=None): + def create_dummy_seq_group_metadata(self, + group_id, + seq_len, + is_prompt, + lora_request=None): sampling_params = SamplingParams(temperature=0) num_blocks = math.ceil(seq_len / self.block_size) if is_prompt: @@ -1057,14 +1071,12 @@ def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt, lora_req output_token_ids = [1] * output_len seq_data = SequenceData(prompt_token_ids) seq_data.output_token_ids = output_token_ids - return SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=(output_len == 0), - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=block_tables, - lora_request=lora_request - ) + return SequenceGroupMetadata(request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + lora_request=lora_request) def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) @@ -1074,10 +1086,18 @@ def profile_run(self) -> None: if self.lora_config: max_seq_len = self.max_num_batched_tokens // max_batch_size - self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches, is_profile_run=True) - - def warmup_scenario(self, batch_size, seq_len, is_prompt, - kv_caches, is_profile_run = False) -> None: + self.warmup_scenario(max_batch_size, + max_seq_len, + True, + kv_caches, + is_profile_run=True) + + def warmup_scenario(self, + batch_size, + seq_len, + is_prompt, + kv_caches, + is_profile_run=False) -> None: use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) scenario_name = ("warmup_" f"{'prompt' if is_prompt else 'decode'}_" @@ -1113,13 +1133,16 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, if self.lora_config and not is_profile_run: lora_mapping = LoRAMapping( [0] * batch_size * seq_len, - [0] * batch_size *seq_len, + [0] * batch_size * seq_len, ) self.set_active_loras(set(), lora_mapping) seqs = [ - self.create_dummy_seq_group_metadata(i, seq_len, is_prompt, - lora_request=dummy_lora_requests_per_seq[i] - if dummy_lora_requests_per_seq else None) + self.create_dummy_seq_group_metadata( + i, + seq_len, + is_prompt, + lora_request=dummy_lora_requests_per_seq[i] + if dummy_lora_requests_per_seq else None) for i in range(batch_size) ] torch.hpu.synchronize() diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index bff5dc3109247..9d083915041fe 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -291,16 +291,20 @@ def list_loras(self) -> Set[int]: def add_prompt_adapter( self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError("Prompt Adapter is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU 
backend.") def shutdown_inc(self): self.model_runner.shutdown_inc() From 319cfc7a2c7d93fb0ce263171981de87cd2b19ba Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Mon, 12 Aug 2024 09:27:38 +0300 Subject: [PATCH 03/10] Update custom_bgmv docstring ...Also update test reference for test_multilora_hpu.py --- tests/lora/test_multilora_hpu.py | 15 ++++++-------- vllm/lora/layers.py | 34 +++++++++++++++++++++++++------- vllm/lora/models.py | 8 ++++++++ 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py index 3d6c5e6ef76aa..80f1b8f2e0bf9 100644 --- a/tests/lora/test_multilora_hpu.py +++ b/tests/lora/test_multilora_hpu.py @@ -12,13 +12,7 @@ def create_test_prompts( 2 requests for base model, 4 requests for the LoRA. We define 2 different LoRA adapters (using the same model for demo purposes). - Since we also set `max_loras=1`, the expectation is that the requests - with the second LoRA adapter will be ran after all requests with the - first adapter have finished. """ - # TODO Fix issues when enabling parameters [presence_penalty=0.2, - # (n=3, best_of=3, use_beam_search=True)] in SamplingParams. - return [ ("A robot may not injure a human being", SamplingParams(temperature=0.0, @@ -26,7 +20,10 @@ def create_test_prompts( prompt_logprobs=1, max_tokens=128), None), ("To be or not to be,", - SamplingParams(temperature=0.8, top_k=5, max_tokens=128), None), + SamplingParams(temperature=0.8, + top_k=5, + presence_penalty=0.2, + max_tokens=128), None), ( "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 SamplingParams(temperature=0.0, @@ -86,7 +83,7 @@ def process_requests(engine: LLMEngine, # References from GPU with dtype=bfloat16 expected_output = [ " or, through inaction, allow a human being to come to harm.\nA robot must obey the orders given it by human beings except where such orders would conflict with the First Law.\nA robot must protect its own existence as long as such protection does not conflict with the First or Second Law.\nThe Three Laws of Robotics were created by Isaac Asimov in 1942. They are the foundation of robotics and artificial intelligence.\nThe Three Laws of Robotics are the foundation of robotics and artificial intelligence. They were created by Isaac Asimov in 194", # noqa: E501 - " that is the question.\nIt is the most famous line in all of Shakespeare\'s plays and one of the most famous in all of English Literature. The quote is from Hamlet, Prince of Denmark, Act III, Scene I. In this scene, the ghost of Hamlet\'s father appears to his son and asks him to avenge his death. The ghost tells Hamlet of the murder of the king and how he was done in by his brother, Claudius. Hamlet is distraught and confused by the revelation and the ghost asks Hamlet to \"Revenge", # noqa: E501 + " that is the question.\nIt is the most famous line in all of Shakespeare's plays and one of the most famous in English literature. The question is not whether or not to be, but rather the question of who to be.\nIn Hamlet's case, the question is whether or not to be a good person. He is torn between the goodness of his father and the evil of his mother.\nThe question is a difficult one, and one that has been asked many times before. 
In Hamlet's case, the question is whether or not to be a good person, and he is torn between the", # noqa: E501 " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 " SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' ", # noqa: E501 " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 @@ -98,7 +95,7 @@ def _test_llama_multilora(sql_lora_files, tp_size): """Main function that sets up and runs the prompt processing.""" engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf", enable_lora=True, - max_loras=6, + max_loras=2, max_lora_rank=8, max_num_seqs=16, dtype='bfloat16', diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 3ffd3f7da0dd1..912cd0b47202a 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -75,9 +75,19 @@ def custom_bgmv( ): """ `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices - stacked into a single tensor, assuming same rank. The corresponding LoRA - A and B for each sample is selected based on `indices`. This avoids a - for-loop as well as graph breaks. + stacked into single tensors, assuming same rank. HPU handles no-LoRA + requests using zero valued A and B tensors. These zero valued tensors are + appended at the end of `wa_t_all` and `wb_t_all` during initialization. For + custom BGMV, the corresponding `wa` and `wb` for each batch is created + based on the lora_index of each sample. + + For example: + `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, + hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles + no-LoRA case. The `wa` tensor for a batch of size batch_Size will have + a shape of (batch_size, num_layers, hidden_dim, lora_rank) + + This method avoids for-loop as well as graph breaks. """ assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' max_loras = wa_t_all.size(0) @@ -102,10 +112,20 @@ def custom_bgmv_embed( scale: float, ): """ - `wa_t_all` contains all LoRA A weight matrices stacked into a single - tensor, assuming same rank. The corresponding LoRA A for each sample is - selected based on `indices`. This avoids a for-loop as well as - graph breaks. + `wa_t_all` contains all LoRA A weight matrices stacked into a single tensor + assuming same rank. HPU handles no-LoRA requests using zero valued A + tensor. This zero valued tensor is appended at the end of `wa_t_all` during + initialization. For custom BGMV, the corresponding wa for each batch is + created based on the lora_index of the sample. + + For example: + `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, + hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles + no-LoRA case. The wa tensor for a batch of size batch_Size will have a + shape of (batch_size, num_layers, lora_rank, hidden_dim) + + + This method avoids for-loop as well as graph breaks. """ assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' max_loras = wa_t_all.size(0) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index a2765e364599b..30d2fd9502977 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -466,6 +466,11 @@ def __init__( @property def capacity(self) -> int: if is_hpu(): + # HPU handles no LoRA requests using zero valued A and B tensors. + # These zero valued tensors are appended at the end of A and B, + # making total number of loras to be lora_config.max_cpu_loras + 1. 
+ # This demands the total number of max_cpu_loras to be + # lora_config.max_cpu_loras + 1 return self.lora_config.max_cpu_loras + 1 else: return self.lora_config.max_cpu_loras @@ -473,6 +478,9 @@ def capacity(self) -> int: @property def lora_slots(self) -> int: if is_hpu(): + # HPU handles no LoRA requests using zero valued A and B tensors. + # These zero valued tensors are appended at the end of A and B, + # making total number of loras to be lora_config.max_cpu_loras + 1. return self.lora_config.max_loras + 1 else: return self.lora_config.max_loras From 9f729367a3234f4937190c83648ca5fd69ec23ef Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Mon, 12 Aug 2024 18:53:40 +0530 Subject: [PATCH 04/10] Make log_prompt change model agnostic --- vllm/worker/habana_model_runner.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 88ff721a0af5e..d2267c7de8d12 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -197,11 +197,12 @@ def forward(self, *args, **kwargs): hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) from vllm.lora.layers import VocabParallelEmbeddingWithLoRA - if isinstance(self.model.model.embed_tokens, - VocabParallelEmbeddingWithLoRA): - for i in range(0, 4): - self.model.model.embed_tokens.indices_len[ - i] = selected_token_indices.numel() + property = vars(self.model.model) + modules = list(property['_modules'].values()) + for module in modules: + if isinstance(module, VocabParallelEmbeddingWithLoRA): + for i in range(0, 4): + module.indices_len[i] = selected_token_indices.numel() return hidden_states def compute_logits(self, *args, **kwargs): From d6120c35a8a1f55fb430153e118f1d4fe64cd936 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Mon, 12 Aug 2024 20:02:09 +0530 Subject: [PATCH 05/10] Make block size compliant to max_num_batched_tokens for LoRA --- vllm/worker/habana_model_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d2267c7de8d12..c0b5d7588abbb 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -451,6 +451,8 @@ def load_model(self) -> None: assert hasattr( self.model, "embedding_padding_modules" ), "Model does not have embedding_padding_modules" + if self.block_size > self.scheduler_config.max_num_batched_tokens // self.scheduler_config.max_num_seqs: + self.block_size = self.scheduler_config.max_num_batched_tokens // self.scheduler_config.max_num_seqs self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens, From 0e77af2b6e96de9432bc5fc49296e1ac5e2759ca Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Tue, 13 Aug 2024 13:08:13 +0300 Subject: [PATCH 06/10] Move embedding index select to execute model --- vllm/worker/habana_model_runner.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index c0b5d7588abbb..6d2f4f9c3ca20 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -196,13 +196,6 @@ def forward(self, *args, **kwargs): hidden_states = self.model(*args, **kwargs) hidden_states = 
hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) - from vllm.lora.layers import VocabParallelEmbeddingWithLoRA - property = vars(self.model.model) - modules = list(property['_modules'].values()) - for module in modules: - if isinstance(module, VocabParallelEmbeddingWithLoRA): - for i in range(0, 4): - module.indices_len[i] = selected_token_indices.numel() return hidden_states def compute_logits(self, *args, **kwargs): @@ -1561,6 +1554,17 @@ def execute_model( selected_token_indices=sampling_metadata.selected_token_indices ) + from vllm.lora.layers import VocabParallelEmbeddingWithLoRA + property = vars(self.model.model) + model = list(property['_modules'].values())[0] + property = vars(model) + modules = list(property['_modules'].values()) + for module in modules: + if isinstance(module, VocabParallelEmbeddingWithLoRA): + for i in range(0, 4): + module.indices_len[ + i] = sampling_metadata.selected_token_indices.numel() + # Compute the logits. with self.profiler.record_event( 'internal', ('compute_logits_' From 24c9ccbb65eea62b4dcfa85890ccae541178e41e Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Tue, 13 Aug 2024 14:22:57 +0300 Subject: [PATCH 07/10] Update test dtype as bfloat16 and minor fixes --- tests/lora/test_llama_hpu.py | 13 +++++++------ tests/lora/test_multilora_hpu.py | 6 +++--- vllm/worker/habana_model_runner.py | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/lora/test_llama_hpu.py b/tests/lora/test_llama_hpu.py index 4ba8a48500728..ff1952109dd77 100644 --- a/tests/lora/test_llama_hpu.py +++ b/tests/lora/test_llama_hpu.py @@ -41,13 +41,14 @@ def _test_llama_lora(sql_lora_files, tp_size): enable_lora=True, max_num_seqs=16, max_loras=4, + dtype='bfloat16', tensor_parallel_size=tp_size) expected_no_lora_output = [ "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 - "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? 
[/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kamu/ [kamɯ́]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kamu/ [kamɯ́]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio.\n\n answer:\n\n SELECT people.sex, AVG(unsure_rate) AS avg_unsure_rate\n FROM candidate\n JOIN people ON candidate.people_id = people.people_id\n GROUP BY people.sex\n ORDER BY avg_unsure_rate DESC\n\n ", # noqa: E501 " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", # noqa: E501 "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 ] @@ -55,7 +56,7 @@ def _test_llama_lora(sql_lora_files, tp_size): " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(*) DESC LIMIT 1) ", # noqa: E501 " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 ] @@ -80,7 +81,7 @@ def test_llama_lora_1x(sql_lora_files): p = Process(target=_test_llama_lora, args=(sql_lora_files, 1)) p.start() p.join() - assert p.exitcode == 0, "Results don't match with the reference" + assert p.exitcode == 0 def test_llama_lora_2x(sql_lora_files): @@ -88,7 +89,7 @@ def test_llama_lora_2x(sql_lora_files): p = Process(target=_test_llama_lora, args=(sql_lora_files, 2)) p.start() p.join() - assert p.exitcode == 0, "Results don't match with the reference" + assert p.exitcode == 0 def test_llama_lora_4x(sql_lora_files): @@ -96,4 +97,4 @@ def test_llama_lora_4x(sql_lora_files): p = Process(target=_test_llama_lora, args=(sql_lora_files, 4)) p.start() p.join() - assert p.exitcode == 0, "Results don't match with the reference" + assert p.exitcode == 0 diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py index 80f1b8f2e0bf9..53c92cb891f47 100644 --- a/tests/lora/test_multilora_hpu.py +++ b/tests/lora/test_multilora_hpu.py @@ -112,7 +112,7 @@ def test_llama_multilora_1x(sql_lora_files): p = Process(target=_test_llama_multilora, args=(sql_lora_files, 1)) p.start() p.join() - assert p.exitcode == 0, "Results don't match with the reference" + assert p.exitcode == 0 def test_llama_multilora_2x(sql_lora_files): @@ -120,7 +120,7 @@ def test_llama_multilora_2x(sql_lora_files): p = Process(target=_test_llama_multilora, args=(sql_lora_files, 2)) p.start() p.join() - assert p.exitcode == 0, "Results don't match with the reference" + assert p.exitcode == 0 def test_llama_multilora_4x(sql_lora_files): @@ -128,4 +128,4 @@ def test_llama_multilora_4x(sql_lora_files): p = Process(target=_test_llama_multilora, args=(sql_lora_files, 4)) p.start() p.join() - assert p.exitcode == 0, "Results 
don't match with the reference" + assert p.exitcode == 0 diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6d2f4f9c3ca20..961bf88c644b9 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -444,8 +444,8 @@ def load_model(self) -> None: assert hasattr( self.model, "embedding_padding_modules" ), "Model does not have embedding_padding_modules" - if self.block_size > self.scheduler_config.max_num_batched_tokens // self.scheduler_config.max_num_seqs: - self.block_size = self.scheduler_config.max_num_batched_tokens // self.scheduler_config.max_num_seqs + if self.block_size > self.scheduler_config.max_num_batched_tokens // self.scheduler_config.max_num_seqs: # noqa: E501 + self.block_size = self.scheduler_config.max_num_batched_tokens // self.scheduler_config.max_num_seqs # noqa: E501 self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens, From dbd804f7aa1374ac878c09dfc3b594f519a21889 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Mon, 19 Aug 2024 13:26:23 +0300 Subject: [PATCH 08/10] Update test dtype to float32 ...to fix accuracy mismatch between tp_size = 1 vs tp_size > 1 --- tests/lora/test_llama_hpu.py | 8 ++++---- tests/lora/test_multilora_hpu.py | 7 +++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/lora/test_llama_hpu.py b/tests/lora/test_llama_hpu.py index ff1952109dd77..dfd551f2ca043 100644 --- a/tests/lora/test_llama_hpu.py +++ b/tests/lora/test_llama_hpu.py @@ -41,14 +41,14 @@ def _test_llama_lora(sql_lora_files, tp_size): enable_lora=True, max_num_seqs=16, max_loras=4, - dtype='bfloat16', + dtype='float32', tensor_parallel_size=tp_size) expected_no_lora_output = [ "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 - "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kamu/ [kamɯ́]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kamu/ [kamɯ́]? 
[/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio.\n\n answer:\n\n SELECT people.sex, AVG(unsure_rate) AS avg_unsure_rate\n FROM candidate\n JOIN people ON candidate.people_id = people.people_id\n GROUP BY people.sex\n ORDER BY avg_unsure_rate DESC\n\n ", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", # noqa: E501 "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 ] @@ -56,7 +56,7 @@ def _test_llama_lora(sql_lora_files, tp_size): " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(*) DESC LIMIT 1) ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 ] diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py index 53c92cb891f47..edca64fd5a2ae 100644 --- a/tests/lora/test_multilora_hpu.py +++ b/tests/lora/test_multilora_hpu.py @@ -80,14 +80,13 @@ def process_requests(engine: LLMEngine, return result -# References from GPU with dtype=bfloat16 expected_output = [ " or, through inaction, allow a human being to come to harm.\nA robot must obey the orders given it by human beings except where such orders would conflict with the First Law.\nA robot must protect its own existence as long as such protection does not conflict with the First or Second Law.\nThe Three Laws of Robotics were created by Isaac Asimov in 1942. They are the foundation of robotics and artificial intelligence.\nThe Three Laws of Robotics are the foundation of robotics and artificial intelligence. They were created by Isaac Asimov in 194", # noqa: E501 " that is the question.\nIt is the most famous line in all of Shakespeare's plays and one of the most famous in English literature. The question is not whether or not to be, but rather the question of who to be.\nIn Hamlet's case, the question is whether or not to be a good person. He is torn between the goodness of his father and the evil of his mother.\nThe question is a difficult one, and one that has been asked many times before. 
In Hamlet's case, the question is whether or not to be a good person, and he is torn between the", # noqa: E501 " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 - " SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 - " SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' " # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' " # noqa: E501 ] @@ -98,7 +97,7 @@ def _test_llama_multilora(sql_lora_files, tp_size): max_loras=2, max_lora_rank=8, max_num_seqs=16, - dtype='bfloat16', + dtype='float32', tensor_parallel_size=tp_size) engine = LLMEngine.from_engine_args(engine_args) test_prompts = create_test_prompts(sql_lora_files) From 557a23e388158423b3a36b2a79398a57ef4c9d1e Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Mon, 19 Aug 2024 14:31:21 +0300 Subject: [PATCH 09/10] Make max seq number compliant with max_num_batched_token --- vllm/worker/habana_model_runner.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 961bf88c644b9..ce7a0ad8dd1fc 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -444,8 +444,6 @@ def load_model(self) -> None: assert hasattr( self.model, "embedding_padding_modules" ), "Model does not have embedding_padding_modules" - if self.block_size > self.scheduler_config.max_num_batched_tokens // self.scheduler_config.max_num_seqs: # noqa: E501 - self.block_size = self.scheduler_config.max_num_batched_tokens // self.scheduler_config.max_num_seqs # noqa: E501 self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens, @@ -495,13 +493,17 @@ def _is_valid_bucket(self, bucket): return bucket[0] * bucket[1] <= self.max_num_batched_tokens def _setup_buckets(self) -> None: + max_bucket_cfg = 64 + if self.lora_config and \ + max_bucket_cfg > self.max_num_batched_tokens // self.block_size: + max_bucket_cfg = self.max_num_batched_tokens // self.block_size self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', min=1, step=32, max=min( self.max_num_seqs, - 64)) + max_bucket_cfg)) self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', min=1, From 78436a602c9d6bb0239eff9285235211b9fe8045 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Tue, 20 Aug 2024 09:17:55 +0300 Subject: [PATCH 10/10] Move HPU specific LoRA ops to vllm.hpu.ops module --- vllm/hpu/ops.py | 75 +++++++++++++++++++++++++ vllm/lora/layers.py | 89 +++--------------------------- vllm/worker/habana_model_runner.py | 22 ++++---- 3 files changed, 96 insertions(+), 90 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 2af5634a8d1a6..662c53486b4ca 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -191,3 +191,78 @@ def prompt_attention( valid_seq_lengths, 'right') attn_weights = attn_weights.transpose(1, 2) return attn_weights + + +def dispatch_bgmv_linear( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indices: torch.LongTensor, + layer_idx: int, + scale: float, +): + """ + `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices + stacked into single tensors, assuming same 
rank. HPU handles no-LoRA + requests using zero valued A and B tensors. These zero valued tensors are + appended at the end of `wa_t_all` and `wb_t_all` during initialization. For + custom BGMV, the corresponding `wa` and `wb` for each batch is created + based on the lora_index of each sample. + + For example: + `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, + hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles + no-LoRA case. The `wa` tensor for a batch of size batch_Size will have + a shape of (batch_size, num_layers, hidden_dim, lora_rank) + + This method avoids for-loop as well as graph breaks. + """ + assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' + max_loras = wa_t_all.size(0) + # Wrap-around for negative indices + indices = indices % max_loras + wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + + x = x.unsqueeze(1) + out = x @ wa + out = out @ wb + out = out.squeeze(1) + y += out * scale + + +def dispatch_bgmv_embedding( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + indices: torch.LongTensor, + layer_idx: int, + scale: float, +): + """ + `wa_t_all` contains all LoRA A weight matrices stacked into a single tensor + assuming same rank. HPU handles no-LoRA requests using zero valued A + tensor. This zero valued tensor is appended at the end of `wa_t_all` during + initialization. For custom BGMV, the corresponding wa for each batch is + created based on the lora_index of the sample. + + For example: + `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, + hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles + no-LoRA case. The wa tensor for a batch of size batch_Size will have a + shape of (batch_size, num_layers, lora_rank, hidden_dim) + + + This method avoids for-loop as well as graph breaks. + """ + assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' + max_loras = wa_t_all.size(0) + # Wrap-around for negative indices + indices = indices % max_loras + wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + + x = x.unsqueeze(1) + out = x @ wa + out = out.squeeze(1) + y += out * scale \ No newline at end of file diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 912cd0b47202a..4a45f3fda88f1 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -29,6 +29,9 @@ VocabParallelEmbedding) from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu.ops import dispatch_bgmv_embedding, dispatch_bgmv_linear + if TYPE_CHECKING: pass @@ -64,81 +67,6 @@ def dec(*args, **kwargs): return dec -def custom_bgmv( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indices: torch.LongTensor, - layer_idx: int, - scale: float, -): - """ - `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices - stacked into single tensors, assuming same rank. HPU handles no-LoRA - requests using zero valued A and B tensors. These zero valued tensors are - appended at the end of `wa_t_all` and `wb_t_all` during initialization. For - custom BGMV, the corresponding `wa` and `wb` for each batch is created - based on the lora_index of each sample. - - For example: - `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, - hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles - no-LoRA case. 
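As a brief illustration of the zero-valued slot mentioned here and reflected in the earlier `capacity`/`lora_slots` "+ 1" changes (made-up sizes; a sketch of the convention, not the production buffers): the stacks hold one extra all-zero entry, and a request without an adapter carries index -1, which the wrap-around maps onto that entry so its LoRA delta is exactly zero.

import torch

max_loras, rank, hidden = 2, 4, 8
# One slot beyond max_loras; it is never written, so it stays zero.
wa_t_all = torch.zeros(max_loras + 1, 1, rank, hidden)
wa_t_all[0, 0].normal_()
wa_t_all[1, 0].normal_()

indices = torch.tensor([1, -1, 0])      # -1 marks a request without an adapter
indices = indices % wa_t_all.size(0)    # tensor([1, 2, 0]): -1 wraps to the zero slot
wa = wa_t_all.index_select(0, indices)[:, 0]
print(wa[1].abs().max())                # tensor(0.) -> zero contribution for that sample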
The `wa` tensor for a batch of size batch_Size will have - a shape of (batch_size, num_layers, hidden_dim, lora_rank) - - This method avoids for-loop as well as graph breaks. - """ - assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' - max_loras = wa_t_all.size(0) - # Wrap-around for negative indices - indices = indices % max_loras - wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) - wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) - - x = x.unsqueeze(1) - out = x @ wa - out = out @ wb - out = out.squeeze(1) - y += out * scale - - -def custom_bgmv_embed( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - indices: torch.LongTensor, - layer_idx: int, - scale: float, -): - """ - `wa_t_all` contains all LoRA A weight matrices stacked into a single tensor - assuming same rank. HPU handles no-LoRA requests using zero valued A - tensor. This zero valued tensor is appended at the end of `wa_t_all` during - initialization. For custom BGMV, the corresponding wa for each batch is - created based on the lora_index of the sample. - - For example: - `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, - hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles - no-LoRA case. The wa tensor for a batch of size batch_Size will have a - shape of (batch_size, num_layers, lora_rank, hidden_dim) - - - This method avoids for-loop as well as graph breaks. - """ - assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' - max_loras = wa_t_all.size(0) - # Wrap-around for negative indices - indices = indices % max_loras - wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) - - x = x.unsqueeze(1) - out = x @ wa - out = out.squeeze(1) - y += out * scale - - def _apply_lora( x: torch.Tensor, lora_a_stacked: torch.Tensor, @@ -166,7 +94,8 @@ def _apply_lora( output = output.view(-1, output.shape[-1]) indices = indices.view(-1) if is_hpu(): - custom_bgmv(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) + dispatch_bgmv_linear(output, x, lora_a_stacked, lora_b_stacked, + indices, 0, 1.0) else: add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) return output.view_as(org_output) @@ -207,7 +136,7 @@ def _apply_lora_packed_nslice( offset_left = 0 for slice_idx in range(len(output_slices)): if is_hpu(): - custom_bgmv( + dispatch_bgmv_linear( output[:, offset_left:offset_left + output_slices[slice_idx]], x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], indices, 0, 1.0) @@ -416,9 +345,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[0] * full_lora_a_embeddings.shape[1], -1) if is_hpu(): - custom_bgmv_embed(full_output, full_lora_a_embeddings, - self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + dispatch_bgmv_embedding(full_output, full_lora_a_embeddings, + self.lora_b_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) else: bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked, self.indices[:self.indices_len[0]], 0, 1.0) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index ce7a0ad8dd1fc..d129bb5cbc0ca 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1556,16 +1556,18 @@ def execute_model( selected_token_indices=sampling_metadata.selected_token_indices ) - from vllm.lora.layers import VocabParallelEmbeddingWithLoRA - property = vars(self.model.model) - model = list(property['_modules'].values())[0] - 
property = vars(model) - modules = list(property['_modules'].values()) - for module in modules: - if isinstance(module, VocabParallelEmbeddingWithLoRA): - for i in range(0, 4): - module.indices_len[ - i] = sampling_metadata.selected_token_indices.numel() + if self.lora_config: + from vllm.lora.layers import VocabParallelEmbeddingWithLoRA + property = vars(self.model.model) + model = list(property['_modules'].values())[0] + property = vars(model) + modules = list(property['_modules'].values()) + for module in modules: + if isinstance(module, VocabParallelEmbeddingWithLoRA): + for i in range(0, 4): + module.indices_len[ + i] = sampling_metadata.selected_token_indices.numel( + ) # Compute the logits. with self.profiler.record_event(