From e528d064d73a0a56fa3ff19585b309fb3466d726 Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Wed, 8 Nov 2023 05:59:56 +0200 Subject: [PATCH 01/43] Porting vllm to HPU --- benchmarks/benchmark_throughput.py | 23 ++++++-- requirements.txt | 4 +- setup.py | 74 ++++++++++++------------- vllm/__init__.py | 1 + vllm/core/scheduler.py | 2 + vllm/entrypoints/llm.py | 25 ++++++++- vllm/model_executor/layers/attention.py | 4 +- vllm/model_executor/models/bloom.py | 4 +- vllm/worker/cache_engine.py | 3 +- 9 files changed, 88 insertions(+), 52 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 3aac479c01bd2..ab3f2944b1e5f 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -35,11 +35,16 @@ def sample_requests( completions = [completion for _, completion in dataset] completion_token_ids = tokenizer(completions).input_ids tokenized_dataset = [] + count = 0 for i in range(len(dataset)): + count += 1 + i = i % 10 output_len = len(completion_token_ids[i]) if fixed_output_len is not None: output_len = fixed_output_len tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) + if count == num_requests: + break # Filter out too long sequences. filtered_dataset: List[Tuple[str, int, int]] = [] @@ -53,9 +58,10 @@ def sample_requests( continue filtered_dataset.append((prompt, prompt_len, output_len)) - # Sample the requests. - sampled_requests = random.sample(filtered_dataset, num_requests) - return sampled_requests + # # Sample the requests. + # sampled_requests = random.sample(filtered_dataset, num_requests) + # return sampled_requests + return filtered_dataset def run_vllm( @@ -71,6 +77,7 @@ def run_vllm( dtype: str, max_model_len: Optional[int], enforce_eager: bool, + profiling: bool = False, # For Gaudi2 ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -83,6 +90,10 @@ def run_vllm( dtype=dtype, max_model_len=max_model_len, enforce_eager=enforce_eager, + max_num_batched_tokens=(16 * 512), + max_num_seqs=256, + max_paddings=(16 * 512), + block_size=16, ) # Add the requests to the engine. @@ -104,7 +115,7 @@ def run_vllm( start = time.perf_counter() # FIXME(woosuk): Do not use internal method. - llm._run_engine(use_tqdm=True) + llm._run_engine(use_tqdm=True, profiling=profiling) end = time.perf_counter() return end - start @@ -206,7 +217,8 @@ def main(args: argparse.Namespace): args.quantization, args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, - args.max_model_len, args.enforce_eager) + args.max_model_len, args.enforce_eager, + args.profiling) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -284,6 +296,7 @@ def main(args: argparse.Namespace): parser.add_argument("--enforce-eager", action="store_true", help="enforce eager execution") + parser.add_argument("--profiling", action='store_true', help='Profiling first 4 steps') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/requirements.txt b/requirements.txt index 92ba0a716c45c..73a64a94391f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,9 +5,9 @@ pandas # Required for Ray data. pyarrow # Required for Ray data. sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.1.2 +#torch == 2.1.2 transformers >= 4.36.0 # Required for Mixtral. -xformers == 0.0.23.post1 # Required for CUDA 12.1. +#xformers == 0.0.23.post1 # Required for CUDA 12.1. 
fastapi uvicorn[standard] pydantic == 1.10.13 # Required for OpenAI server. diff --git a/setup.py b/setup.py index 45a18776798fb..da56a61fc0278 100644 --- a/setup.py +++ b/setup.py @@ -28,10 +28,10 @@ def _is_cuda() -> bool: return torch.version.cuda is not None -# Compiler flags. -CXX_FLAGS = ["-g", "-O2", "-std=c++17"] -# TODO(woosuk): Should we use -O3? -NVCC_FLAGS = ["-O2", "-std=c++17"] +# # Compiler flags. +# CXX_FLAGS = ["-g", "-O2", "-std=c++17"] +# # TODO(woosuk): Should we use -O3? +# NVCC_FLAGS = ["-O2", "-std=c++17"] if _is_hip(): if ROCM_HOME is None: @@ -210,32 +210,33 @@ def get_torch_arch_list() -> Set[str]: f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" f"amdgpu_arch_found: {amd_arch}") -ext_modules = [] - -vllm_extension_sources = [ - "csrc/cache_kernels.cu", - "csrc/attention/attention_kernels.cu", - "csrc/pos_encoding_kernels.cu", - "csrc/activation_kernels.cu", - "csrc/layernorm_kernels.cu", - "csrc/quantization/squeezellm/quant_cuda_kernel.cu", - "csrc/cuda_utils_kernels.cu", - "csrc/pybind.cpp", -] - -if _is_cuda(): - vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") - vllm_extension_sources.append("csrc/quantization/gptq/q_gemm.cu") - -vllm_extension = CUDAExtension( - name="vllm._C", - sources=vllm_extension_sources, - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS, - }, -) -ext_modules.append(vllm_extension) +# ext_modules = [] + +if _is_cuda() or _is_hip(): + vllm_extension_sources = [ + "csrc/cache_kernels.cu", + "csrc/attention/attention_kernels.cu", + "csrc/pos_encoding_kernels.cu", + "csrc/activation_kernels.cu", + "csrc/layernorm_kernels.cu", + "csrc/quantization/squeezellm/quant_cuda_kernel.cu", + "csrc/cuda_utils_kernels.cu", + "csrc/pybind.cpp", + ] + + if _is_cuda(): + vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") + vllm_extension_sources.append("csrc/quantization/gptq/q_gemm.cu") + + vllm_extension = CUDAExtension( + name="vllm._C", + sources=vllm_extension_sources, + extra_compile_args={ + "cxx": CXX_FLAGS, + "nvcc": NVCC_FLAGS, + }, + ) + ext_modules.append(vllm_extension) def get_path(*filepath) -> str: @@ -274,12 +275,8 @@ def get_vllm_version() -> str: def read_readme() -> str: - """Read the README file if present.""" - p = get_path("README.md") - if os.path.isfile(p): - return io.open(get_path("README.md"), "r", encoding="utf-8").read() - else: - return "" + """Read the README file.""" + return io.open(get_path("README.md"), "r", encoding="utf-8").read() def get_requirements() -> List[str]: @@ -319,7 +316,6 @@ def get_requirements() -> List[str]: "examples", "tests")), python_requires=">=3.8", install_requires=get_requirements(), - ext_modules=ext_modules, - cmdclass={"build_ext": BuildExtension}, - package_data={"vllm": ["py.typed"]}, + # ext_modules=ext_modules, + # cmdclass={"build_ext": BuildExtension}, ) diff --git a/vllm/__init__.py b/vllm/__init__.py index e5cd1c2f3334b..9f25f62bd2c1a 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,4 +1,5 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" +import habana_frameworks.torch.gpu_migration from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index ca28bbdc2fb95..1fc07fe85dc0b 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -137,6 +137,8 @@ def _schedule(self) -> SchedulerOutputs: # sequence groups are added to the front and the new 
sequence groups # are added to the back. while self.waiting: + if len(scheduled) == 10: + break seq_group = self.waiting[0] assert seq_group.num_seqs() == 1, ( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0700298b03a3d..1df7b1a80b1b2 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -9,6 +9,8 @@ from vllm.sampling_params import SamplingParams from vllm.utils import Counter +import torch +import habana_frameworks.torch as htorch class LLM: """An LLM for generating texts from given prompts and sampling parameters. @@ -174,20 +176,41 @@ def _add_request( self.llm_engine.add_request(request_id, prompt, sampling_params, prompt_token_ids) - def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: + def _run_engine(self, use_tqdm: bool, profiling: bool = False) -> List[RequestOutput]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() pbar = tqdm(total=num_requests, desc="Processed prompts") + if profiling: + prof = torch.profiler.profile( + schedule = torch.profiler.schedule(wait=0, warmup=0, active=4, repeat=1), + activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU], + with_stack = True, + record_shapes = False, + on_trace_ready = torch.profiler.tensorboard_trace_handler("./", use_gzip = True) + ) + prof.start() + count = 0 + # Run the engine. outputs: List[RequestOutput] = [] while self.llm_engine.has_unfinished_requests(): step_outputs = self.llm_engine.step() + if profiling: + count += 1 + if count == 4: + break for output in step_outputs: if output.finished: outputs.append(output) if use_tqdm: pbar.update(1) + if profiling: + htorch.core.mark_step() + htorch.hpu.synchronize() + prof.step() + if profiling: + prof.stop() if use_tqdm: pbar.close() # Sort the outputs by request ID. diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 6482875d1c55b..37d30bd1e3790 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -4,8 +4,8 @@ import torch import torch.nn as nn from xformers import ops as xops -from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, - LowerTriangularMaskWithTensorBias) +# from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, +# LowerTriangularMaskWithTensorBias) from vllm._C import ops from vllm._C import cache_ops diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 6d1aeeed78e93..3a557288c3dd1 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -109,8 +109,8 @@ def __init__( scaling = self.head_dim**-0.5 self.attn = PagedAttention(self.num_heads, self.head_dim, - scaling, - alibi_slopes=alibi_slopes) + scaling)#, + #alibi_slopes=alibi_slopes) def forward( self, diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 1dd0243f8f3a3..4c8a00c15126c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -89,7 +89,8 @@ def allocate_cpu_cache(self) -> List[KVCache]: cpu_cache: List[KVCache] = [] key_block_shape = self.get_key_block_shape() value_block_shape = self.get_value_block_shape() - pin_memory = not in_wsl() + # pin_memory = not in_wsl() + pin_memory = not in_wsl() and not torch.hpu.is_available() if not pin_memory: # Pinning memory in WSL is not supported. 
# https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications From d8da01f8625e37054aaebe63147f75adeda1efed Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Wed, 8 Nov 2023 08:04:22 +0200 Subject: [PATCH 02/43] add hpu cache allocate --- vllm/worker/cache_engine.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 4c8a00c15126c..d2f5d53de4938 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -41,7 +41,8 @@ def __init__( self.num_cpu_blocks = cache_config.num_cpu_blocks # Initialize the cache. - self.gpu_cache = self.allocate_gpu_cache() + # self.gpu_cache = self.allocate_gpu_cache() + self.gpu_cache = self.allocate_hpu_cache() self.cpu_cache = self.allocate_cpu_cache() # Initialize the stream for caching operations. @@ -67,6 +68,29 @@ def get_value_block_shape(self) -> Tuple[int, int, int]: self.block_size, ) + def allocate_hpu_cache(self) -> List[KVCache]: + hpu_cache: List[KVCache] = [] + kv_block_shape = ( + self.num_heads, + self.head_size, + self.block_size) + for _ in range(self.num_layers): + key_blocks = [] + value_blocks = [] + for _ in range(self.num_gpu_blocks): + key_blocks.append(torch.empty( + size=kv_block_shape, + dtype=self.dtype, + device="hpu", + )) + value_blocks.append(torch.empty( + size=kv_block_shape, + dtype=self.dtype, + device="hpu", + )) + hpu_cache.append((key_blocks, value_blocks)) + return hpu_cache + def allocate_gpu_cache(self) -> List[KVCache]: gpu_cache: List[KVCache] = [] key_block_shape = self.get_key_block_shape() From 4d1538faf369125752cd65effd6ddbed91442bdd Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Wed, 8 Nov 2023 09:25:17 +0200 Subject: [PATCH 03/43] move slot_mapping to cpu and add is_prompt in cache_ops.reshape_and_cache --- vllm/worker/worker.py | 182 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 8698b15721507..68e4fa99b79f8 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -125,6 +125,188 @@ def warm_up_model(self) -> None: # the model initialization and profiling. set_random_seed(self.model_config.seed) + def _prepare_inputs( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata]: + seq_groups: List[Tuple[List[int], SamplingParams]] = [] + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + selected_token_indices: List[int] = [] + selected_token_start_idx = 0 + categorized_sample_indices = {t: [] for t in SamplingType} + categorized_sample_indices_start_idx = 0 + + # Add prompt tokens. + prompt_lens: List[int] = [] + for seq_group_metadata in seq_group_metadata_list: + if not seq_group_metadata.is_prompt: + continue + + seq_ids = list(seq_group_metadata.seq_data.keys()) + sampling_params = seq_group_metadata.sampling_params + seq_groups.append((seq_ids, sampling_params)) + + # Use any sequence in the group. 
+ seq_id = seq_ids[0] + + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_tokens = seq_data.get_token_ids() + prompt_len = len(prompt_tokens) + prompt_lens.append(prompt_len) + + if sampling_params.prompt_logprobs is not None: + # NOTE: prompt token positions do not need sample, skip + categorized_sample_indices_start_idx += prompt_len - 1 + + categorized_sample_indices[sampling_params.sampling_type].append( + categorized_sample_indices_start_idx) + categorized_sample_indices_start_idx += 1 + + input_tokens.append(prompt_tokens) + # NOTE(woosuk): Here we assume that the first token in the prompt + # is always the first token in the sequence. + input_positions.append(list(range(prompt_len))) + + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.append([0] * prompt_len) + continue + + # Compute the slot mapping. + slot_mapping.append([]) + block_table = seq_group_metadata.block_tables[seq_id] + for i in range(prompt_len): + block_number = block_table[i // self.block_size] + block_offset = i % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping[-1].append(slot) + + # Add generation tokens. + max_context_len = 0 + max_num_blocks_per_seq = 0 + context_lens: List[int] = [] + generation_block_tables: List[List[int]] = [] + max_seq_len = max(prompt_lens) if prompt_lens else 1 + for seq_group_metadata in seq_group_metadata_list: + if seq_group_metadata.is_prompt: + # We need to do this in this loop as we need to know max_seq_len + assert len( + seq_ids) == 1, "Prompt input should have only one seq." + sampling_params = seq_group_metadata.sampling_params + if sampling_params.prompt_logprobs is not None: + selected_token_indices.extend( + range(selected_token_start_idx, + selected_token_start_idx + prompt_len - 1)) + selected_token_indices.append(selected_token_start_idx + + prompt_len - 1) + selected_token_start_idx += max_seq_len + continue + + seq_ids = list(seq_group_metadata.seq_data.keys()) + sampling_params = seq_group_metadata.sampling_params + seq_groups.append((seq_ids, sampling_params)) + + num_seqs = len(seq_ids) + selected_token_indices.extend( + range(selected_token_start_idx, + selected_token_start_idx + num_seqs)) + selected_token_start_idx += num_seqs + + categorized_sample_indices[sampling_params.sampling_type].extend( + range(categorized_sample_indices_start_idx, + categorized_sample_indices_start_idx + num_seqs)) + categorized_sample_indices_start_idx += num_seqs + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.append([generation_token]) + + context_len = seq_data.get_len() + position = context_len - 1 + if self.sliding_window is not None: + context_len = min(context_len, self.sliding_window) + input_positions.append([position]) + + block_table = seq_group_metadata.block_tables[seq_id] + + max_context_len = max(max_context_len, context_len) + max_num_blocks_per_seq = max(max_num_blocks_per_seq, + len(block_table)) + context_lens.append(context_len) + + block_number = block_table[position // self.block_size] + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.append([slot]) + + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + 
generation_block_tables.append(block_table) + + padded_input_tokens = [ + _pad_to_max(tokens, max_seq_len, pad=0) for tokens in input_tokens + ] + padded_input_positions = [ + _pad_to_max(positions, max_seq_len, pad=0) + for positions in input_positions + ] + padded_slot_mapping = [ + _pad_to_max(mapping, max_seq_len, pad=-1) + for mapping in slot_mapping + ] + padded_block_tables = [ + _pad_to_max(block_table, max_num_blocks_per_seq, pad=0) + for block_table in generation_block_tables + ] + + # Convert to tensors. + tokens_tensor = torch.tensor(padded_input_tokens, + dtype=torch.long, + device="cuda") + positions_tensor = torch.tensor(padded_input_positions, + dtype=torch.long, + device="cuda") + slot_mapping_tensor = torch.tensor(padded_slot_mapping, + dtype=torch.long, + device="cpu") + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.int, + device="cuda") + selected_token_indices = torch.tensor(selected_token_indices, + dtype=torch.long, + device="cuda") + categorized_sample_indices = { + t: torch.tensor(seq_ids, dtype=torch.int, device="cuda") + for t, seq_ids in categorized_sample_indices.items() + } + block_tables_tensor = torch.tensor(padded_block_tables, + dtype=torch.int, + device="cuda") + + seq_data: Dict[int, SequenceData] = {} + for seq_group_metadata in seq_group_metadata_list: + seq_data.update(seq_group_metadata.seq_data) + + input_metadata = InputMetadata( + seq_groups=seq_groups, + seq_data=seq_data, + prompt_lens=prompt_lens, + slot_mapping=slot_mapping_tensor, + context_lens=context_lens_tensor, + max_context_len=max_context_len, + block_tables=block_tables_tensor, + selected_token_indices=selected_token_indices, + categorized_sample_indices=categorized_sample_indices, + sliding_window=self.sliding_window, + ) + return tokens_tensor, positions_tensor, input_metadata + @torch.inference_mode() def execute_model( self, From c3368243df6d4a5bb152413d2315406765603c3a Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Wed, 8 Nov 2023 10:39:24 +0200 Subject: [PATCH 04/43] add bucket to input metadata --- vllm/worker/worker.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 68e4fa99b79f8..7faf379ceca2a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -249,6 +249,16 @@ def _prepare_inputs( block_table = block_table[-sliding_window_blocks:] generation_block_tables.append(block_table) + def round_up(n, multiple): + print(n, multiple) + return (n + multiple - 1) // multiple * multiple + + if self.block_size is not None: + if max_seq_len != 1: + max_seq_len = round_up(max_seq_len, self.block_size) + if max_num_blocks_per_seq != 0: + max_num_blocks_per_seq = round_up(max_num_blocks_per_seq, self.block_size) + padded_input_tokens = [ _pad_to_max(tokens, max_seq_len, pad=0) for tokens in input_tokens ] From 068c7484426804f606f62154bcff9b8f9fba9e97 Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Fri, 10 Nov 2023 06:14:25 +0200 Subject: [PATCH 05/43] 1. limit max block number for lazy mode (TODO) 2. 
set some inpu metadata from cuda to cpu --- vllm/engine/llm_engine.py | 4 ++-- vllm/worker/worker.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d6e388bf135b2..6ad70936dfd32 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -217,7 +217,7 @@ def _init_cache(self) -> None: # Since we use a shared centralized controller, we take the minimum # number of blocks across all workers to make sure all the memory # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) + num_gpu_blocks = min(10500, min(b[0] for b in num_blocks)) num_cpu_blocks = min(b[1] for b in num_blocks) # FIXME(woosuk): Change to debug log. logger.info(f"# GPU blocks: {num_gpu_blocks}, " @@ -425,7 +425,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, child_seqs.append((parent, parent)) for seq, _ in child_seqs: - self._decode_sequence(seq, seq_group.sampling_params) + # self._decode_sequence(seq, seq_group.sampling_params) self._check_stop(seq, seq_group.sampling_params) # Non-beam search case diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7faf379ceca2a..c948a6701e0c2 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -250,7 +250,6 @@ def _prepare_inputs( generation_block_tables.append(block_table) def round_up(n, multiple): - print(n, multiple) return (n + multiple - 1) // multiple * multiple if self.block_size is not None: @@ -287,7 +286,7 @@ def round_up(n, multiple): device="cpu") context_lens_tensor = torch.tensor(context_lens, dtype=torch.int, - device="cuda") + device="cpu") selected_token_indices = torch.tensor(selected_token_indices, dtype=torch.long, device="cuda") @@ -297,7 +296,7 @@ def round_up(n, multiple): } block_tables_tensor = torch.tensor(padded_block_tables, dtype=torch.int, - device="cuda") + device="cpu") seq_data: Dict[int, SequenceData] = {} for seq_group_metadata in seq_group_metadata_list: From 9a042f7a6be3fd48af20165c9288442f34b9d5ca Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Fri, 10 Nov 2023 15:02:23 +0200 Subject: [PATCH 06/43] remove bucket for block tables --- vllm/worker/worker.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index c948a6701e0c2..6068311307f7b 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -255,8 +255,6 @@ def round_up(n, multiple): if self.block_size is not None: if max_seq_len != 1: max_seq_len = round_up(max_seq_len, self.block_size) - if max_num_blocks_per_seq != 0: - max_num_blocks_per_seq = round_up(max_num_blocks_per_seq, self.block_size) padded_input_tokens = [ _pad_to_max(tokens, max_seq_len, pad=0) for tokens in input_tokens From 1e7e16d54360885dd2ba8bcb73248f3c628ee1c8 Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Sat, 11 Nov 2023 11:56:08 +0200 Subject: [PATCH 07/43] add run bash script and change benchmark config --- benchmarks/benchmark_throughput.py | 10 +++++----- benchmarks/run_benchmark_bloom560m.sh | 1 + vllm/core/scheduler.py | 2 +- vllm/worker/worker.py | 13 +++++++++++++ 4 files changed, 20 insertions(+), 6 deletions(-) create mode 100755 benchmarks/run_benchmark_bloom560m.sh diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index ab3f2944b1e5f..43b368f020471 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -38,7 +38,7 @@ def sample_requests( count = 0 for i in range(len(dataset)): count += 1 - i = i 
% 10 + i = i % 4 output_len = len(completion_token_ids[i]) if fixed_output_len is not None: output_len = fixed_output_len @@ -90,10 +90,10 @@ def run_vllm( dtype=dtype, max_model_len=max_model_len, enforce_eager=enforce_eager, - max_num_batched_tokens=(16 * 512), - max_num_seqs=256, - max_paddings=(16 * 512), - block_size=16, + max_num_batched_tokens=(16 * 128), + max_num_seqs=20, + max_paddings=(16 * 128), + block_size=32, ) # Add the requests to the engine. diff --git a/benchmarks/run_benchmark_bloom560m.sh b/benchmarks/run_benchmark_bloom560m.sh new file mode 100755 index 0000000000000..404860a95372d --- /dev/null +++ b/benchmarks/run_benchmark_bloom560m.sh @@ -0,0 +1 @@ +python benchmark_throughput.py --tokenizer bigscience/bloom-560m --model bigscience/bloom-560m --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100 diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 1fc07fe85dc0b..e13da6f88580a 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -137,7 +137,7 @@ def _schedule(self) -> SchedulerOutputs: # sequence groups are added to the front and the new sequence groups # are added to the back. while self.waiting: - if len(scheduled) == 10: + if len(scheduled) == 4: break seq_group = self.waiting[0] diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 6068311307f7b..16672086e5bae 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -312,6 +312,19 @@ def round_up(n, multiple): categorized_sample_indices=categorized_sample_indices, sliding_window=self.sliding_window, ) + + # Create attention mask + attn_masks = [ + torch.zeros((len(input_tokens), self.block_size), dtype=torch.int64) for _ in range(max_num_blocks_per_seq)] + for i in range(0, max_num_blocks_per_seq): + for seq_id in range(len(input_tokens)): + if (i * self.block_size) < context_lens[seq_id] and (i + 1) * self.block_size > context_lens[seq_id]: + attn_masks[i][seq_id, :context_lens[seq_id] % self.block_size] = 1 + elif (i+1) * self.block_size <= context_lens[seq_id]: + attn_masks[i][seq_id, :] = 1 + attn_masks[i] = attn_masks[i].to(device="cuda", non_blocking=True) + input_metadata.attention_masks = attn_masks + print("input token shape: ", tokens_tensor.shape) return tokens_tensor, positions_tensor, input_metadata @torch.inference_mode() From 153eb716f81d094437069e1a75f7a9699412cf6d Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Tue, 14 Nov 2023 03:34:05 +0200 Subject: [PATCH 08/43] 1. modify kv cache structure to tensors 2. 
update hpu paged attention API (for hpu graph compatibility) --- vllm/worker/cache_engine.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index d2f5d53de4938..d7b4df272523d 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -77,17 +77,16 @@ def allocate_hpu_cache(self) -> List[KVCache]: for _ in range(self.num_layers): key_blocks = [] value_blocks = [] - for _ in range(self.num_gpu_blocks): - key_blocks.append(torch.empty( - size=kv_block_shape, - dtype=self.dtype, - device="hpu", - )) - value_blocks.append(torch.empty( - size=kv_block_shape, - dtype=self.dtype, - device="hpu", - )) + key_blocks = torch.empty( + size=(self.num_gpu_blocks, *kv_block_shape), + dtype=self.dtype, + device="hpu", + ) + value_blocks = torch.empty( + size=(self.num_gpu_blocks, *kv_block_shape), + dtype=self.dtype, + device="hpu", + ) hpu_cache.append((key_blocks, value_blocks)) return hpu_cache From 9b7e0a71a7965e790d9be692fd9dfe09fa11c9c6 Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Thu, 16 Nov 2023 07:04:08 +0200 Subject: [PATCH 09/43] add attention mask for generation --- vllm/worker/worker.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 16672086e5bae..7e6a7769bd4a7 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -314,16 +314,15 @@ def round_up(n, multiple): ) # Create attention mask - attn_masks = [ - torch.zeros((len(input_tokens), self.block_size), dtype=torch.int64) for _ in range(max_num_blocks_per_seq)] - for i in range(0, max_num_blocks_per_seq): - for seq_id in range(len(input_tokens)): - if (i * self.block_size) < context_lens[seq_id] and (i + 1) * self.block_size > context_lens[seq_id]: - attn_masks[i][seq_id, :context_lens[seq_id] % self.block_size] = 1 - elif (i+1) * self.block_size <= context_lens[seq_id]: - attn_masks[i][seq_id, :] = 1 - attn_masks[i] = attn_masks[i].to(device="cuda", non_blocking=True) - input_metadata.attention_masks = attn_masks + if max_num_blocks_per_seq != 0: + attn_masks = torch.zeros((max_num_blocks_per_seq, len(input_tokens), self.block_size), dtype=torch.int64) + for i in range(0, max_num_blocks_per_seq): + for seq_id in range(len(input_tokens)): + if (i * self.block_size) < context_lens[seq_id] and (i + 1) * self.block_size > context_lens[seq_id]: + attn_masks[i][seq_id, :context_lens[seq_id] % self.block_size] = 1 + elif (i + 1) * self.block_size <= context_lens[seq_id]: + attn_masks[i][seq_id, :] = 1 + input_metadata.attention_masks = attn_masks.to(device="cuda") print("input token shape: ", tokens_tensor.shape) return tokens_tensor, positions_tensor, input_metadata From c99eefc1d22415e00f0c0bd7af4181cd9942117d Mon Sep 17 00:00:00 2001 From: Jinyan Chen Date: Sun, 19 Nov 2023 02:32:48 -0800 Subject: [PATCH 10/43] add multi_query_kv_attention attn_bias --- vllm/model_executor/layers/attention.py | 4 ++-- vllm/worker/worker.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 37d30bd1e3790..6482875d1c55b 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -4,8 +4,8 @@ import torch import torch.nn as nn from xformers import ops as xops -# from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, -# LowerTriangularMaskWithTensorBias) +from 
xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, + LowerTriangularMaskWithTensorBias) from vllm._C import ops from vllm._C import cache_ops diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7e6a7769bd4a7..d901035b1b2e8 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -323,6 +323,8 @@ def round_up(n, multiple): elif (i + 1) * self.block_size <= context_lens[seq_id]: attn_masks[i][seq_id, :] = 1 input_metadata.attention_masks = attn_masks.to(device="cuda") + # import pdb + # pdb.set_trace() print("input token shape: ", tokens_tensor.shape) return tokens_tensor, positions_tensor, input_metadata From 1327be851d4da3cd0a2e2d9ce8bd8b2c7f47246f Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Fri, 8 Dec 2023 10:05:21 +0000 Subject: [PATCH 11/43] Temp commit --- vllm/entrypoints/llm.py | 8 +- vllm/model_executor/layers/attention.py | 6 + .../model_executor/layers/rotary_embedding.py | 112 ++++++++++++++++++ 3 files changed, 123 insertions(+), 3 deletions(-) mode change 100644 => 100755 vllm/entrypoints/llm.py mode change 100644 => 100755 vllm/model_executor/layers/attention.py mode change 100644 => 100755 vllm/model_executor/layers/rotary_embedding.py diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py old mode 100644 new mode 100755 index 1df7b1a80b1b2..e399de249c9c3 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -183,7 +183,7 @@ def _run_engine(self, use_tqdm: bool, profiling: bool = False) -> List[RequestOu pbar = tqdm(total=num_requests, desc="Processed prompts") if profiling: prof = torch.profiler.profile( - schedule = torch.profiler.schedule(wait=0, warmup=0, active=4, repeat=1), + schedule = torch.profiler.schedule(wait=6, warmup=0, active=2, repeat=1), activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU], with_stack = True, record_shapes = False, @@ -196,9 +196,11 @@ def _run_engine(self, use_tqdm: bool, profiling: bool = False) -> List[RequestOu outputs: List[RequestOutput] = [] while self.llm_engine.has_unfinished_requests(): step_outputs = self.llm_engine.step() + print("vLLM completed a step") if profiling: count += 1 - if count == 4: + print(f"Processing step {count}") + if count == 8: break for output in step_outputs: if output.finished: @@ -207,9 +209,9 @@ def _run_engine(self, use_tqdm: bool, profiling: bool = False) -> List[RequestOu pbar.update(1) if profiling: htorch.core.mark_step() - htorch.hpu.synchronize() prof.step() if profiling: + htorch.hpu.synchronize() prof.stop() if use_tqdm: pbar.close() diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py old mode 100644 new mode 100755 index 6482875d1c55b..109569944eecb --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -10,7 +10,13 @@ from vllm._C import ops from vllm._C import cache_ops from vllm.model_executor.input_metadata import InputMetadata +<<<<<<< HEAD from vllm.utils import is_hip +======= +from vllm.model_executor.layers.rotary_embedding import ( + DynamicNTKScalingRotaryEmbedding, LinearScalingRotaryEmbedding, + LlamaRotaryEmbedding, YaRNScalingRotaryEmbedding) +>>>>>>> 0077e65 (Temp commit) _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. 
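A minimal standalone sketch of the profiling pattern added to _run_engine a few hunks above (skip the first engine steps with wait=6, trace the next two, and call mark_step()/prof.step() once per iteration). This is only an illustration, assuming a Gaudi build of PyTorch with habana_frameworks installed; run_one_step is a hypothetical stand-in for llm_engine.step():

    import torch
    import habana_frameworks.torch as htorch

    def profile_engine_steps(run_one_step, total_steps=8, wait_steps=6):
        # Trace only the last (total_steps - wait_steps) iterations, as in the patch.
        prof = torch.profiler.profile(
            schedule=torch.profiler.schedule(wait=wait_steps, warmup=0,
                                             active=total_steps - wait_steps, repeat=1),
            activities=[torch.profiler.ProfilerActivity.CPU,
                        torch.profiler.ProfilerActivity.HPU],
            with_stack=True,
            record_shapes=False,
            on_trace_ready=torch.profiler.tensorboard_trace_handler("./", use_gzip=True),
        )
        prof.start()
        for _ in range(total_steps):
            run_one_step()              # hypothetical stand-in for llm_engine.step()
            htorch.core.mark_step()     # submit the accumulated HPU graph before stepping the profiler
            prof.step()                 # advance the wait/warmup/active schedule
        htorch.hpu.synchronize()        # wait for all traced device work to finish
        prof.stop()
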
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py old mode 100644 new mode 100755 index 91c093e33e3c9..76ee077bb7b82 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -43,6 +43,118 @@ def _rotate_gptj(x: torch.Tensor) -> torch.Tensor: return x.flatten(-2) +def get_device_name(): + """ + Returns the name of the current device: Gaudi or Gaudi2. + + Inspired from: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 + """ + import habana_frameworks.torch.utils.experimental as htexp + + device_type = htexp._get_device_type() + + if device_type == htexp.synDeviceType.synDeviceGaudi: + return "gaudi" + elif device_type == htexp.synDeviceType.synDeviceGaudi2: + return "gaudi2" + else: + raise ValueError(f"Unsupported device: the device type is {device_type}.") + +# TODO: remove this workaround when FusedRoPE properly works on Gaudi +if get_device_name() == "gaudi2": + try: + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE + except ImportError: + print("Not using HPU fused kernel for apply_rotary_pos_emb") + FusedRoPE = None +else: + FusedRoPE = None + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids]#.unsqueeze(unsqueeze_dim) + sin = sin[position_ids]#.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class LlamaRotaryEmbedding(nn.Module): + def __init__(self, head_size, rotary_dim, max_position_embeddings=2048, base=10000, is_neox_style=None, device='cuda'): + super().__init__() + + self.head_size = head_size + self.dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + #import pdb + #pdb.set_trace() + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): + #import pdb + #pdb.set_trace() + seq_len = key.shape[-2] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) + + cos, sin = self.cos_cached[:seq_len].to(dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) + queries = torch.split(query, self.head_size, dim=-1) + keys = torch.split(key, self.head_size, dim=-1) + qs = [] + ks = [] + for i in range(len(keys)): + if query.device.type == "hpu" and FusedRoPE: + q, k = FusedRoPE.apply(queries[i], cos, sin, positions), FusedRoPE.apply(keys[i], cos, sin, positions) + else: + q, k = apply_rotary_pos_emb(queries[i], keys[i], cos, sin, positions) + qs.append(q) + ks.append(k) + return torch.cat(qs, dim=-1), torch.cat(ks, dim=-1) + + class RotaryEmbedding(nn.Module): """Original rotary positional embedding.""" From de7799fce09fe7adb20d009c80f568306096de20 Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Mon, 18 Dec 2023 19:17:59 +0200 Subject: [PATCH 12/43] Integrate fused kernels for RMSNorm and RoPE --- vllm/model_executor/layers/attention.py | 6 ----- vllm/model_executor/layers/layernorm.py | 11 ++++++++ .../model_executor/layers/rotary_embedding.py | 25 +++++++++---------- 3 files changed, 23 insertions(+), 19 deletions(-) mode change 100644 => 100755 vllm/model_executor/layers/layernorm.py diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 109569944eecb..6482875d1c55b 100755 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -10,13 +10,7 @@ from vllm._C import ops from vllm._C import cache_ops from vllm.model_executor.input_metadata import InputMetadata -<<<<<<< HEAD from vllm.utils import is_hip -======= -from vllm.model_executor.layers.rotary_embedding import ( - DynamicNTKScalingRotaryEmbedding, LinearScalingRotaryEmbedding, - LlamaRotaryEmbedding, YaRNScalingRotaryEmbedding) ->>>>>>> 0077e65 (Temp commit) _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 
256] # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py old mode 100644 new mode 100755 index cb3cee2bad5ad..bf63d905b4378 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -6,6 +6,11 @@ from vllm._C import ops +try: + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm +except ImportError: + print("Not using HPU fused kernel for RMSNorm") + FusedRMSNorm = None class RMSNorm(nn.Module): """Root mean square normalization. @@ -56,6 +61,12 @@ def forward( self.variance_epsilon, ) return x, residual + + if x.device.type == "hpu" and FusedRMSNorm: + orig_dtype = x.dtype + x = FusedRMSNorm.apply(x.float(), self.weight.float(), self.variance_epsilon) + return x.to(orig_dtype) + out = torch.empty_like(x) ops.rms_norm( out, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 76ee077bb7b82..40d61e2e91240 100755 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -63,7 +63,7 @@ def get_device_name(): # TODO: remove this workaround when FusedRoPE properly works on Gaudi if get_device_name() == "gaudi2": try: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV1 as FusedRoPE except ImportError: print("Not using HPU fused kernel for apply_rotary_pos_emb") FusedRoPE = None @@ -141,18 +141,17 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tenso self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) cos, sin = self.cos_cached[:seq_len].to(dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) - queries = torch.split(query, self.head_size, dim=-1) - keys = torch.split(key, self.head_size, dim=-1) - qs = [] - ks = [] - for i in range(len(keys)): - if query.device.type == "hpu" and FusedRoPE: - q, k = FusedRoPE.apply(queries[i], cos, sin, positions), FusedRoPE.apply(keys[i], cos, sin, positions) - else: - q, k = apply_rotary_pos_emb(queries[i], keys[i], cos, sin, positions) - qs.append(q) - ks.append(k) - return torch.cat(qs, dim=-1), torch.cat(ks, dim=-1) + query = query.reshape((query.shape[0], query.shape[1], query.shape[2] // self.head_size, self.head_size)) + key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) + if query.device.type == "hpu" and FusedRoPE: + #print('using FusedRoPE') + cos = cos[positions].unsqueeze(2) + sin = sin[positions].unsqueeze(2) + query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) + else: + #print('using torch RoPE') + query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) + return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) class RotaryEmbedding(nn.Module): From b8391810cb5baeca5cd1628b33efa2b286d5518b Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Thu, 21 Dec 2023 13:47:25 +0200 Subject: [PATCH 13/43] Resolve merge conflicts --- benchmarks/run_benchmark_bloom560m.sh | 29 +- setup.py | 430 +++++++++--------- tests/async_engine/test_api_server.py | 2 +- tests/conftest.py | 6 +- tests/kernels/test_attention.py | 5 +- tests/samplers/test_beam_search.py | 2 +- 
tests/samplers/test_logprobs.py | 6 +- vllm/entrypoints/llm.py | 0 vllm/model_executor/layers/attention.py | 14 +- vllm/model_executor/layers/layernorm.py | 6 + .../model_executor/layers/rotary_embedding.py | 4 +- vllm/model_executor/models/llama.py | 1 + vllm/model_executor/sampling_metadata.py | 2 +- vllm/worker/worker.py | 1 + 14 files changed, 274 insertions(+), 234 deletions(-) mode change 100755 => 100644 vllm/entrypoints/llm.py mode change 100755 => 100644 vllm/model_executor/layers/attention.py mode change 100755 => 100644 vllm/model_executor/layers/layernorm.py mode change 100755 => 100644 vllm/model_executor/layers/rotary_embedding.py diff --git a/benchmarks/run_benchmark_bloom560m.sh b/benchmarks/run_benchmark_bloom560m.sh index 404860a95372d..13726bc3f46c0 100755 --- a/benchmarks/run_benchmark_bloom560m.sh +++ b/benchmarks/run_benchmark_bloom560m.sh @@ -1 +1,28 @@ -python benchmark_throughput.py --tokenizer bigscience/bloom-560m --model bigscience/bloom-560m --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100 +cd /software/users/mdvoretckii/huda +source reset.sh +cd /software/users/mdvoretckii/habana_vllm +python -m pip install -e . +python -m pip install xformers --no-deps +cd benchmarks +#python benchmark_throughput.py --tokenizer bigscience/bloom-560m --model bigscience/bloom-560m --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100 +python benchmark_throughput.py --tokenizer lmsys/vicuna-7b-v1.3 --model lmsys/vicuna-7b-v1.3 --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100 +#curl -X POST -H "Accept: Application/json" -H "Content-Type: application/json" http://localhost:8000/generate -d '{"prompt":"Would you like a jelly baby?","use_beam_search":false,"n":1}' + + +# Missing ops: +# Bloom: alibi +# llama: RMS Norm, RoPE, fused silu, fail in sample +# --- +# GPT2: gelu_new +# Aquila: issues with external source +# Baichuan: no tokenizer +# Falcon: fail in sample +# Falcon RW: TypeError: memory_efficient_attention_forward() missing 1 required positional argument: 'cu_seq_lens' +# GPT BigCode: gated, santacoder fails in sample (not affected by CPU RoPE) +# GPT-J: gelu_new +# GPT-NeoX: gelu_fast +# InternLM: no tokenizer class +# Mistral: max_num_batched_tokens (2048) is smaller than max_model_len (32768). +# MPT: TypeError: memory_efficient_attention_forward() missing 1 required positional argument: 'cu_seq_lens' +# OPT: fail in sample +# Qwen: no tokenizer class diff --git a/setup.py b/setup.py index da56a61fc0278..57d02e9229022 100644 --- a/setup.py +++ b/setup.py @@ -33,210 +33,210 @@ def _is_cuda() -> bool: # # TODO(woosuk): Should we use -O3? # NVCC_FLAGS = ["-O2", "-std=c++17"] -if _is_hip(): - if ROCM_HOME is None: - raise RuntimeError( - "Cannot find ROCM_HOME. ROCm must be available to build the package." - ) - NVCC_FLAGS += ["-DUSE_ROCM"] - -if _is_cuda() and CUDA_HOME is None: - raise RuntimeError( - "Cannot find CUDA_HOME. 
CUDA must be available to build the package.") - -ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 -CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] - - -def get_amdgpu_offload_arch(): - command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" - try: - output = subprocess.check_output([command]) - return output.decode('utf-8').strip() - except subprocess.CalledProcessError as e: - error_message = f"Error: {e}" - raise RuntimeError(error_message) from e - except FileNotFoundError as e: - # If the command is not found, print an error message - error_message = f"The command {command} was not found." - raise RuntimeError(error_message) from e - - return None - - -def get_hipcc_rocm_version(): - # Run the hipcc --version command - result = subprocess.run(['hipcc', '--version'], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True) - - # Check if the command was executed successfully - if result.returncode != 0: - print("Error running 'hipcc --version'") - return None - - # Extract the version using a regular expression - match = re.search(r'HIP version: (\S+)', result.stdout) - if match: - # Return the version string - return match.group(1) - else: - print("Could not find HIP version in the output") - return None - - -def get_nvcc_cuda_version(cuda_dir: str) -> Version: - """Get the CUDA version from nvcc. - - Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py - """ - nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], - universal_newlines=True) - output = nvcc_output.split() - release_idx = output.index("release") + 1 - nvcc_cuda_version = parse(output[release_idx].split(",")[0]) - return nvcc_cuda_version - - -def get_torch_arch_list() -> Set[str]: - # TORCH_CUDA_ARCH_LIST can have one or more architectures, - # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the - # compiler to additionally include PTX code that can be runtime-compiled - # and executed on the 8.6 or newer architectures. While the PTX code will - # not give the best performance on the newer architectures, it provides - # forward compatibility. - env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) - if env_arch_list is None: - return set() - - # List are separated by ; or space. - torch_arch_list = set(env_arch_list.replace(" ", ";").split(";")) - if not torch_arch_list: - return set() - - # Filter out the invalid architectures and print a warning. - valid_archs = NVIDIA_SUPPORTED_ARCHS.union( - {s + "+PTX" - for s in NVIDIA_SUPPORTED_ARCHS}) - arch_list = torch_arch_list.intersection(valid_archs) - # If none of the specified architectures are valid, raise an error. - if not arch_list: - raise RuntimeError( - "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " - f"variable ({env_arch_list}) is supported. " - f"Supported CUDA/ROCM architectures are: {valid_archs}.") - invalid_arch_list = torch_arch_list - valid_archs - if invalid_arch_list: - warnings.warn( - f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " - "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " - f"({env_arch_list}). Supported CUDA/ROCM architectures are: " - f"{valid_archs}.", - stacklevel=2) - return arch_list - - -# First, check the TORCH_CUDA_ARCH_LIST environment variable. -compute_capabilities = get_torch_arch_list() -if _is_cuda() and not compute_capabilities: - # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available - # GPUs on the current machine. 
- device_count = torch.cuda.device_count() - for i in range(device_count): - major, minor = torch.cuda.get_device_capability(i) - if major < 7: - raise RuntimeError( - "GPUs with compute capability below 7.0 are not supported.") - compute_capabilities.add(f"{major}.{minor}") - -if _is_cuda(): - nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) - if not compute_capabilities: - # If no GPU is specified nor available, add all supported architectures - # based on the NVCC CUDA version. - compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() - if nvcc_cuda_version < Version("11.1"): - compute_capabilities.remove("8.6") - if nvcc_cuda_version < Version("11.8"): - compute_capabilities.remove("8.9") - compute_capabilities.remove("9.0") - # Validate the NVCC CUDA version. - if nvcc_cuda_version < Version("11.0"): - raise RuntimeError( - "CUDA 11.0 or higher is required to build the package.") - if (nvcc_cuda_version < Version("11.1") - and any(cc.startswith("8.6") for cc in compute_capabilities)): - raise RuntimeError( - "CUDA 11.1 or higher is required for compute capability 8.6.") - if nvcc_cuda_version < Version("11.8"): - if any(cc.startswith("8.9") for cc in compute_capabilities): - # CUDA 11.8 is required to generate the code targeting compute capability 8.9. - # However, GPUs with compute capability 8.9 can also run the code generated by - # the previous versions of CUDA 11 and targeting compute capability 8.0. - # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0 - # instead of 8.9. - warnings.warn( - "CUDA 11.8 or higher is required for compute capability 8.9. " - "Targeting compute capability 8.0 instead.", - stacklevel=2) - compute_capabilities = set(cc for cc in compute_capabilities - if not cc.startswith("8.9")) - compute_capabilities.add("8.0+PTX") - if any(cc.startswith("9.0") for cc in compute_capabilities): - raise RuntimeError( - "CUDA 11.8 or higher is required for compute capability 9.0.") - - # Add target compute capabilities to NVCC flags. - for capability in compute_capabilities: - num = capability[0] + capability[2] - NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] - if capability.endswith("+PTX"): - NVCC_FLAGS += [ - "-gencode", f"arch=compute_{num},code=compute_{num}" - ] - - # Use NVCC threads to parallelize the build. - if nvcc_cuda_version >= Version("11.2"): - nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) - num_threads = min(os.cpu_count(), nvcc_threads) - NVCC_FLAGS += ["--threads", str(num_threads)] - -elif _is_hip(): - amd_arch = get_amdgpu_offload_arch() - if amd_arch not in ROCM_SUPPORTED_ARCHS: - raise RuntimeError( - f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" - f"amdgpu_arch_found: {amd_arch}") - -# ext_modules = [] - -if _is_cuda() or _is_hip(): - vllm_extension_sources = [ - "csrc/cache_kernels.cu", - "csrc/attention/attention_kernels.cu", - "csrc/pos_encoding_kernels.cu", - "csrc/activation_kernels.cu", - "csrc/layernorm_kernels.cu", - "csrc/quantization/squeezellm/quant_cuda_kernel.cu", - "csrc/cuda_utils_kernels.cu", - "csrc/pybind.cpp", - ] - - if _is_cuda(): - vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") - vllm_extension_sources.append("csrc/quantization/gptq/q_gemm.cu") - - vllm_extension = CUDAExtension( - name="vllm._C", - sources=vllm_extension_sources, - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS, - }, - ) - ext_modules.append(vllm_extension) +# if _is_hip(): +# if ROCM_HOME is None: +# raise RuntimeError( +# "Cannot find ROCM_HOME. 
ROCm must be available to build the package." +# ) +# NVCC_FLAGS += ["-DUSE_ROCM"] + +# if _is_cuda() and CUDA_HOME is None: +# raise RuntimeError( +# "Cannot find CUDA_HOME. CUDA must be available to build the package.") + +# ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 +# CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] +# NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] + + +# def get_amdgpu_offload_arch(): +# command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" +# try: +# output = subprocess.check_output([command]) +# return output.decode('utf-8').strip() +# except subprocess.CalledProcessError as e: +# error_message = f"Error: {e}" +# raise RuntimeError(error_message) from e +# except FileNotFoundError as e: +# # If the command is not found, print an error message +# error_message = f"The command {command} was not found." +# raise RuntimeError(error_message) from e + +# return None + + +# def get_hipcc_rocm_version(): +# # Run the hipcc --version command +# result = subprocess.run(['hipcc', '--version'], +# stdout=subprocess.PIPE, +# stderr=subprocess.STDOUT, +# text=True) + +# # Check if the command was executed successfully +# if result.returncode != 0: +# print("Error running 'hipcc --version'") +# return None + +# # Extract the version using a regular expression +# match = re.search(r'HIP version: (\S+)', result.stdout) +# if match: +# # Return the version string +# return match.group(1) +# else: +# print("Could not find HIP version in the output") +# return None + + +# def get_nvcc_cuda_version(cuda_dir: str) -> Version: +# """Get the CUDA version from nvcc. + +# Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py +# """ +# nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], +# universal_newlines=True) +# output = nvcc_output.split() +# release_idx = output.index("release") + 1 +# nvcc_cuda_version = parse(output[release_idx].split(",")[0]) +# return nvcc_cuda_version + + +# def get_torch_arch_list() -> Set[str]: +# # TORCH_CUDA_ARCH_LIST can have one or more architectures, +# # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the +# # compiler to additionally include PTX code that can be runtime-compiled +# # and executed on the 8.6 or newer architectures. While the PTX code will +# # not give the best performance on the newer architectures, it provides +# # forward compatibility. +# env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) +# if env_arch_list is None: +# return set() + +# # List are separated by ; or space. +# torch_arch_list = set(env_arch_list.replace(" ", ";").split(";")) +# if not torch_arch_list: +# return set() + +# # Filter out the invalid architectures and print a warning. +# valid_archs = NVIDIA_SUPPORTED_ARCHS.union( +# {s + "+PTX" +# for s in NVIDIA_SUPPORTED_ARCHS}) +# arch_list = torch_arch_list.intersection(valid_archs) +# # If none of the specified architectures are valid, raise an error. +# if not arch_list: +# raise RuntimeError( +# "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " +# f"variable ({env_arch_list}) is supported. " +# f"Supported CUDA/ROCM architectures are: {valid_archs}.") +# invalid_arch_list = torch_arch_list - valid_archs +# if invalid_arch_list: +# warnings.warn( +# f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " +# "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " +# f"({env_arch_list}). 
Supported CUDA/ROCM architectures are: " +# f"{valid_archs}.", +# stacklevel=2) +# return arch_list + + +# # First, check the TORCH_CUDA_ARCH_LIST environment variable. +# compute_capabilities = get_torch_arch_list() +# if _is_cuda() and not compute_capabilities: +# # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available +# # GPUs on the current machine. +# device_count = torch.cuda.device_count() +# for i in range(device_count): +# major, minor = torch.cuda.get_device_capability(i) +# if major < 7: +# raise RuntimeError( +# "GPUs with compute capability below 7.0 are not supported.") +# compute_capabilities.add(f"{major}.{minor}") + +# if _is_cuda(): +# nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) +# if not compute_capabilities: +# # If no GPU is specified nor available, add all supported architectures +# # based on the NVCC CUDA version. +# compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() +# if nvcc_cuda_version < Version("11.1"): +# compute_capabilities.remove("8.6") +# if nvcc_cuda_version < Version("11.8"): +# compute_capabilities.remove("8.9") +# compute_capabilities.remove("9.0") +# # Validate the NVCC CUDA version. +# if nvcc_cuda_version < Version("11.0"): +# raise RuntimeError( +# "CUDA 11.0 or higher is required to build the package.") +# if (nvcc_cuda_version < Version("11.1") +# and any(cc.startswith("8.6") for cc in compute_capabilities)): +# raise RuntimeError( +# "CUDA 11.1 or higher is required for compute capability 8.6.") +# if nvcc_cuda_version < Version("11.8"): +# if any(cc.startswith("8.9") for cc in compute_capabilities): +# # CUDA 11.8 is required to generate the code targeting compute capability 8.9. +# # However, GPUs with compute capability 8.9 can also run the code generated by +# # the previous versions of CUDA 11 and targeting compute capability 8.0. +# # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0 +# # instead of 8.9. +# warnings.warn( +# "CUDA 11.8 or higher is required for compute capability 8.9. " +# "Targeting compute capability 8.0 instead.", +# stacklevel=2) +# compute_capabilities = set(cc for cc in compute_capabilities +# if not cc.startswith("8.9")) +# compute_capabilities.add("8.0+PTX") +# if any(cc.startswith("9.0") for cc in compute_capabilities): +# raise RuntimeError( +# "CUDA 11.8 or higher is required for compute capability 9.0.") + +# # Add target compute capabilities to NVCC flags. +# for capability in compute_capabilities: +# num = capability[0] + capability[2] +# NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] +# if capability.endswith("+PTX"): +# NVCC_FLAGS += [ +# "-gencode", f"arch=compute_{num},code=compute_{num}" +# ] + +# # Use NVCC threads to parallelize the build. 
+# if nvcc_cuda_version >= Version("11.2"): +# nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) +# num_threads = min(os.cpu_count(), nvcc_threads) +# NVCC_FLAGS += ["--threads", str(num_threads)] + +# elif _is_hip(): +# amd_arch = get_amdgpu_offload_arch() +# if amd_arch not in ROCM_SUPPORTED_ARCHS: +# raise RuntimeError( +# f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" +# f"amdgpu_arch_found: {amd_arch}") + +# # ext_modules = [] + +# if _is_cuda() or _is_hip(): +# vllm_extension_sources = [ +# "csrc/cache_kernels.cu", +# "csrc/attention/attention_kernels.cu", +# "csrc/pos_encoding_kernels.cu", +# "csrc/activation_kernels.cu", +# "csrc/layernorm_kernels.cu", +# "csrc/quantization/squeezellm/quant_cuda_kernel.cu", +# "csrc/cuda_utils_kernels.cu", +# "csrc/pybind.cpp", +# ] + +# if _is_cuda(): +# vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") +# vllm_extension_sources.append("csrc/quantization/gptq/q_gemm.cu") + +# vllm_extension = CUDAExtension( +# name="vllm._C", +# sources=vllm_extension_sources, +# extra_compile_args={ +# "cxx": CXX_FLAGS, +# "nvcc": NVCC_FLAGS, +# }, +# ) +# ext_modules.append(vllm_extension) def get_path(*filepath) -> str: @@ -259,17 +259,17 @@ def find_version(filepath: str) -> str: def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) - if _is_hip(): - # Get the HIP version - hipcc_version = get_hipcc_rocm_version() - if hipcc_version != MAIN_CUDA_VERSION: - rocm_version_str = hipcc_version.replace(".", "")[:3] - version += f"+rocm{rocm_version_str}" - else: - cuda_version = str(nvcc_cuda_version) - if cuda_version != MAIN_CUDA_VERSION: - cuda_version_str = cuda_version.replace(".", "")[:3] - version += f"+cu{cuda_version_str}" + # if _is_hip(): + # # Get the HIP version + # hipcc_version = get_hipcc_rocm_version() + # if hipcc_version != MAIN_CUDA_VERSION: + # rocm_version_str = hipcc_version.replace(".", "")[:3] + # version += f"+rocm{rocm_version_str}" + # else: + # cuda_version = str(nvcc_cuda_version) + # if cuda_version != MAIN_CUDA_VERSION: + # cuda_version_str = cuda_version.replace(".", "")[:3] + # version += f"+cu{cuda_version_str}" return version diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index d90ba37b27bb9..2eb1b2606b80e 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -26,7 +26,7 @@ def api_server(): "api_server_async_engine.py").absolute() uvicorn_process = subprocess.Popen([ sys.executable, "-u", - str(script_path), "--model", "facebook/opt-125m" + str(script_path), "--model", "lmsys/vicuna-7b-v1.3" ]) yield uvicorn_process.terminate() diff --git a/tests/conftest.py b/tests/conftest.py index 16c04e01d703c..7b73aaff6f6c9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -57,7 +57,7 @@ def __init__( model_name, torch_dtype=torch_dtype, trust_remote_code=True, - ).cuda() + )#.cuda() if tokenizer_name is None: tokenizer_name = model_name self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True) @@ -71,7 +71,7 @@ def generate( for prompt in prompts: input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids output_ids = self.model.generate( - input_ids.cuda(), + input_ids,#.cuda(), use_cache=True, **kwargs, ) @@ -127,7 +127,7 @@ def generate_greedy_logprobs( for prompt in prompts: input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids output = self.model.generate( - input_ids.cuda(), + input_ids,#.cuda(), use_cache=True, do_sample=False, 
max_new_tokens=max_tokens, diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 614b65f82ccbd..c6755b4d8f34e 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -97,7 +97,7 @@ def ref_single_query_cached_kv_attention( output[i].copy_(out, non_blocking=True) -@pytest.mark.parametrize("version", ["v1", "v2"]) +@pytest.mark.parametrize("version", ["v1"])#, "v2"]) @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -162,8 +162,7 @@ def test_paged_attention( # Call the paged attention kernel. output = torch.empty_like(query) if version == "v1": - ops.paged_attention_v1( - output, + output = ops.paged_attention_v1( query, key_cache, value_cache, diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index a491ffa763505..4cf777e2b9e6f 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -10,7 +10,7 @@ # 3. Use the model "huggyllama/llama-7b". MAX_TOKENS = [128] BEAM_WIDTHS = [4] -MODELS = ["facebook/opt-125m"] +MODELS = ["lmsys/vicuna-7b-v1.3"] @pytest.mark.parametrize("model", MODELS) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 1c67cc5bd7394..24b1572d9a325 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -3,7 +3,7 @@ from vllm import SamplingParams -MODELS = ["facebook/opt-125m"] +MODELS = ["lmsys/vicuna-7b-v1.3"] @pytest.mark.parametrize("model", MODELS) @@ -16,7 +16,7 @@ def test_get_prompt_logprobs( example_prompts, ): max_tokens = 5 - hf_model = hf_runner(model, dtype=dtype) + hf_model = hf_runner(model, dtype="float") hf_logprobs = hf_model.generate_greedy_logprobs( example_prompts, max_tokens=max_tokens, @@ -24,6 +24,8 @@ def test_get_prompt_logprobs( del hf_model vllm_model = vllm_runner(model, dtype=dtype) + import pdb + pdb.set_trace() vllm_sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=5, prompt_logprobs=5, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py old mode 100755 new mode 100644 diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py old mode 100755 new mode 100644 index 6482875d1c55b..1da061a6a52c3 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -143,17 +143,22 @@ def forward( key = key.unflatten(0, (batch_size, seq_len)) value = value.unflatten(0, (batch_size, seq_len)) + cu_seq_lens = [0] + for i in range(len(input_metadata.prompt_lens)): + cu_seq_lens.append(cu_seq_lens[-1] + input_metadata.prompt_lens[i]) + input_metadata.cu_seq_lens = cu_seq_lens out = xops.memory_efficient_attention_forward( query, key, value, + cu_seq_lens=cu_seq_lens, attn_bias=input_metadata.attn_bias, p=0.0, scale=self.scale, - op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if - (is_hip()) else None, ) - output = out.view_as(query) + output = torch.zeros_like(query) + output[:, :out.shape[1], :, :] = out + output = output.view_as(query) else: # Decoding run. if key_cache is not None and value_cache is not None: @@ -236,8 +241,7 @@ def _paged_attention( max_num_partitions == 1 or num_seqs * num_heads > 512) if use_v1: # Run PagedAttention V1. 
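# A minimal sketch, not taken from the diff, of the calling-convention change
# around paged_attention_v1 in this patch set: the CUDA kernel fills a
# caller-allocated `output` buffer in place, while the Python-level HPU op in
# vllm/hpu/ops.py builds and returns a fresh tensor shaped like `query`, so
# call sites have to capture the return value. The wrapper name below is
# hypothetical.
from vllm.utils import is_hpu

if is_hpu():
    from vllm.hpu import ops
else:
    from vllm._C import ops

def run_paged_attention_v1(output, query, *args):
    if is_hpu():
        # HPU path: the op allocates and returns the attention output.
        return ops.paged_attention_v1(query, *args)
    # CUDA path: the kernel writes into `output` and returns None.
    ops.paged_attention_v1(output, query, *args)
    return output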
- ops.paged_attention_v1( - output, + output = ops.paged_attention_v1( query, key_cache, value_cache, diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py old mode 100755 new mode 100644 index bf63d905b4378..ccfd55806d5c6 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -54,6 +54,12 @@ def forward( residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if residual is not None: + if x.device.type == "hpu" and FusedRMSNorm: + orig_dtype = x.dtype + residual += x + x = FusedRMSNorm.apply(residual.float(), self.weight.float(), self.variance_epsilon) + return x.to(orig_dtype), residual + ops.fused_add_rms_norm( x, residual, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py old mode 100755 new mode 100644 index 40d61e2e91240..ecd78bd008d25 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -454,8 +454,8 @@ def get_rope( return _ROPE_DICT[key] if rope_scaling is None: - rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style) + rotary_emb = LlamaRotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style) else: scaling_type = rope_scaling["type"] scaling_factor = rope_scaling["factor"] diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index b3b24ea6fea44..7722cc140326d 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -283,6 +283,7 @@ def forward( kv_caches: List[KVCache], input_metadata: InputMetadata, ) -> torch.Tensor: + print(f'Input shape: {input_ids.shape}') hidden_states = self.model(input_ids, positions, kv_caches, input_metadata) return hidden_states diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 49013ec273787..388e55ba92e67 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -146,7 +146,7 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], dtype: torch.dtype) -> "SamplingTensors": # Note that the performance will be very bad without # pinned memory. 
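# A short illustrative note, not part of the diff: pinned (page-locked) host
# memory only speeds up asynchronous host-to-device copies for CUDA devices,
# and combining it with the Habana bridge produced the segfaults addressed by
# the "Fix post-merge pinned memory segfaults" commit, which is why pin_memory
# is gated on the target device here and in model_runner/cache_engine. A
# device-aware helper along these lines (the name is hypothetical) captures
# the recurring pattern:
import torch
from vllm.utils import in_wsl

def want_pinned_memory(device: torch.device) -> bool:
    # Pin host buffers only for CUDA targets outside WSL; skip HPU and CPU.
    return device.type == "cuda" and not in_wsl()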
- pin_memory = not in_wsl() + pin_memory = not in_wsl() and not device.type == "hpu" prompt_max_len = max(len(tokens) for tokens in prompt_tokens) prompt_padded_tokens = [ tokens + [vocab_size] * (prompt_max_len - len(tokens)) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d901035b1b2e8..22c53005aa7f5 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -8,6 +8,7 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.model_executor import set_random_seed +from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.parallel_utils.parallel_state import ( initialize_model_parallel) from vllm.sequence import SamplerOutput, SequenceGroupMetadata From 00df4867965c9921c1ab130e1ed807064bab161e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 21 Dec 2023 17:09:16 +0000 Subject: [PATCH 14/43] Minor Gaudi workarounds, add debugging to stock vLLM API server --- vllm/entrypoints/api_server.py | 14 +++++++++++++- vllm/entrypoints/openai/api_server.py | 4 +++- vllm/worker/model_runner.py | 2 +- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 6910b3265dfd2..b120210831fe5 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -1,7 +1,9 @@ import argparse import json from typing import AsyncGenerator - +import torch +import habana_frameworks.torch.core as htcore +import habana_frameworks.torch.gpu_migration from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse import uvicorn @@ -66,6 +68,16 @@ async def stream_results() -> AsyncGenerator[bytes, None]: prompt = final_output.prompt text_outputs = [prompt + output.text for output in final_output.outputs] ret = {"text": text_outputs} + DEBUG = True + if DEBUG: + text_tokens = [output.token_ids for output in final_output.outputs] + from vllm.transformers_utils.tokenizer import get_tokenizer + tokenizer = get_tokenizer('lmsys/vicuna-7b-v1.3') + decoded_tokens = [tokenizer.decode(token_ids) for token_ids in text_tokens] + ret["DEBUG"] = { + 'tokens': text_tokens, + 'decoded_tokens': decoded_tokens, + } return JSONResponse(ret) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index be5f4190e633f..bb5b921123460 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -8,7 +8,9 @@ import time from http import HTTPStatus from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union - +import torch +import habana_frameworks.torch.core as htcore +import habana_frameworks.torch.gpu_migration from aioprometheus import MetricsMiddleware from aioprometheus.asgi.starlette import metrics import fastapi diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 276ef0708847a..44f447420295a 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -402,7 +402,7 @@ def capture_model(self, kv_caches: List[KVCache]) -> None: input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda() input_positions = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda() - slot_mapping = torch.empty(max_batch_size, 1, dtype=torch.long).cuda() + slot_mapping = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda() # FIXME (kzawora): revert this to torch.empty after bridge bug is fixed slot_mapping.fill_(_PAD_SLOT_ID) context_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda() 
block_tables = torch.from_numpy(self.graph_block_tables).cuda() From 16b55577160e38b64bc79b1a65369abe2a3460a8 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 21 Dec 2023 17:34:09 +0000 Subject: [PATCH 15/43] Fix post-merge pinned memory segfaults --- vllm/utils.py | 4 ++++ vllm/worker/model_runner.py | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index eff5d10fd4ee0..b1af1c740e9b9 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -59,3 +59,7 @@ def get_open_port(): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", 0)) return s.getsockname()[1] + + +def is_hpu() -> bool: + return getattr(torch, 'hpu', None) is not None and torch.hpu.is_available() \ No newline at end of file diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 489cd16dabcfb..89dd1f054a258 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -10,7 +10,7 @@ from vllm.model_executor import get_model, InputMetadata, SamplingMetadata from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata -from vllm.utils import in_wsl +from vllm.utils import in_wsl, is_hpu logger = init_logger(__name__) @@ -307,9 +307,9 @@ def _prepare_sample( selected_token_indices = _async_h2d(selected_token_indices, dtype=torch.long, - pin_memory=not self.in_wsl) + pin_memory=not is_hpu() and not self.in_wsl) categorized_sample_indices = { - t: _async_h2d(seq_ids, dtype=torch.int, pin_memory=not self.in_wsl) + t: _async_h2d(seq_ids, dtype=torch.int, pin_memory=not is_hpu() and not self.in_wsl) for t, seq_ids in categorized_sample_indices.items() } From 2b6ec4e3438b131cbc211e743aa427a655cb9bb7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 21 Dec 2023 17:43:01 +0000 Subject: [PATCH 16/43] Re-enable sequence decode --- vllm/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6ad70936dfd32..481ba1a17c808 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -425,7 +425,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, child_seqs.append((parent, parent)) for seq, _ in child_seqs: - # self._decode_sequence(seq, seq_group.sampling_params) + self._decode_sequence(seq, seq_group.sampling_params) self._check_stop(seq, seq_group.sampling_params) # Non-beam search case From 9d4bd9f7d55c88cd3b887d9be1ecd69e97bbb716 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 22 Dec 2023 14:58:12 +0200 Subject: [PATCH 17/43] Maintain GPU compatibility in cache_engine --- vllm/worker/cache_engine.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index d7b4df272523d..0eaa3fa28f246 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,7 +6,7 @@ from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import in_wsl +from vllm.utils import in_wsl, is_hpu logger = init_logger(__name__) @@ -41,8 +41,10 @@ def __init__( self.num_cpu_blocks = cache_config.num_cpu_blocks # Initialize the cache. 
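# An illustrative aside, not part of the diff: the is_hpu() helper added to
# vllm/utils.py above checks getattr(torch, 'hpu', None) before calling
# torch.hpu.is_available(), because the torch.hpu namespace only exists once
# habana_frameworks has been imported (as the API-server changes above do via
# habana_frameworks.torch.gpu_migration); on CUDA-only installs it therefore
# returns False instead of raising AttributeError. Typical usage, with a
# hypothetical helper name:
from vllm.utils import is_hpu

def default_device_type() -> str:
    # Prefer Gaudi when it is actually visible, otherwise keep stock CUDA
    # behaviour so the same build still serves GPUs.
    return "hpu" if is_hpu() else "cuda"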
- # self.gpu_cache = self.allocate_gpu_cache() - self.gpu_cache = self.allocate_hpu_cache() + if is_hpu(): + self.gpu_cache = self.allocate_hpu_cache() + else: + self.gpu_cache = self.allocate_gpu_cache() self.cpu_cache = self.allocate_cpu_cache() # Initialize the stream for caching operations. @@ -113,7 +115,7 @@ def allocate_cpu_cache(self) -> List[KVCache]: key_block_shape = self.get_key_block_shape() value_block_shape = self.get_value_block_shape() # pin_memory = not in_wsl() - pin_memory = not in_wsl() and not torch.hpu.is_available() + pin_memory = not in_wsl() and not is_hpu() if not pin_memory: # Pinning memory in WSL is not supported. # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications From 7a0337ab69527a375220dbaa5a7c1699717b690e Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Wed, 10 Jan 2024 15:48:29 +0200 Subject: [PATCH 18/43] Adjust HPU RoPE for non-query runs --- vllm/model_executor/layers/rotary_embedding.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index ecd78bd008d25..94aa842f71ff2 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -145,8 +145,12 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tenso key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) if query.device.type == "hpu" and FusedRoPE: #print('using FusedRoPE') - cos = cos[positions].unsqueeze(2) - sin = sin[positions].unsqueeze(2) + if len(positions[0]) == 1: + cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) + sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) + else: + cos = cos[positions].unsqueeze(2) + sin = sin[positions].unsqueeze(2) query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) else: #print('using torch RoPE') From 6351d4115727d79f7ee5e5bba3a5dd94045cb3fa Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Tue, 23 Jan 2024 15:13:15 +0200 Subject: [PATCH 19/43] Integrate HPU primitive implementations --- .../kernels/benchmark_paged_attention.py | 6 +- tests/kernels/conftest.py | 6 +- tests/kernels/test_attention.py | 17 +- tests/kernels/test_cache.py | 6 +- vllm/hpu/__init__.py | 11 + vllm/hpu/attn_bias.py | 764 ++++++++++++++++++ vllm/hpu/cache_ops.py | 155 ++++ vllm/hpu/cuda_utils.py | 14 + vllm/hpu/ops.py | 172 ++++ vllm/model_executor/layers/activation.py | 6 +- vllm/model_executor/layers/attention.py | 19 +- vllm/model_executor/layers/layernorm.py | 6 +- .../model_executor/layers/quantization/awq.py | 6 +- .../layers/quantization/gptq.py | 6 +- .../layers/quantization/squeezellm.py | 7 +- .../model_executor/layers/rotary_embedding.py | 6 +- vllm/utils.py | 16 +- vllm/worker/cache_engine.py | 6 +- 18 files changed, 1201 insertions(+), 28 deletions(-) create mode 100644 vllm/hpu/__init__.py create mode 100644 vllm/hpu/attn_bias.py create mode 100644 vllm/hpu/cache_ops.py create mode 100644 vllm/hpu/cuda_utils.py create mode 100644 vllm/hpu/ops.py diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 935393e9942ce..f22acca3b7909 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -4,7 +4,11 @@ import torch -from vllm._C import ops +from vllm.utils import is_hpu() +if is_hpu(): + 
from vllm.hpu import ops +else: + from vllm._C import ops NUM_BLOCKS = 1024 PARTITION_SIZE = 512 diff --git a/tests/kernels/conftest.py b/tests/kernels/conftest.py index 97516bd3052cf..80b62e4e0ef7d 100644 --- a/tests/kernels/conftest.py +++ b/tests/kernels/conftest.py @@ -2,6 +2,7 @@ import pytest import torch +from vllm.utils import is_hpu def create_kv_caches( @@ -18,7 +19,10 @@ def create_kv_caches( scale = head_size**-0.5 x = 16 // torch.tensor([], dtype=dtype).element_size() - key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) + if is_hpu(): + key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) + else: + key_cache_shape = (num_blocks, num_heads, head_size, block_size) key_caches = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index c6755b4d8f34e..8ff15aceac542 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -4,10 +4,14 @@ import pytest import torch from xformers import ops as xops -from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask -from vllm._C import ops -from vllm.utils import get_max_shared_memory_bytes +from vllm.utils import get_max_shared_memory_bytes, is_hpu +if is_hpu(): + from vllm.hpu import ops + from vllm.hpu.attn_bias import BlockDiagonalCausalMask +else: + from vllm._C import ops + from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. @@ -71,8 +75,11 @@ def ref_single_query_cached_kv_attention( block_number = int(block_table[j // block_size]) block_offset = j % block_size - k = key_cache[block_number, :, :, block_offset, :] - k = k.reshape(num_kv_heads, head_size) + if is_hpu(): + k = key_cache[block_number, :, :, block_offset] + else: + k = key_cache[block_number, :, :, block_offset, :] + k = k.reshape(num_kv_heads, head_size) keys.append(k) v = value_cache[block_number, :, :, block_offset] diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 9b5d7687a3fec..bdef59b3b86b1 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -3,7 +3,11 @@ import pytest import torch -from vllm._C import cache_ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import cache_ops +else: + from vllm._C import cache_ops DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [83] # Arbitrary values for testing diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py new file mode 100644 index 0000000000000..ce3a3ce5d435c --- /dev/null +++ b/vllm/hpu/__init__.py @@ -0,0 +1,11 @@ +############################################################################### +# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. +# +############################################################################### \ No newline at end of file diff --git a/vllm/hpu/attn_bias.py b/vllm/hpu/attn_bias.py new file mode 100644 index 0000000000000..ac3ce8e6784cc --- /dev/null +++ b/vllm/hpu/attn_bias.py @@ -0,0 +1,764 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 
+# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + + +import math +from dataclasses import dataclass +from typing import Any, Iterable, List, Optional, Sequence, Tuple, Union + +import torch + + +class AttentionBias: + """Base class for a custom bias that can be applied \ + as the attn_bias argument in + :attr:`xformers.ops.memory_efficient_attention`. + + That function has the ability to add a tensor, the + attention bias, to the QK^T matrix before it is used + in the softmax part of the attention calculation. + The attention bias tensor with shape + (B or 1, n_queries, number of keys) + can be given as the attn_bias input. + The most common use case is for an attention bias is + to contain only zeros and negative infinities, which forms + a mask so that some queries only attend to some keys. + + Children of this class define alternative things which can + be used as the attn_bias input to define an attention bias which + forms such a mask, for some common cases. + + When using an :attr:`xformers.ops.AttentionBias` + instead of a :attr:`torch.Tensor`, the mask matrix does + not need to be materialized, and can be + hardcoded into some kernels for better performance. + + See: + + - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMask` + - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias` + - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask` + - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask` + + """ + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + """ + Materializes the bias as a `torch.Tensor`. This is very slow + and we don't attempt to make it fast. Only use for debugging/testing. + + Shape should be like `[*, q_seqlen, k_seqlen]` + """ + raise NotImplementedError() + + +class LowerTriangularMask(AttentionBias): + """ + A lower-triangular (aka causal) mask + + A query Q cannot attend to a key which is farther from the + initial key than Q is from the initial query. + """ + + def __init__(self, *tensor_args, **tensor_kwargs) -> None: + # NOTE: Unused arguments, we keep them for backward compatibility + super().__init__() + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=float("-inf"), + device=device, + ) + return torch.triu(tensor, diagonal=1).to(dtype) # type: ignore + + def add_bias(self, bias: torch.Tensor) -> "LowerTriangularMaskWithTensorBias": + return LowerTriangularMaskWithTensorBias(bias) + + +class LowerTriangularMaskWithTensorBias(LowerTriangularMask): + """A lower-triangular (aka causal) mask with an additive bias""" + + def __init__(self, bias: torch.Tensor) -> None: + self._bias = bias + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + return super().materialize(shape, dtype=dtype, device=device) + self._bias + + +@dataclass +class _SeqLenInfo: + """ + (Internal) Represents the division of a dimension into blocks. + + For example, to represents a dimension of length 7 divided into + three blocks of lengths 2, 3 and 2, use `from_seqlength([2, 3, 2])`. 
+ The members will be: + max_seqlen: 3 + min_seqlen: 2 + seqstart_py: [0, 2, 5, 7] + seqstart: torch.IntTensor([0, 2, 5, 7]) + """ + + seqstart: torch.Tensor + max_seqlen: int + min_seqlen: int + seqstart_py: List[int] + + def to(self, device: torch.device) -> None: + self.seqstart = self.seqstart.to(device, non_blocking=True) + + def intervals(self) -> Iterable[Tuple[int, int]]: + yield from zip(self.seqstart_py, self.seqstart_py[1:]) + + @classmethod + def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo": + """ + Input tensors are assumed to be in shape [B, M, *] + """ + assert not isinstance(seqlens, torch.Tensor) + seqstart_py = [0] + max_seqlen = -1 + min_seqlen = -1 + for seqlen in seqlens: + min_seqlen = min(min_seqlen, seqlen) if min_seqlen != -1 else seqlen + max_seqlen = max(max_seqlen, seqlen) + seqstart_py.append(seqstart_py[len(seqstart_py) - 1] + seqlen) + seqstart = torch.tensor(seqstart_py, dtype=torch.int32) + return cls( + max_seqlen=max_seqlen, + min_seqlen=min_seqlen, + seqstart=seqstart, + seqstart_py=seqstart_py, + ) + + def split( + self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None + ) -> List[torch.Tensor]: + if self.seqstart_py[-1] != x.shape[1] or x.shape[0] != 1: + raise ValueError( + f"Invalid `torch.Tensor` of shape {x.shape}, expected format " + f"(B, M, *) with B=1 and M={self.seqstart_py[-1]}\n" + f" seqstart: {self.seqstart_py}" + ) + if batch_sizes is None: + batch_sizes = [1] * (len(self.seqstart_py) - 1) + split_chunks = [] + it = 0 + for batch_size in batch_sizes: + split_chunks.append( + self.seqstart_py[it + batch_size] - self.seqstart_py[it] + ) + it += batch_size + return [ + tensor.reshape([bs, -1, *tensor.shape[2:]]) + for bs, tensor in zip(batch_sizes, x.split(split_chunks, dim=1)) + ] + + +@dataclass +class _PaddedSeqLenInfo(_SeqLenInfo): + """ + (Internal) Represents the division of a dimension into blocks which are + padded out to the same total length. + + For example, to represent a dimension of length 12 with space for + three blocks of length 4, but where the occupied lengths are + 2, 3 and 2, use `from_seqlens_padded([2, 3, 2], 4)`. 
+ + The layout along the dimension is + + 0 ─► block 0 + block 0 + + + 4 ─► block 1 + block 1 + block 1 + + 8 ─► block 2 + block 2 + + + 12 ─► + + The members will be: + max_seqlen: 3 + min_seqlen: 2 + seqstart_py: [0, 4, 8, 12] + seqstart: torch.IntTensor([0, 4, 8, 12]) + seqlen_py: [2, 3, 2] + seqlen: torch.IntTensor([2, 3, 2]) + padding: 4 + """ + + seqlen: torch.Tensor + seqlen_py: Sequence[int] + padding: int + # From parent: seqstart[i] contains the start position + # of the i-th sequence + # seqstart: torch.Tensor + + def __post_init__(self) -> None: + assert len(self.seqstart_py) == len(self.seqlen_py) + 1 + + def to(self, device: torch.device) -> None: + self.seqlen = self.seqlen.to(device, non_blocking=True) + super().to(device) + + def intervals(self) -> Iterable[Tuple[int, int]]: + for (start, _), length in zip(super().intervals(), self.seqlen_py): + yield start, start + length + + @classmethod + def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo": + raise RuntimeError( + "Use either `_SeqLenInfo.from_seqlens` or `_PaddedSeqLenInfo.from_seqlens_padded`" + ) + + @classmethod + def from_seqlens_padded( + cls, seqlens: Sequence[int], padding: int + ) -> "_PaddedSeqLenInfo": + """ + Input tensors are assumed to be in shape [B, M, *] + seqstart = padding * torch.arange(batch_size) + """ + assert not isinstance(seqlens, torch.Tensor) + assert all(seqlen <= padding for seqlen in seqlens) + seqstart_py = list(range(0, len(seqlens) * padding + 1, padding)) + return cls( + seqlen=torch.tensor(seqlens, dtype=torch.int32), + seqlen_py=seqlens, + max_seqlen=max(seqlens), + min_seqlen=min(seqlens), + seqstart=torch.tensor(seqstart_py, dtype=torch.int32), + seqstart_py=seqstart_py, + padding=padding, + ) + + def split( + self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None + ) -> List[torch.Tensor]: + raise NotImplementedError("_PaddedSeqLenInfo.split") + + +@dataclass +class BlockDiagonalMask(AttentionBias): + """ + A block-diagonal mask that can be passed as ``attn_bias`` + argument to :attr:`xformers.ops.memory_efficient_attention`. + + Queries and Keys are each divided into the same number of blocks. + Queries in block i only attend to keys in block i. + + .. figure:: /_static/block_diag_bias.png + + This bias can be used to handle a batch of sequences of + different lengths, via :attr:`BlockDiagonalMask.from_tensor_list` + + :Example: + + .. 
code-block:: python + + import torch + from xformers.ops import fmha + + K = 16 + dtype = torch.float16 + device = "cuda" + list_x = [ + torch.randn([1, 3, 1, K], dtype=dtype, device=device), + torch.randn([1, 6, 1, K], dtype=dtype, device=device), + torch.randn([1, 2, 1, K], dtype=dtype, device=device), + ] + attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x) + linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype) + + q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2) + out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias) + list_out = attn_bias.split(out) + print(list_out[0].shape) # [1, 3, 1, K] + assert tuple(list_out[0].shape) == (1, 3, 1, K) + + """ + + q_seqinfo: _SeqLenInfo + k_seqinfo: _SeqLenInfo + _batch_sizes: Optional[Sequence[int]] = None + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + return torch.zeros( + shape, + dtype=dtype, + device=device, + ) + + def materialize( + self, + shape: Optional[Tuple[int, ...]] = None, + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + """Materialize the attention bias - for debugging & testing""" + if shape is None: + shape = (self.q_seqinfo.seqstart_py[-1], + self.k_seqinfo.seqstart_py[-1]) + assert shape[-1] == self.k_seqinfo.seqstart_py[-1], ( + shape[-1], + self.k_seqinfo.seqstart_py[-1], + ) + assert shape[-2] == self.q_seqinfo.seqstart_py[-1], ( + shape[-2], + self.q_seqinfo.seqstart_py[-1], + ) + mask = torch.empty(shape[-2:], dtype=dtype, device=device) + mask.fill_(-math.inf) + for i, ((q_start, q_end), (k_start, k_end)) in enumerate( + zip( + self.q_seqinfo.intervals(), + self.k_seqinfo.intervals(), + ) + ): + mask[q_start:q_end, k_start:k_end] = self._create_block_mask( + (q_end - q_start, k_end - k_start), + dtype=dtype, + device=device, + ) + for _ in range(len(shape) - 2): + mask = mask.unsqueeze(0) + return mask.expand(shape) + + @classmethod + def from_seqlens( + cls, + q_seqlen: Sequence[int], + kv_seqlen: Optional[Sequence[int]] = None, + ) -> "BlockDiagonalMask": + """Creates a :attr:`BlockDiagonalMask` from a list of tensors lengths for query and key/value. + + Args: + q_seqlen (Union[Sequence[int], torch.Tensor]): List or tensor of sequence lengths for query tensors + kv_seqlen (Union[Sequence[int], torch.Tensor], optional): List or tensor of sequence lengths for key/value. + (Defaults to ``q_seqlen``.) + Returns: + BlockDiagonalMask + """ + assert kv_seqlen is None or len(q_seqlen) == len(kv_seqlen) + q_seqinfo = _SeqLenInfo.from_seqlens(q_seqlen) + if kv_seqlen is None or q_seqlen == kv_seqlen: + k_seqinfo = q_seqinfo + else: + k_seqinfo = _SeqLenInfo.from_seqlens(kv_seqlen) + return cls(q_seqinfo=q_seqinfo, k_seqinfo=k_seqinfo) + + @classmethod + def from_tensor_list( + cls, + tensors: Sequence[torch.Tensor], + ) -> Tuple["BlockDiagonalMask", torch.Tensor]: + """Creates a :attr:`BlockDiagonalMask` from a list of tensors, and returns the tensors + concatenated on the sequence length dimension + + .. figure:: /_static/block_diag_cat_split.png + + See also :attr:`BlockDiagonalMask.split` to split the returned + :attr:`torch.Tensor` back to a list of tensors of varying sequence length + + Args: + tensors (Sequence[torch.Tensor]): A list of tensors of shape ``[B, M_i, *]``. + All tensors should have the same dimension and the same batch size ``B``, but + they can have different sequence length ``M``. 
+ + Returns: + Tuple[BlockDiagonalMask, torch.Tensor]: The corresponding bias for the attention + along with `tensors` concatenated on the sequence length dimension, with shape ``[1, sum_i{M_i}, *]`` + """ + batch_sizes = [tensor.shape[0] for tensor in tensors] + seqlens = [] + for x in tensors: + for _ in range(x.shape[0]): + seqlens.append(x.shape[1]) + block_diag = cls.from_seqlens(seqlens) + block_diag._batch_sizes = batch_sizes + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in tensors) + concat_tensors = torch.cat(tensors_bs1, dim=1) + return block_diag, concat_tensors + + @classmethod + def from_tensor_lists_qkv( + cls, + tensors_q: Sequence[torch.Tensor], + tensors_k: Sequence[torch.Tensor], + tensors_v: Optional[Sequence[torch.Tensor]] = None, + ) -> Tuple["BlockDiagonalMask", torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + assert len(tensors_q) == len(tensors_k) + assert tensors_v is None or len(tensors_v) == len(tensors_q) + batch_sizes = [tensor.shape[0] for tensor in tensors_q] + q_seqlens, kv_seqlens = [], [] + for i, (q, k) in enumerate(zip(tensors_q, tensors_k)): + assert q.shape[0] == k.shape[0] + q_seqlens += [q.shape[1]] * q.shape[0] + kv_seqlens += [k.shape[1]] * k.shape[0] + assert tensors_v is None or tensors_v[i].shape[:2] == k.shape[:2] + block_diag = cls.from_seqlens(q_seqlens, kv_seqlens) + block_diag._batch_sizes = batch_sizes + return ( + block_diag, + torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_q], dim=1), + torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_k], dim=1), + torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_v], dim=1) + if tensors_v is not None + else None, + ) + + def split_queries(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: + return self.q_seqinfo.split(tensor, self._batch_sizes) + + def split_kv(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: + return self.k_seqinfo.split(tensor, self._batch_sizes) + + def split(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: + """The inverse operation of :attr:`BlockDiagonalCausalMask.from_tensor_list` + + Args: + tensor (torch.Tensor): Tensor of tokens of shape ``[1, sum_i{M_i}, *]`` + + Returns: + Sequence[torch.Tensor]: A list of tokens with possibly different sequence lengths + """ + assert self.q_seqinfo is self.k_seqinfo + return self.q_seqinfo.split(tensor, self._batch_sizes) + + def make_causal(self) -> "BlockDiagonalCausalMask": + """Makes each block causal""" + return BlockDiagonalCausalMask( + q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + ) + + def make_causal_from_bottomright(self) -> "BlockDiagonalCausalFromBottomRightMask": + """Makes each block causal with a possible non-causal prefix""" + return BlockDiagonalCausalFromBottomRightMask( + q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + ) + + def make_local_attention( + self, window_size: int + ) -> "BlockDiagonalCausalLocalAttentionMask": + """Experimental: Makes each block causal with local attention""" + return BlockDiagonalCausalLocalAttentionMask( + q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + _window_size=window_size, + ) + + def make_local_attention_from_bottomright( + self, window_size: int + ) -> "BlockDiagonalCausalLocalAttentionFromBottomRightMask": + """Experimental: Makes each block causal with local attention, start from bottom right""" + return BlockDiagonalCausalLocalAttentionFromBottomRightMask( + 
q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + _window_size=window_size, + ) + + +@dataclass +class BlockDiagonalCausalMask(BlockDiagonalMask): + """ + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal. + + Queries and Keys are each divided into the same number of blocks. + A query Q in block i cannot attend to a key which is not in block i, + nor one which is farther from the initial key in block i than Q + is from the initial query in block i. + """ + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + return LowerTriangularMask().materialize( + shape, + dtype=dtype, + device=device, + ) + + +@dataclass +class BlockDiagonalCausalFromBottomRightMask(BlockDiagonalMask): + """ + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal. + This mask allows for a non-causal prefix + NOTE: Each block should have `num_keys >= num_queries` otherwise the forward pass is not + defined (softmax of vector of `-inf` in the attention) + + Queries and keys are each divided into the same number of blocks. + A query Q in block i cannot attend to a key which is not in block i, + nor one which nearer the final key in block i than Q is to the + final query in block i. + """ + + def __post_init__(self) -> None: + for i, ((q_start, q_end), (k_start, k_end)) in enumerate( + zip( + self.q_seqinfo.intervals(), + self.k_seqinfo.intervals(), + ) + ): + num_queries = q_end - q_start + num_keys = k_end - k_start + if num_keys < num_queries: + raise ValueError( + f"Block #{i} has num_keys={num_keys} and num_queries={num_queries}." + " Expected `num_keys >= num_queries`" + ) + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=float("-inf"), + device=device, + ) + num_queries, num_keys = shape[-2:] + return torch.triu(tensor, diagonal=num_keys - num_queries + 1).to(dtype) # type: ignore + + +@dataclass +class BlockDiagonalCausalWithOffsetPaddedKeysMask(AttentionBias): + """ + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`, + except an offset on causality is allowed for each block and we support padding for k/v + + The keys and values are divided into blocks which are padded out to + the same total length. + For example, if there is space for 12 keys, for three blocks of + max length 4, but we only want to use the first 2, 3 and 2 + of each block, use `kv_padding=4` and `kv_seqlens=[2, 3, 2]`. + The queries are divided into blocks, without padding, of lengths given by + q_seqlen. + + A query Q in block i cannot attend to a key which is not in block i, + nor one which is not in use (i.e. in the padded area), + nor one which is nearer to the final key in block i + than Q is to the final query in block i. + """ + + q_seqinfo: _SeqLenInfo + k_seqinfo: _PaddedSeqLenInfo + causal_diagonal: Any = None # unused. Exists for BC only. 
+ + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=float("-inf"), + device=device, + ) + num_queries, num_keys = shape[-2:] + return torch.triu(tensor, diagonal=1 + num_keys - num_queries).to(dtype) # type: ignore + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + """Materialize the attention bias - for debugging & testing""" + if shape[-1] != self.k_seqinfo.seqstart_py[-1]: + raise ValueError("k shapes wrong") + if shape[-2] != self.q_seqinfo.seqstart_py[-1]: + raise ValueError("q shapes wrong") + mask = torch.empty(shape[-2:], dtype=dtype, device=device) + mask.fill_(-math.inf) + for i, ((q_start, q_end), (k_start, k_end)) in enumerate( + zip( + self.q_seqinfo.intervals(), + self.k_seqinfo.intervals(), + ) + ): + mask[q_start:q_end, k_start:k_end] = self._create_block_mask( + (q_end - q_start, k_end - k_start), + dtype=dtype, + device=device, + ) + for _ in range(len(shape) - 2): + mask = mask.unsqueeze(0) + return mask.expand(shape) + + @classmethod + def from_seqlens( + cls, + q_seqlen: Sequence[int], + kv_padding: int, + kv_seqlen: Sequence[int], + causal_diagonal: Any = None, + ) -> "BlockDiagonalCausalWithOffsetPaddedKeysMask": + """Creates a :attr:`BlockDiagonalCausalWithOffsetPaddedKeysMask` from a list of tensor + lengths for query and key/value. + + Args: + q_seqlen (Sequence[int]): List or tensor of sequence lengths for query tensors + kv_padding (int): Padding for k/v - also an upperbound on each individual key length + kv_seqlen (Sequence[int]): List or tensor of sequence lengths for key/value. + causal_diagonal: unused, for BC only + Returns: + BlockDiagonalCausalWithOffsetPaddedKeysMask + """ + assert kv_seqlen is None or len(q_seqlen) == len(kv_seqlen), ( + q_seqlen, + kv_seqlen, + ) + q_seqinfo = _SeqLenInfo.from_seqlens(q_seqlen) + k_seqinfo = _PaddedSeqLenInfo.from_seqlens_padded(kv_seqlen, kv_padding) + return cls(q_seqinfo=q_seqinfo, k_seqinfo=k_seqinfo) + + +@dataclass +class BlockDiagonalCausalLocalAttentionMask(BlockDiagonalCausalMask): + """ + (Experimental feature) + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`. + This makes the mask "local" and the attention pattern banded. + + Query i only attends to keys in its block and cannot attend keys further than "window_size" + from it. + """ + + _window_size: int = 0 # forced due to inheritance and default arguments + + def __post_init__(self): + if self._window_size <= 0: + raise ValueError( + f"Expected `window_size > 0`, but window_size={self._window_size}" + ) + q_seqlen = [ + y - x + for x, y in zip( + self.q_seqinfo.seqstart_py[:-1], self.q_seqinfo.seqstart_py[1:] + ) + ] + kv_seqlen = [ + y - x + for x, y in zip( + self.k_seqinfo.seqstart_py[:-1], self.k_seqinfo.seqstart_py[1:] + ) + ] + for q, k in zip(q_seqlen, kv_seqlen): + if q - self._window_size >= k: + # Each query only attends to keys no further than window_size back. + # When q > k + window_size, there will be a query for which the window doesn't reach any key. 
+ raise RuntimeError( + f"No keys are attended in q_seqlen {q} k_seqlen {k} with sliding window {self._window_size}" + ) + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=1, + device=device, + ) + + num_queries, num_keys = shape[-2:] + mask = torch.tril(tensor, diagonal=0).to(dtype) # type: ignore + if self._window_size is not None and self._window_size > 0: + mask = torch.triu(mask, diagonal=-self._window_size + 1) + mask = torch.log(mask) + return mask.to(dtype) + + +@dataclass +class BlockDiagonalCausalLocalAttentionFromBottomRightMask( + BlockDiagonalCausalFromBottomRightMask +): + """ + (Experimental feature) + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`. + This makes the mask "local" and the attention pattern banded. + + Query i only attends to keys in its block and cannot attend keys further than "window_size" + from it. + """ + + _window_size: int = 0 # forced due to inheritance and default arguments + + def __post_init__(self): + super().__post_init__() + if self._window_size <= 0: + raise ValueError( + f"Expected `window_size > 0`, but window_size={self._window_size}" + ) + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=1, + device=device, + ) + num_queries, num_keys = shape[-2:] + mask = torch.tril(tensor, diagonal=num_keys - num_queries).to(dtype) # type: ignore + if self._window_size is not None: + mask = torch.triu( + mask, diagonal=num_keys - num_queries - self._window_size + 1 + ) + mask = torch.log(mask) + return mask.to(dtype) \ No newline at end of file diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py new file mode 100644 index 0000000000000..7de5ac6f84093 --- /dev/null +++ b/vllm/hpu/cache_ops.py @@ -0,0 +1,155 @@ +############################################################################### +# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. 
+# +############################################################################### + +from typing import Tuple +import torch +import habana_frameworks.torch as htorch + + +def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, is_prompt=False): + """ + key: [num_tokens, num_heads, head_size] + value: [num_tokens, num_heads, head_size] + key_cache: [num_heads, head_size, block_size] * num_blocks + value_cache: [num_heads, head_size, block_size] * num_blocks + slot_mapping: [num_tokens] + """ + num_tokens = key.shape[0] + block_size = key_cache.shape[-1] + slot_mapping = slot_mapping.to(key.device) + # block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + if is_prompt: + # indices = torch.tensor([i for i in range(0, block_size)], device=key.device) + for i in range(0, num_tokens, block_size): + # if block_idx_list[i] < 0: + # # indices.add_(block_size) + # continue + key_cache.index_put_([block_indices[i]], key[i:i+block_size].transpose(0,1).transpose(1,2)) + value_cache.index_put_([block_indices[i]], value[i:i+block_size].transpose(0,1).transpose(1,2)) + # key_cache.index_put_([block_indices[i]], key.index_select(0, indices).transpose(0,1).transpose(1,2)) + # value_cache.index_put_([block_indices[i]], value.index_select(0, indices).transpose(0,1).transpose(1,2)) + # indices.add_(block_size) + else: + # print(key_cache.data_ptr(), key_cache.shape) + # print(key_cache[2, :, :, 2]) + key_cache = key_cache.permute(0, 3, 1, 2) + value_cache = value_cache.permute(0, 3, 1, 2) + # print(key_cache.data_ptr(), key_cache.shape) + # print(key_cache[2, 2, :, :]) + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_offsets = torch.fmod(slot_mapping, block_size) + slot_indices = torch.stack([block_indices, block_offsets], dim=-1) + index = torch.tensor(0, device=key.device) + for i in range(num_tokens): + key_cache[slot_indices[i][0], slot_indices[i][1], :, :] = key[i] # key.index_select(0, index) + value_cache[slot_indices[i][0], slot_indices[i][1], :, :] = value[i] # value.index_select(0, index) + # key_cache.index_put_([slot_indices[i]], key[i]) + # value_cache.index_put_([slot_indices[i]], value[i]) + # key_cache.index_put_([slot_indices[i]], key.index_select(0, index)) + # value_cache.index_put_([slot_indices[i]], value.index_select(0, index)) + index.add_(1) + # print(key_cache.data_ptr(), key_cache.shape) + key_cache = key_cache.permute(0, 2, 3, 1) + value_cache = value_cache.permute(0, 2, 3, 1) + # print(key_cache.data_ptr(), key_cache.shape) + + + +''' +def create_cache_view( + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_idx: int, +) -> Tuple[torch.Tensor, torch.Tensor]: + _, num_heads, head_size, block_size = key_cache.shape + cache_stride = key_cache.stride() + cache_offset = key_cache.storage_offset() + block_shape = (1, num_heads, head_size, block_size) + block_offset = block_idx * (cache_stride[-1] * cache_stride[-2] * cache_stride[-3]) + key_block = torch.as_strided(key_cache, + block_shape, + cache_stride, + cache_offset+block_offset).squeeze(0) + value_block = torch.as_strided(value_cache, + block_shape, + cache_stride, + cache_offset+block_offset).squeeze(0) + return key_block, value_block + + +def reshape_and_cache_backup1(key, value, key_cache, value_cache, slot_mapping, is_prompt=False): + """ + key: [num_tokens, num_heads, head_size] + value: [num_tokens, num_heads, head_size] + 
key_cache: [num_heads, head_size, block_size] * num_blocks + value_cache: [num_heads, head_size, block_size] * num_blocks + slot_mapping: [num_tokens] + """ + block_size = key_cache[0].shape[2] + block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + if is_prompt: + indices = torch.tensor([i for i in range(0, block_size)], device=key.device) + for i in range(0, len(block_idx_list), block_size): # for i in range(0, block_indices.shape[0], block_size): + if block_idx_list[i] < 0: + continue + block_idx_tensor = block_indices.index_select(0, torch.tensor(i, device=key.device)) + key_cache.index_put_([block_idx_tensor], key.index_select(0, indices).transpose(0,1).transpose(1,2)) + value_cache.index_put_([block_idx_tensor], value.index_select(0, indices).transpose(0,1).transpose(1,2)) + indices.add_(block_size) + else: + block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] + block_offset_list = [slot_idx % block_size for slot_idx in slot_mapping.tolist()] + index = torch.tensor(0, device=key.device) + for block_idx, block_offset in zip(block_idx_list, block_offset_list): + key_block, value_block = create_cache_view(key_cache, value_cache, block_idx) + slot_idx = torch.tensor(block_offset, device=key.device) + key_block.index_copy_(-1, slot_idx, key.index_select(0, index).transpose(0,1).transpose(1,2)) + value_block.index_copy_(-1, slot_idx, value.index_select(0, index).transpose(0,1).transpose(1,2)) + index.add_(1) + + +def reshape_and_cache_backup2(key, value, key_cache, value_cache, slot_mapping, is_prompt=False): + """ + key: [num_tokens, num_heads, head_size] + value: [num_tokens, num_heads, head_size] + key_cache: [num_heads, head_size, block_size] * num_blocks + value_cache: [num_heads, head_size, block_size] * num_blocks + slot_mapping: [num_tokens] + """ + block_size = key_cache[0].shape[2] + block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] + if is_prompt: + cached_set = set() + indices = torch.tensor([i for i in range(0, block_size)], device=key.device) + for block_idx in block_idx_list: + if block_idx in cached_set or block_idx < 0: + continue + else: + cached_set.add(block_idx) + key_block, value_block = create_cache_view(key_cache, value_cache, block_idx) + key_block.copy_(key.index_select(0, indices).transpose(0,1).transpose(1,2)) + value_block.copy_(value.index_select(0, indices).transpose(0,1).transpose(1,2)) + indices.add_(block_size) + else: + block_offset_list = [slot_idx % block_size for slot_idx in slot_mapping.tolist()] + index = torch.tensor(0, device=key.device) + # slot_idx = torch.tensor(0, device=key.device) + for block_idx, block_offset in zip(block_idx_list, block_offset_list): + key_block, value_block = create_cache_view(key_cache, value_cache, block_idx) + # slot_idx.copy_(block_offset) + slot_idx = torch.tensor(block_offset, device=key.device) + key_block.index_copy_(-1, slot_idx, key.index_select(0, index).transpose(0,1).transpose(1,2)) + value_block.index_copy_(-1, slot_idx, value.index_select(0, index).transpose(0,1).transpose(1,2)) + index.add_(1) +''' \ No newline at end of file diff --git a/vllm/hpu/cuda_utils.py b/vllm/hpu/cuda_utils.py new file mode 100644 index 0000000000000..cb067fca13cca --- /dev/null +++ b/vllm/hpu/cuda_utils.py @@ -0,0 +1,14 @@ 
+############################################################################### +# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. +# +############################################################################### + +def get_device_attribute(attribute, device_id): + return 10240 # TODO: fake value now \ No newline at end of file diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py new file mode 100644 index 0000000000000..82556ebf78b20 --- /dev/null +++ b/vllm/hpu/ops.py @@ -0,0 +1,172 @@ +############################################################################### +# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. +# +############################################################################### + +import torch +import torch.nn as nn +import torch.nn.functional as F +import habana_frameworks.torch as htorch +from typing import List, Optional, Tuple + +def silu_and_mul(output, input): + htorch.core.mark_step() + d = input.shape[-1] // 2 + silu = torch.nn.SiLU().to(input.device) + x, y = torch.split(input, d, dim=-1) + output.copy_(silu(x) * y) + htorch.core.mark_step() + +def gelu_new(output, input): + raise NotImplementedError + +def gelu_fast(output, input): + raise NotImplementedError + +def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, max_context_len, alibi_slopes, attn_masks=None) -> None: + num_kv_heads = value_cache[0].shape[0] + head_size = value_cache[0].shape[1] + block_size = value_cache[0].shape[2] + num_seqs = query.shape[0] + num_query_heads = query.shape[1] + max_num_blocks_per_seq = block_tables.shape[1] + + if alibi_slopes or num_query_heads != num_kv_heads: #or attn_masks is None: + import pdb + pdb.set_trace() + raise NotImplementedError + + attn_weights_blocks = [] + value_blocks = [] + seq_index = torch.tensor([0], dtype=torch.int64, device="hpu") + + for i in range(0, max_num_blocks_per_seq): + # hard override for filler. 
These blocks would contribute nothing to the output due to zero attention_probs and will clog up compute resources + if (i - 2) * block_size > torch.max(context_lens): + break + attn_weights = torch.full((num_seqs, num_query_heads, 1, block_size), torch.finfo(query.dtype).min, dtype=query.dtype, device="hpu") + values = torch.zeros((num_seqs, num_query_heads, head_size, block_size), dtype=query.dtype, device="hpu") + for seq_id in range(num_seqs): + seq_index.fill_(seq_id) + if i * block_size < context_lens[seq_id]: + + q = torch.index_select(query, 0, seq_index).transpose(0, 1) + key = torch.index_select(key_cache, 0, block_tables[seq_id][i]).squeeze(0) + attn_weight = scale * torch.matmul(q, key) + + if attn_masks is not None: + attn_mask = torch.index_select(attn_masks[i], 0, seq_index) + attn_weight = torch.masked_fill(attn_weight, ~(attn_mask.unsqueeze(0).to(torch.bool)), torch.finfo(attn_weight.dtype).min) + + if context_lens[seq_id] < (i + 1) * block_size: + if context_lens[seq_id] - i*block_size < 0: + attn_weight = torch.finfo(query.dtype).min + else: + attn_weight[:, :, context_lens[seq_id] - i*block_size:] = torch.finfo(query.dtype).min + attn_weights.index_copy_(0, seq_index, attn_weight.unsqueeze(0)) + #attn_weights[attn_weights == 0.0] = torch.finfo(query.dtype).min + #if (i - 2) * block_size < max_context_len: + value = torch.index_select(value_cache, 0, block_tables[seq_id][i]) + value = torch.nan_to_num(value) + value[value < -1.0e+30] = 0.0 + values.index_copy_(0, seq_index, value) + torch.hpu.synchronize() + + attn_weights_blocks.append(attn_weights.reshape(num_seqs * num_query_heads, 1, block_size)) + value_blocks.append(values.reshape(num_seqs * num_query_heads, head_size, block_size).transpose(1, 2)) + + exp_sum = torch.zeros((*attn_weights_blocks[0].shape[:2], 1), dtype=attn_weights_blocks[0].dtype, device="hpu") + for x in attn_weights_blocks: + exp_sum.add_(torch.exp(x).sum(dim=-1, keepdim=True)) + + output = torch.zeros_like(query) + for i in range(len(attn_weights_blocks)): + attention_probs = torch.exp(attn_weights_blocks[i]) / exp_sum + value = value_blocks[i] + out = torch.matmul(attention_probs.to(value.dtype), value).reshape(num_seqs, num_query_heads, head_size) + output.add_(out) + htorch.core.mark_step() + return output + +def rms_norm(out, hidden_states, weight, eps): + htorch.core.mark_step() + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + eps) + out.copy_(weight * hidden_states.to(input_dtype)) + htorch.core.mark_step() + +def rotate_neox(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def rotate_gptj(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) + + +def apply_rope( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + is_neox_style: bool, +) -> Tuple[torch.Tensor, torch.Tensor]: + rotate_fn = rotate_neox if is_neox_style else rotate_gptj + q_embed = (q * cos) + (rotate_fn(q) * sin) + k_embed = (k * cos) + (rotate_fn(k) * sin) + return q_embed, k_embed + + +def rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox_style): + raise NotImplementedError + # update query and key in-place + num_tokens = query.shape[0] + num_heads = query.shape[-1] // head_size + query = 
query.view(num_tokens, num_heads, head_size) + key = key.view(num_tokens, num_heads, head_size) + cos, sin = torch.split(cos_sin_cache, cos_sin_cache.shape[-1] // 2, dim=-1) + if is_neox_style: + sin = torch.cat((sin, sin), dim=-1) + cos = torch.cat((cos, cos), dim=-1) + else: + sin = torch.repeat_interleave(sin, 2, -1) + cos = torch.repeat_interleave(cos, 2, -1) + + query_rot = query[..., :head_size] + query_pass = query[..., head_size:] + key_rot = key[..., :head_size] + key_pass = key[..., head_size:] + + query_rot = query_rot.transpose(0, 1) + key_rot = key_rot.transpose(0, 1) + cos = F.embedding(positions, cos) + sin = F.embedding(positions, sin) + + query_rot, key_rot = apply_rope(query_rot, key_rot, cos, sin, + is_neox_style) + query_rot = query_rot.transpose(0, 1).contiguous() + key_rot = key_rot.transpose(0, 1).contiguous() + + query.copy_(torch.cat((query_rot, query_pass), dim=-1)) + key.copy_(torch.cat((key_rot, key_pass), dim=-1)) + htorch.core.mark_step() + + # Output query/key shape: [num_tokens, num_tokens, head_size] + return query, key + #raise NotImplementedError + +def awq_gemm(*args): + raise NotImplementedError diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 1af120d13cd4b..2bdd3a62b8b39 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -6,7 +6,11 @@ import torch.nn as nn import torch.nn.functional as F -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 1da061a6a52c3..d7b49e8d5a52c 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -4,13 +4,20 @@ import torch import torch.nn as nn from xformers import ops as xops -from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, - LowerTriangularMaskWithTensorBias) -from vllm._C import ops -from vllm._C import cache_ops from vllm.model_executor.input_metadata import InputMetadata -from vllm.utils import is_hip +from vllm.utils import is_hip, is_hpu + +if is_hpu(): + from vllm.hpu import ops + from vllm.hpu import cache_ops + from vllm.hpu.attn_bias import (BlockDiagonalCausalMask, + LowerTriangularMaskWithTensorBias) +else: + from vllm._C import ops + from vllm._C import cache_ops + from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, + LowerTriangularMaskWithTensorBias) _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. @@ -239,7 +246,7 @@ def _paged_attention( # For context len > 8192, use V2 kernel to avoid shared memory shortage. use_v1 = input_metadata.max_context_len <= 8192 and ( max_num_partitions == 1 or num_seqs * num_heads > 512) - if use_v1: + if use_v1 or is_hpu(): # Run PagedAttention V1. 
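            # Sketch (not part of this patch): the HPU ops.paged_attention_v1 added earlier in
            # this series walks the KV cache block by block and normalizes with a shared exp-sum
            # instead of calling a fused kernel. A minimal self-contained equivalent of that
            # normalization, with an illustrative helper name (it mirrors the code above, which
            # does not do the usual max-subtraction):
            #
            #     import torch
            #
            #     def blockwise_softmax_matmul(score_blocks, value_blocks):
            #         # score_blocks[i]: [..., 1, block_size]; value_blocks[i]: [..., block_size, head_size]
            #         exp_sum = sum(torch.exp(s).sum(dim=-1, keepdim=True) for s in score_blocks)
            #         out = None
            #         for s, v in zip(score_blocks, value_blocks):
            #             contrib = (torch.exp(s) / exp_sum) @ v  # this block's share of softmax(QK^T) @ V
            #             out = contrib if out is None else out + contrib
            #         return out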
output = ops.paged_attention_v1( query, diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index ccfd55806d5c6..57e65c04e4019 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -4,7 +4,11 @@ import torch import torch.nn as nn -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops try: from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 831576b1d7cd7..4e0a0ec51beb6 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -3,7 +3,11 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import QuantizationConfig diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 8fe96e7ddb98d..716c6f88d9e62 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -5,7 +5,11 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 1932bd145076b..c3d71e4309dbb 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -3,11 +3,14 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import QuantizationConfig -from vllm.utils import is_hip +from vllm.utils import is_hip, is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops class SqueezeLLMConfig(QuantizationConfig): diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 94aa842f71ff2..b2efae7d85540 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -27,7 +27,11 @@ import torch import torch.nn as nn -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops def _rotate_neox(x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/utils.py b/vllm/utils.py index b1af1c740e9b9..2b78f31946e24 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -6,8 +6,6 @@ import psutil import torch -from vllm._C import cuda_utils - class Device(enum.Enum): GPU = enum.auto() @@ -32,6 +30,16 @@ def is_hip() -> bool: return torch.version.hip is not None +def is_hpu() -> bool: + return getattr(torch, 'hpu', None) is not None and torch.hpu.is_available() + + +if is_hpu(): + from vllm.hpu import cuda_utils +else: + from vllm._C import cuda_utils + + def 
get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html @@ -59,7 +67,3 @@ def get_open_port(): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", 0)) return s.getsockname()[1] - - -def is_hpu() -> bool: - return getattr(torch, 'hpu', None) is not None and torch.hpu.is_available() \ No newline at end of file diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 0eaa3fa28f246..64680ac5f8fc5 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -3,11 +3,15 @@ import torch -from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger from vllm.utils import in_wsl, is_hpu +if is_hpu(): + from vllm.hpu import cache_ops +else: + from vllm._C import cache_ops + logger = init_logger(__name__) KVCache = Tuple[torch.Tensor, torch.Tensor] From c0d3c69ce125f4bb3ee5db31bff3dec96c170c2c Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Tue, 23 Jan 2024 16:00:37 +0200 Subject: [PATCH 20/43] Add xops bindings --- tests/kernels/test_attention.py | 3 +- vllm/hpu/xops.py | 103 ++++++++++++++++++++++++ vllm/model_executor/layers/attention.py | 3 +- 3 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 vllm/hpu/xops.py diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 8ff15aceac542..f2242d7d95e49 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -3,14 +3,15 @@ import pytest import torch -from xformers import ops as xops from vllm.utils import get_max_shared_memory_bytes, is_hpu if is_hpu(): from vllm.hpu import ops + from vllm.hpu import xops from vllm.hpu.attn_bias import BlockDiagonalCausalMask else: from vllm._C import ops + from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py new file mode 100644 index 0000000000000..691d30b0fba90 --- /dev/null +++ b/vllm/hpu/xops.py @@ -0,0 +1,103 @@ +############################################################################### +# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. 
+# +############################################################################### + + +import habana_frameworks.torch as htorch +import torch +import torch.nn.functional as F +from typing import List, Optional, Tuple, Union +from .attn_bias import AttentionBias + + +# # xops.memory_efficient_attention_forward +# def memory_efficient_attention_forward( +# query: torch.Tensor, +# key: torch.Tensor, +# value: torch.Tensor, +# attn_bias = None, +# p: float = 0.0, +# scale: Optional[float] = None +# ) -> torch.Tensor: +# # scale = 1 / query.shape[-1] ** 0.5 +# query = query * scale +# attn = query @ key.transpose(-2, -1) +# if attn_bias is not None: +# shape=(query.shape[0], query.shape[1], query.shape[-2], query.shape[-2]) +# attn_mask = torch.full(shape, dtype=query.dtype, fill_value=float("-inf"), device=query.device) +# attn_mask = torch.triu(attn_mask, diagonal=1).to(query.dtype) +# attn = attn + attn_mask +# attn = attn.softmax(-1) +# attn = torch.nn.functional.dropout(attn, p) +# return attn @ value + + +def block_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, + attn_mask: Optional[torch.Tensor] = None, +) -> torch.Tensor: + query = query * scale + attn = query.transpose(0,1) @ key.transpose(0, 1).transpose(1, 2) + if attn_mask is not None: + attn = attn + attn_mask.to(attn.dtype) + attn = attn.softmax(-1) + out = attn @ value.transpose(0, 1) + out = out.transpose(0, 1) + return out + + +def memory_efficient_attention_forward( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seq_lens: List[int], + attn_bias: Optional[torch.Tensor] = None, + p: float = 0.0, + scale: Optional[float] = None, +) -> torch.Tensor: + dim = query.dim() + if dim == 4: + query, key, value = query.squeeze(0), key.squeeze(0), value.squeeze(0) + num_seqs = len(cu_seq_lens) - 1 + outputs = [] + for i in range(num_seqs): + start_idx = cu_seq_lens[i] + end_idx = cu_seq_lens[i + 1] + seq_len = end_idx - start_idx + mask_start_idx = i * seq_len + mask_end_idx = (i + 1) * seq_len + + # # Create attention mask. 
+ # attn_mask = torch.ones(seq_len, seq_len, dtype=query.dtype) + # attn_mask[:seq_lens[i],:seq_lens[i]] = torch.triu( + # attn_mask[:seq_lens[i],:seq_lens[i]], + # diagonal=1 + # ) + # attn_mask = attn_mask * -10000.0 # torch.finfo(query.dtype).min + # attn_mask = attn_mask.to(dtype=query.dtype, device=query.device) + + attn_mask = attn_bias.materialize(device=query.device) + output = block_masked_attention( + query[start_idx:end_idx], + key[start_idx:end_idx], + value[start_idx:end_idx], + scale, + attn_mask=attn_mask[mask_start_idx:mask_end_idx, + mask_start_idx:mask_end_idx], # attn_mask=attn_mask, + ) + outputs.append(output) + out = torch.cat(outputs, dim=0) + if dim == 4: + out = out.unsqueeze(0) + return out diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index d7b49e8d5a52c..f0955671bdf82 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -3,7 +3,6 @@ import torch import torch.nn as nn -from xformers import ops as xops from vllm.model_executor.input_metadata import InputMetadata from vllm.utils import is_hip, is_hpu @@ -11,11 +10,13 @@ if is_hpu(): from vllm.hpu import ops from vllm.hpu import cache_ops + from vllm.hpu import xops from vllm.hpu.attn_bias import (BlockDiagonalCausalMask, LowerTriangularMaskWithTensorBias) else: from vllm._C import ops from vllm._C import cache_ops + from xformers import ops as xops from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, LowerTriangularMaskWithTensorBias) From 48b26d1912c75ea96f7e6559ed01c84c7f4cc863 Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Tue, 23 Jan 2024 17:20:16 +0200 Subject: [PATCH 21/43] Cast paged attention inputs to bfloat16 --- benchmarks/kernels/benchmark_paged_attention.py | 2 +- tests/kernels/conftest.py | 4 ++-- vllm/hpu/ops.py | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index f22acca3b7909..e47a5313c444c 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -4,7 +4,7 @@ import torch -from vllm.utils import is_hpu() +from vllm.utils import is_hpu if is_hpu(): from vllm.hpu import ops else: diff --git a/tests/kernels/conftest.py b/tests/kernels/conftest.py index 80b62e4e0ef7d..17af2f5c3868d 100644 --- a/tests/kernels/conftest.py +++ b/tests/kernels/conftest.py @@ -20,9 +20,9 @@ def create_kv_caches( scale = head_size**-0.5 x = 16 // torch.tensor([], dtype=dtype).element_size() if is_hpu(): - key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) - else: key_cache_shape = (num_blocks, num_heads, head_size, block_size) + else: + key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) key_caches = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 82556ebf78b20..30ee852f7d367 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -30,7 +30,10 @@ def gelu_new(output, input): def gelu_fast(output, input): raise NotImplementedError -def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, max_context_len, alibi_slopes, attn_masks=None) -> None: +def paged_attention_v1(query_in, key_cache_in, value_cache_in, head_mapping, scale, block_tables, context_lens, block_size, max_context_len, alibi_slopes, attn_masks=None) -> None: + query = 
query_in.bfloat16() + key_cache = key_cache_in.bfloat16() + value_cache = value_cache_in.bfloat16() num_kv_heads = value_cache[0].shape[0] head_size = value_cache[0].shape[1] block_size = value_cache[0].shape[2] @@ -93,7 +96,7 @@ def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block out = torch.matmul(attention_probs.to(value.dtype), value).reshape(num_seqs, num_query_heads, head_size) output.add_(out) htorch.core.mark_step() - return output + return output.to(dtype=query_in.dtype) def rms_norm(out, hidden_states, weight, eps): htorch.core.mark_step() From aefa573b0fcf0a8c1bdfab65e93c3489f9b2b40a Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Fri, 26 Jan 2024 12:08:30 +0000 Subject: [PATCH 22/43] Remove leftover debug calls --- vllm/hpu/ops.py | 2 -- vllm/model_executor/layers/rotary_embedding.py | 6 ------ vllm/worker/worker.py | 2 -- 3 files changed, 10 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 30ee852f7d367..cfbe3c4ab8eac 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -42,8 +42,6 @@ def paged_attention_v1(query_in, key_cache_in, value_cache_in, head_mapping, sca max_num_blocks_per_seq = block_tables.shape[1] if alibi_slopes or num_query_heads != num_kv_heads: #or attn_masks is None: - import pdb - pdb.set_trace() raise NotImplementedError attn_weights_blocks = [] diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index b2efae7d85540..dd64434f64f20 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -126,8 +126,6 @@ def __init__(self, head_size, rotary_dim, max_position_embeddings=2048, base=100 ) def _set_cos_sin_cache(self, seq_len, device, dtype): - #import pdb - #pdb.set_trace() self.max_seq_len_cached = seq_len t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) @@ -138,8 +136,6 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): - #import pdb - #pdb.set_trace() seq_len = key.shape[-2] if seq_len > self.max_seq_len_cached: self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) @@ -148,7 +144,6 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tenso query = query.reshape((query.shape[0], query.shape[1], query.shape[2] // self.head_size, self.head_size)) key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) if query.device.type == "hpu" and FusedRoPE: - #print('using FusedRoPE') if len(positions[0]) == 1: cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) @@ -157,7 +152,6 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tenso sin = sin[positions].unsqueeze(2) query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) else: - #print('using torch RoPE') query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 22c53005aa7f5..49600689f4c6a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -324,8 +324,6 @@ def round_up(n, multiple): 
elif (i + 1) * self.block_size <= context_lens[seq_id]: attn_masks[i][seq_id, :] = 1 input_metadata.attention_masks = attn_masks.to(device="cuda") - # import pdb - # pdb.set_trace() print("input token shape: ", tokens_tensor.shape) return tokens_tensor, positions_tensor, input_metadata From c49b68e3fed7410b672492cd9bb740cc067ce634 Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Fri, 26 Jan 2024 12:23:09 +0000 Subject: [PATCH 23/43] Update comments on HPU ops --- vllm/hpu/ops.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index cfbe3c4ab8eac..f3dd9c2c575a3 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -49,7 +49,8 @@ def paged_attention_v1(query_in, key_cache_in, value_cache_in, head_mapping, sca seq_index = torch.tensor([0], dtype=torch.int64, device="hpu") for i in range(0, max_num_blocks_per_seq): - # hard override for filler. These blocks would contribute nothing to the output due to zero attention_probs and will clog up compute resources + # FIXME: dynamic hard override for filler. These blocks would contribute nothing to the output due to zero attention_probs and + # will clog up compute resources. The override itself makes the code unsuitable for graph precompilation if (i - 2) * block_size > torch.max(context_lens): break attn_weights = torch.full((num_seqs, num_query_heads, 1, block_size), torch.finfo(query.dtype).min, dtype=query.dtype, device="hpu") @@ -66,15 +67,16 @@ def paged_attention_v1(query_in, key_cache_in, value_cache_in, head_mapping, sca attn_mask = torch.index_select(attn_masks[i], 0, seq_index) attn_weight = torch.masked_fill(attn_weight, ~(attn_mask.unsqueeze(0).to(torch.bool)), torch.finfo(attn_weight.dtype).min) + # FIXME: these dynamic checks serve to ensure the -inf default value is not overwritten with fillers that would cause errors + # in logsoftmax computation. A change to custom block multiplication code is required to avoid incurring extra costs here if context_lens[seq_id] < (i + 1) * block_size: if context_lens[seq_id] - i*block_size < 0: attn_weight = torch.finfo(query.dtype).min else: attn_weight[:, :, context_lens[seq_id] - i*block_size:] = torch.finfo(query.dtype).min attn_weights.index_copy_(0, seq_index, attn_weight.unsqueeze(0)) - #attn_weights[attn_weights == 0.0] = torch.finfo(query.dtype).min - #if (i - 2) * block_size < max_context_len: value = torch.index_select(value_cache, 0, block_tables[seq_id][i]) + # FIXME: these checks concern filler values in the V cache and should be removed once the underlying issue is addressed value = torch.nan_to_num(value) value[value < -1.0e+30] = 0.0 values.index_copy_(0, seq_index, value) @@ -132,6 +134,8 @@ def apply_rope( def rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox_style): + # FIXME: the below code is unused legacy code not meant to be used. 
Use FusedRoPE + # on HPU and delete this once coverage is verified raise NotImplementedError # update query and key in-place num_tokens = query.shape[0] @@ -167,7 +171,6 @@ def rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox_st # Output query/key shape: [num_tokens, num_tokens, head_size] return query, key - #raise NotImplementedError def awq_gemm(*args): raise NotImplementedError From c5c2a9967e163c8235e65483883a7bd0fca6b5c6 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 2 Feb 2024 16:52:44 +0200 Subject: [PATCH 24/43] Restoring NVIDIA compatibility in setup.py --- setup.py | 457 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 231 insertions(+), 226 deletions(-) diff --git a/setup.py b/setup.py index 17cb874a0955b..33f9627b94f1b 100644 --- a/setup.py +++ b/setup.py @@ -17,225 +17,227 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"} -# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) - +#SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) def _is_hip() -> bool: return torch.version.hip is not None def _is_cuda() -> bool: - return torch.version.cuda is not None - - -# # Compiler flags. -# CXX_FLAGS = ["-g", "-O2", "-std=c++17"] -# # TODO(woosuk): Should we use -O3? -# NVCC_FLAGS = ["-O2", "-std=c++17"] - -# if _is_hip(): -# if ROCM_HOME is None: -# raise RuntimeError( -# "Cannot find ROCM_HOME. ROCm must be available to build the package." -# ) -# NVCC_FLAGS += ["-DUSE_ROCM"] - -# if _is_cuda() and CUDA_HOME is None: -# raise RuntimeError( -# "Cannot find CUDA_HOME. CUDA must be available to build the package.") - -# ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 -# CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -# NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] - - -# def get_amdgpu_offload_arch(): -# command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" -# try: -# output = subprocess.check_output([command]) -# return output.decode('utf-8').strip() -# except subprocess.CalledProcessError as e: -# error_message = f"Error: {e}" -# raise RuntimeError(error_message) from e -# except FileNotFoundError as e: -# # If the command is not found, print an error message -# error_message = f"The command {command} was not found." -# raise RuntimeError(error_message) from e - -# return None - - -# def get_hipcc_rocm_version(): -# # Run the hipcc --version command -# result = subprocess.run(['hipcc', '--version'], -# stdout=subprocess.PIPE, -# stderr=subprocess.STDOUT, -# text=True) - -# # Check if the command was executed successfully -# if result.returncode != 0: -# print("Error running 'hipcc --version'") -# return None - -# # Extract the version using a regular expression -# match = re.search(r'HIP version: (\S+)', result.stdout) -# if match: -# # Return the version string -# return match.group(1) -# else: -# print("Could not find HIP version in the output") -# return None - - -# def get_nvcc_cuda_version(cuda_dir: str) -> Version: -# """Get the CUDA version from nvcc. 
- -# Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py -# """ -# nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], -# universal_newlines=True) -# output = nvcc_output.split() -# release_idx = output.index("release") + 1 -# nvcc_cuda_version = parse(output[release_idx].split(",")[0]) -# return nvcc_cuda_version - - -# def get_torch_arch_list() -> Set[str]: -# # TORCH_CUDA_ARCH_LIST can have one or more architectures, -# # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the -# # compiler to additionally include PTX code that can be runtime-compiled -# # and executed on the 8.6 or newer architectures. While the PTX code will -# # not give the best performance on the newer architectures, it provides -# # forward compatibility. -# env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) -# if env_arch_list is None: -# return set() - -# # List are separated by ; or space. -# torch_arch_list = set(env_arch_list.replace(" ", ";").split(";")) -# if not torch_arch_list: -# return set() - -# # Filter out the invalid architectures and print a warning. -# valid_archs = NVIDIA_SUPPORTED_ARCHS.union( -# {s + "+PTX" -# for s in NVIDIA_SUPPORTED_ARCHS}) -# arch_list = torch_arch_list.intersection(valid_archs) -# # If none of the specified architectures are valid, raise an error. -# if not arch_list: -# raise RuntimeError( -# "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " -# f"variable ({env_arch_list}) is supported. " -# f"Supported CUDA/ROCM architectures are: {valid_archs}.") -# invalid_arch_list = torch_arch_list - valid_archs -# if invalid_arch_list: -# warnings.warn( -# f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " -# "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " -# f"({env_arch_list}). Supported CUDA/ROCM architectures are: " -# f"{valid_archs}.", -# stacklevel=2) -# return arch_list - - -# # First, check the TORCH_CUDA_ARCH_LIST environment variable. -# compute_capabilities = get_torch_arch_list() -# if _is_cuda() and not compute_capabilities: -# # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available -# # GPUs on the current machine. -# device_count = torch.cuda.device_count() -# for i in range(device_count): -# major, minor = torch.cuda.get_device_capability(i) -# if major < 7: -# raise RuntimeError( -# "GPUs with compute capability below 7.0 are not supported.") -# compute_capabilities.add(f"{major}.{minor}") - -# if _is_cuda(): -# nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) -# if not compute_capabilities: -# # If no GPU is specified nor available, add all supported architectures -# # based on the NVCC CUDA version. -# compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() -# if nvcc_cuda_version < Version("11.1"): -# compute_capabilities.remove("8.6") -# if nvcc_cuda_version < Version("11.8"): -# compute_capabilities.remove("8.9") -# compute_capabilities.remove("9.0") -# # Validate the NVCC CUDA version. -# if nvcc_cuda_version < Version("11.0"): -# raise RuntimeError( -# "CUDA 11.0 or higher is required to build the package.") -# if (nvcc_cuda_version < Version("11.1") -# and any(cc.startswith("8.6") for cc in compute_capabilities)): -# raise RuntimeError( -# "CUDA 11.1 or higher is required for compute capability 8.6.") -# if nvcc_cuda_version < Version("11.8"): -# if any(cc.startswith("8.9") for cc in compute_capabilities): -# # CUDA 11.8 is required to generate the code targeting compute capability 8.9. 
-# # However, GPUs with compute capability 8.9 can also run the code generated by -# # the previous versions of CUDA 11 and targeting compute capability 8.0. -# # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0 -# # instead of 8.9. -# warnings.warn( -# "CUDA 11.8 or higher is required for compute capability 8.9. " -# "Targeting compute capability 8.0 instead.", -# stacklevel=2) -# compute_capabilities = set(cc for cc in compute_capabilities -# if not cc.startswith("8.9")) -# compute_capabilities.add("8.0+PTX") -# if any(cc.startswith("9.0") for cc in compute_capabilities): -# raise RuntimeError( -# "CUDA 11.8 or higher is required for compute capability 9.0.") - -# # Add target compute capabilities to NVCC flags. -# for capability in compute_capabilities: -# num = capability[0] + capability[2] -# NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] -# if capability.endswith("+PTX"): -# NVCC_FLAGS += [ -# "-gencode", f"arch=compute_{num},code=compute_{num}" -# ] - -# # Use NVCC threads to parallelize the build. -# if nvcc_cuda_version >= Version("11.2"): -# nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) -# num_threads = min(os.cpu_count(), nvcc_threads) -# NVCC_FLAGS += ["--threads", str(num_threads)] - -# elif _is_hip(): -# amd_arch = get_amdgpu_offload_arch() -# if amd_arch not in ROCM_SUPPORTED_ARCHS: -# raise RuntimeError( -# f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" -# f"amdgpu_arch_found: {amd_arch}") - -# # ext_modules = [] - -# vllm_extension_sources = [ -# "csrc/cache_kernels.cu", -# "csrc/attention/attention_kernels.cu", -# "csrc/pos_encoding_kernels.cu", -# "csrc/activation_kernels.cu", -# "csrc/layernorm_kernels.cu", -# "csrc/quantization/squeezellm/quant_cuda_kernel.cu", -# "csrc/quantization/gptq/q_gemm.cu", -# "csrc/cuda_utils_kernels.cu", -# "csrc/pybind.cpp", -# ] -# -# if _is_cuda(): -# vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") - -# vllm_extension = CUDAExtension( -# name="vllm._C", -# sources=vllm_extension_sources, -# extra_compile_args={ -# "cxx": CXX_FLAGS, -# "nvcc": NVCC_FLAGS, -# }, -# ) -# ext_modules.append(vllm_extension) + return torch.version.cuda is not None and torch.cuda.is_available() + + +# Compiler flags. +CXX_FLAGS = [] +# TODO(woosuk): Should we use -O3? +NVCC_FLAGS = [] + +if _is_cuda() or _is_hip(): + CXX_FLAGS = ["-g", "-O2", "-std=c++17"] + NVCC_FLAGS = ["-O2", "-std=c++17"] + + ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 + CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] + NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] + +if _is_hip(): + if ROCM_HOME is None: + raise RuntimeError( + "Cannot find ROCM_HOME. ROCm must be available to build the package." + ) + NVCC_FLAGS += ["-DUSE_ROCM"] + +if _is_cuda() and CUDA_HOME is None: + raise RuntimeError( + "Cannot find CUDA_HOME. CUDA must be available to build the package.") + +def get_amdgpu_offload_arch(): + command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" + try: + output = subprocess.check_output([command]) + return output.decode('utf-8').strip() + except subprocess.CalledProcessError as e: + error_message = f"Error: {e}" + raise RuntimeError(error_message) from e + except FileNotFoundError as e: + # If the command is not found, print an error message + error_message = f"The command {command} was not found." 
+ raise RuntimeError(error_message) from e + + return None + + +def get_hipcc_rocm_version(): + # Run the hipcc --version command + result = subprocess.run(['hipcc', '--version'], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True) + + # Check if the command was executed successfully + if result.returncode != 0: + print("Error running 'hipcc --version'") + return None + + # Extract the version using a regular expression + match = re.search(r'HIP version: (\S+)', result.stdout) + if match: + # Return the version string + return match.group(1) + else: + print("Could not find HIP version in the output") + return None + + +def get_nvcc_cuda_version(cuda_dir: str) -> Version: + """Get the CUDA version from nvcc. + + Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py + """ + nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], + universal_newlines=True) + output = nvcc_output.split() + release_idx = output.index("release") + 1 + nvcc_cuda_version = parse(output[release_idx].split(",")[0]) + return nvcc_cuda_version + + +def get_torch_arch_list() -> Set[str]: + if _is_cuda(): + # TORCH_CUDA_ARCH_LIST can have one or more architectures, + # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the + # compiler to additionally include PTX code that can be runtime-compiled + # and executed on the 8.6 or newer architectures. While the PTX code will + # not give the best performance on the newer architectures, it provides + # forward compatibility. + env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) + if env_arch_list is None: + return set() + + # List are separated by ; or space. + torch_arch_list = set(env_arch_list.replace(" ", ";").split(";")) + if not torch_arch_list: + return set() + + # Filter out the invalid architectures and print a warning. + valid_archs = NVIDIA_SUPPORTED_ARCHS.union( + {s + "+PTX" + for s in NVIDIA_SUPPORTED_ARCHS}) + arch_list = torch_arch_list.intersection(valid_archs) + # If none of the specified architectures are valid, raise an error. + if not arch_list: + raise RuntimeError( + "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " + f"variable ({env_arch_list}) is supported. " + f"Supported CUDA/ROCM architectures are: {valid_archs}.") + invalid_arch_list = torch_arch_list - valid_archs + if invalid_arch_list: + warnings.warn( + f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " + "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " + f"({env_arch_list}). Supported CUDA/ROCM architectures are: " + f"{valid_archs}.", + stacklevel=2) + return arch_list + +# First, check the TORCH_CUDA_ARCH_LIST environment variable. +compute_capabilities = get_torch_arch_list() +if _is_cuda() and not compute_capabilities: + # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available + # GPUs on the current machine. + device_count = torch.cuda.device_count() + for i in range(device_count): + major, minor = torch.cuda.get_device_capability(i) + if major < 7: + raise RuntimeError( + "GPUs with compute capability below 7.0 are not supported.") + compute_capabilities.add(f"{major}.{minor}") + +if _is_cuda(): + nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) + if not compute_capabilities: + # If no GPU is specified nor available, add all supported architectures + # based on the NVCC CUDA version. 
+ compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() + if nvcc_cuda_version < Version("11.1"): + compute_capabilities.remove("8.6") + if nvcc_cuda_version < Version("11.8"): + compute_capabilities.remove("8.9") + compute_capabilities.remove("9.0") + # Validate the NVCC CUDA version. + if nvcc_cuda_version < Version("11.0"): + raise RuntimeError( + "CUDA 11.0 or higher is required to build the package.") + if (nvcc_cuda_version < Version("11.1") + and any(cc.startswith("8.6") for cc in compute_capabilities)): + raise RuntimeError( + "CUDA 11.1 or higher is required for compute capability 8.6.") + if nvcc_cuda_version < Version("11.8"): + if any(cc.startswith("8.9") for cc in compute_capabilities): + # CUDA 11.8 is required to generate the code targeting compute capability 8.9. + # However, GPUs with compute capability 8.9 can also run the code generated by + # the previous versions of CUDA 11 and targeting compute capability 8.0. + # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0 + # instead of 8.9. + warnings.warn( + "CUDA 11.8 or higher is required for compute capability 8.9. " + "Targeting compute capability 8.0 instead.", + stacklevel=2) + compute_capabilities = set(cc for cc in compute_capabilities + if not cc.startswith("8.9")) + compute_capabilities.add("8.0+PTX") + if any(cc.startswith("9.0") for cc in compute_capabilities): + raise RuntimeError( + "CUDA 11.8 or higher is required for compute capability 9.0.") + + # Add target compute capabilities to NVCC flags. + for capability in compute_capabilities: + num = capability[0] + capability[2] + NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] + if capability.endswith("+PTX"): + NVCC_FLAGS += [ + "-gencode", f"arch=compute_{num},code=compute_{num}" + ] + + # Use NVCC threads to parallelize the build. 
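    # Worked example for the -gencode loop a few lines above (illustrative capability, not
    # taken from this patch): for capability "8.6+PTX", num = "8" + "6" = "86", so NVCC_FLAGS
    # gains
    #     -gencode arch=compute_86,code=sm_86
    # and, because of the "+PTX" suffix, additionally
    #     -gencode arch=compute_86,code=compute_86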
+ if nvcc_cuda_version >= Version("11.2"): + nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) + num_threads = min(os.cpu_count(), nvcc_threads) + NVCC_FLAGS += ["--threads", str(num_threads)] + +elif _is_hip(): + amd_arch = get_amdgpu_offload_arch() + if amd_arch not in ROCM_SUPPORTED_ARCHS: + raise RuntimeError( + f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" + f"amdgpu_arch_found: {amd_arch}") + +ext_modules = [] + +vllm_extension_sources = [ + "csrc/cache_kernels.cu", + "csrc/attention/attention_kernels.cu", + "csrc/pos_encoding_kernels.cu", + "csrc/activation_kernels.cu", + "csrc/layernorm_kernels.cu", + "csrc/quantization/squeezellm/quant_cuda_kernel.cu", + "csrc/quantization/gptq/q_gemm.cu", + "csrc/cuda_utils_kernels.cu", + "csrc/pybind.cpp", +] + +if _is_cuda(): + vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") + + vllm_extension = CUDAExtension( + name="vllm._C", + sources=vllm_extension_sources, + extra_compile_args={ + "cxx": CXX_FLAGS, + "nvcc": NVCC_FLAGS, + }, + ) + ext_modules.append(vllm_extension) def get_path(*filepath) -> str: @@ -258,17 +260,17 @@ def find_version(filepath: str) -> str: def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) - # if _is_hip(): - # # Get the HIP version - # hipcc_version = get_hipcc_rocm_version() - # if hipcc_version != MAIN_CUDA_VERSION: - # rocm_version_str = hipcc_version.replace(".", "")[:3] - # version += f"+rocm{rocm_version_str}" - # else: - # cuda_version = str(nvcc_cuda_version) - # if cuda_version != MAIN_CUDA_VERSION: - # cuda_version_str = cuda_version.replace(".", "")[:3] - # version += f"+cu{cuda_version_str}" + if _is_hip(): + # Get the HIP version + hipcc_version = get_hipcc_rocm_version() + if hipcc_version != MAIN_CUDA_VERSION: + rocm_version_str = hipcc_version.replace(".", "")[:3] + version += f"+rocm{rocm_version_str}" + elif _is_cuda(): + cuda_version = str(nvcc_cuda_version) + if cuda_version != MAIN_CUDA_VERSION: + cuda_version_str = cuda_version.replace(".", "")[:3] + version += f"+cu{cuda_version_str}" return version @@ -283,6 +285,9 @@ def get_requirements() -> List[str]: if _is_hip(): with open(get_path("requirements-rocm.txt")) as f: requirements = f.read().strip().split("\n") + elif _is_cuda(): + with open(get_path("requirements-cuda.txt")) as f: + requirements = f.read().strip().split("\n") else: with open(get_path("requirements.txt")) as f: requirements = f.read().strip().split("\n") @@ -315,6 +320,6 @@ def get_requirements() -> List[str]: "examples", "tests")), python_requires=">=3.8", install_requires=get_requirements(), - # ext_modules=ext_modules, - # cmdclass={"build_ext": BuildExtension}, + ext_modules=ext_modules, + cmdclass={"build_ext": BuildExtension} if _is_cuda() or _is_hip() else {}, ) From 1c669080dab477a9f71015db74136aa31391822f Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 5 Feb 2024 13:46:10 +0200 Subject: [PATCH 25/43] vllm.hpu cleanup --- vllm/hpu/cache_ops.py | 115 +----------------- vllm/hpu/ops.py | 34 ------ vllm/hpu/xops.py | 34 +----- .../model_executor/layers/rotary_embedding.py | 2 +- 4 files changed, 6 insertions(+), 179 deletions(-) diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 7de5ac6f84093..5c678587c6ff9 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -26,130 +26,21 @@ def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, is_promp num_tokens = key.shape[0] block_size = key_cache.shape[-1] slot_mapping = slot_mapping.to(key.device) - # 
block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") if is_prompt: - # indices = torch.tensor([i for i in range(0, block_size)], device=key.device) for i in range(0, num_tokens, block_size): - # if block_idx_list[i] < 0: - # # indices.add_(block_size) - # continue key_cache.index_put_([block_indices[i]], key[i:i+block_size].transpose(0,1).transpose(1,2)) value_cache.index_put_([block_indices[i]], value[i:i+block_size].transpose(0,1).transpose(1,2)) - # key_cache.index_put_([block_indices[i]], key.index_select(0, indices).transpose(0,1).transpose(1,2)) - # value_cache.index_put_([block_indices[i]], value.index_select(0, indices).transpose(0,1).transpose(1,2)) - # indices.add_(block_size) else: - # print(key_cache.data_ptr(), key_cache.shape) - # print(key_cache[2, :, :, 2]) key_cache = key_cache.permute(0, 3, 1, 2) value_cache = value_cache.permute(0, 3, 1, 2) - # print(key_cache.data_ptr(), key_cache.shape) - # print(key_cache[2, 2, :, :]) block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") block_offsets = torch.fmod(slot_mapping, block_size) slot_indices = torch.stack([block_indices, block_offsets], dim=-1) index = torch.tensor(0, device=key.device) for i in range(num_tokens): - key_cache[slot_indices[i][0], slot_indices[i][1], :, :] = key[i] # key.index_select(0, index) - value_cache[slot_indices[i][0], slot_indices[i][1], :, :] = value[i] # value.index_select(0, index) - # key_cache.index_put_([slot_indices[i]], key[i]) - # value_cache.index_put_([slot_indices[i]], value[i]) - # key_cache.index_put_([slot_indices[i]], key.index_select(0, index)) - # value_cache.index_put_([slot_indices[i]], value.index_select(0, index)) + key_cache[slot_indices[i][0], slot_indices[i][1], :, :] = key[i] + value_cache[slot_indices[i][0], slot_indices[i][1], :, :] = value[i] index.add_(1) - # print(key_cache.data_ptr(), key_cache.shape) key_cache = key_cache.permute(0, 2, 3, 1) - value_cache = value_cache.permute(0, 2, 3, 1) - # print(key_cache.data_ptr(), key_cache.shape) - - - -''' -def create_cache_view( - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_idx: int, -) -> Tuple[torch.Tensor, torch.Tensor]: - _, num_heads, head_size, block_size = key_cache.shape - cache_stride = key_cache.stride() - cache_offset = key_cache.storage_offset() - block_shape = (1, num_heads, head_size, block_size) - block_offset = block_idx * (cache_stride[-1] * cache_stride[-2] * cache_stride[-3]) - key_block = torch.as_strided(key_cache, - block_shape, - cache_stride, - cache_offset+block_offset).squeeze(0) - value_block = torch.as_strided(value_cache, - block_shape, - cache_stride, - cache_offset+block_offset).squeeze(0) - return key_block, value_block - - -def reshape_and_cache_backup1(key, value, key_cache, value_cache, slot_mapping, is_prompt=False): - """ - key: [num_tokens, num_heads, head_size] - value: [num_tokens, num_heads, head_size] - key_cache: [num_heads, head_size, block_size] * num_blocks - value_cache: [num_heads, head_size, block_size] * num_blocks - slot_mapping: [num_tokens] - """ - block_size = key_cache[0].shape[2] - block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] - block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - if is_prompt: - indices = torch.tensor([i for i in range(0, block_size)], device=key.device) - for i in range(0, 
len(block_idx_list), block_size): # for i in range(0, block_indices.shape[0], block_size): - if block_idx_list[i] < 0: - continue - block_idx_tensor = block_indices.index_select(0, torch.tensor(i, device=key.device)) - key_cache.index_put_([block_idx_tensor], key.index_select(0, indices).transpose(0,1).transpose(1,2)) - value_cache.index_put_([block_idx_tensor], value.index_select(0, indices).transpose(0,1).transpose(1,2)) - indices.add_(block_size) - else: - block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] - block_offset_list = [slot_idx % block_size for slot_idx in slot_mapping.tolist()] - index = torch.tensor(0, device=key.device) - for block_idx, block_offset in zip(block_idx_list, block_offset_list): - key_block, value_block = create_cache_view(key_cache, value_cache, block_idx) - slot_idx = torch.tensor(block_offset, device=key.device) - key_block.index_copy_(-1, slot_idx, key.index_select(0, index).transpose(0,1).transpose(1,2)) - value_block.index_copy_(-1, slot_idx, value.index_select(0, index).transpose(0,1).transpose(1,2)) - index.add_(1) - - -def reshape_and_cache_backup2(key, value, key_cache, value_cache, slot_mapping, is_prompt=False): - """ - key: [num_tokens, num_heads, head_size] - value: [num_tokens, num_heads, head_size] - key_cache: [num_heads, head_size, block_size] * num_blocks - value_cache: [num_heads, head_size, block_size] * num_blocks - slot_mapping: [num_tokens] - """ - block_size = key_cache[0].shape[2] - block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] - if is_prompt: - cached_set = set() - indices = torch.tensor([i for i in range(0, block_size)], device=key.device) - for block_idx in block_idx_list: - if block_idx in cached_set or block_idx < 0: - continue - else: - cached_set.add(block_idx) - key_block, value_block = create_cache_view(key_cache, value_cache, block_idx) - key_block.copy_(key.index_select(0, indices).transpose(0,1).transpose(1,2)) - value_block.copy_(value.index_select(0, indices).transpose(0,1).transpose(1,2)) - indices.add_(block_size) - else: - block_offset_list = [slot_idx % block_size for slot_idx in slot_mapping.tolist()] - index = torch.tensor(0, device=key.device) - # slot_idx = torch.tensor(0, device=key.device) - for block_idx, block_offset in zip(block_idx_list, block_offset_list): - key_block, value_block = create_cache_view(key_cache, value_cache, block_idx) - # slot_idx.copy_(block_offset) - slot_idx = torch.tensor(block_offset, device=key.device) - key_block.index_copy_(-1, slot_idx, key.index_select(0, index).transpose(0,1).transpose(1,2)) - value_block.index_copy_(-1, slot_idx, value.index_select(0, index).transpose(0,1).transpose(1,2)) - index.add_(1) -''' \ No newline at end of file + value_cache = value_cache.permute(0, 2, 3, 1) \ No newline at end of file diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index f3dd9c2c575a3..0454814091562 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -137,40 +137,6 @@ def rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox_st # FIXME: the below code is unused legacy code not meant to be used. 
Use FusedRoPE # on HPU and delete this once coverage is verified raise NotImplementedError - # update query and key in-place - num_tokens = query.shape[0] - num_heads = query.shape[-1] // head_size - query = query.view(num_tokens, num_heads, head_size) - key = key.view(num_tokens, num_heads, head_size) - cos, sin = torch.split(cos_sin_cache, cos_sin_cache.shape[-1] // 2, dim=-1) - if is_neox_style: - sin = torch.cat((sin, sin), dim=-1) - cos = torch.cat((cos, cos), dim=-1) - else: - sin = torch.repeat_interleave(sin, 2, -1) - cos = torch.repeat_interleave(cos, 2, -1) - - query_rot = query[..., :head_size] - query_pass = query[..., head_size:] - key_rot = key[..., :head_size] - key_pass = key[..., head_size:] - - query_rot = query_rot.transpose(0, 1) - key_rot = key_rot.transpose(0, 1) - cos = F.embedding(positions, cos) - sin = F.embedding(positions, sin) - - query_rot, key_rot = apply_rope(query_rot, key_rot, cos, sin, - is_neox_style) - query_rot = query_rot.transpose(0, 1).contiguous() - key_rot = key_rot.transpose(0, 1).contiguous() - - query.copy_(torch.cat((query_rot, query_pass), dim=-1)) - key.copy_(torch.cat((key_rot, key_pass), dim=-1)) - htorch.core.mark_step() - - # Output query/key shape: [num_tokens, num_tokens, head_size] - return query, key def awq_gemm(*args): raise NotImplementedError diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py index 691d30b0fba90..7309a1f0c1fbd 100644 --- a/vllm/hpu/xops.py +++ b/vllm/hpu/xops.py @@ -18,28 +18,6 @@ from .attn_bias import AttentionBias -# # xops.memory_efficient_attention_forward -# def memory_efficient_attention_forward( -# query: torch.Tensor, -# key: torch.Tensor, -# value: torch.Tensor, -# attn_bias = None, -# p: float = 0.0, -# scale: Optional[float] = None -# ) -> torch.Tensor: -# # scale = 1 / query.shape[-1] ** 0.5 -# query = query * scale -# attn = query @ key.transpose(-2, -1) -# if attn_bias is not None: -# shape=(query.shape[0], query.shape[1], query.shape[-2], query.shape[-2]) -# attn_mask = torch.full(shape, dtype=query.dtype, fill_value=float("-inf"), device=query.device) -# attn_mask = torch.triu(attn_mask, diagonal=1).to(query.dtype) -# attn = attn + attn_mask -# attn = attn.softmax(-1) -# attn = torch.nn.functional.dropout(attn, p) -# return attn @ value - - def block_masked_attention( query: torch.Tensor, key: torch.Tensor, @@ -78,15 +56,7 @@ def memory_efficient_attention_forward( mask_start_idx = i * seq_len mask_end_idx = (i + 1) * seq_len - # # Create attention mask. - # attn_mask = torch.ones(seq_len, seq_len, dtype=query.dtype) - # attn_mask[:seq_lens[i],:seq_lens[i]] = torch.triu( - # attn_mask[:seq_lens[i],:seq_lens[i]], - # diagonal=1 - # ) - # attn_mask = attn_mask * -10000.0 # torch.finfo(query.dtype).min - # attn_mask = attn_mask.to(dtype=query.dtype, device=query.device) - + # Create attention mask. 
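            # Sketch of what the bias materializes to (hedged; the exact layout comes from
            # vllm/hpu/attn_bias.py): for the common block-diagonal causal case the materialized
            # mask is additive, 0 where a token may attend and -inf above the causal diagonal or
            # across sequences, so the per-sequence slice taken below behaves like:
            #
            #     causal = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
            #     attn_mask_i = torch.zeros(seq_len, seq_len).masked_fill(causal, float("-inf"))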
attn_mask = attn_bias.materialize(device=query.device) output = block_masked_attention( query[start_idx:end_idx], @@ -94,7 +64,7 @@ def memory_efficient_attention_forward( value[start_idx:end_idx], scale, attn_mask=attn_mask[mask_start_idx:mask_end_idx, - mask_start_idx:mask_end_idx], # attn_mask=attn_mask, + mask_start_idx:mask_end_idx], ) outputs.append(output) out = torch.cat(outputs, dim=0) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index dd64434f64f20..19d52ba0fcaff 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -51,7 +51,7 @@ def get_device_name(): """ Returns the name of the current device: Gaudi or Gaudi2. - Inspired from: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 + Inspired by: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 """ import habana_frameworks.torch.utils.experimental as htexp From 5725b31ae187f26ed89956075a54160e016eaf39 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Wed, 7 Feb 2024 12:03:53 +0200 Subject: [PATCH 26/43] Added HPU-specific requirements --- requirements-hpu.txt | 14 ++++++++++++++ requirements.txt | 4 ++-- setup.py | 4 ++-- 3 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 requirements-hpu.txt diff --git a/requirements-hpu.txt b/requirements-hpu.txt new file mode 100644 index 0000000000000..73a64a94391f0 --- /dev/null +++ b/requirements-hpu.txt @@ -0,0 +1,14 @@ +ninja # For faster builds. +psutil +ray >= 2.5.1 +pandas # Required for Ray data. +pyarrow # Required for Ray data. +sentencepiece # Required for LLaMA tokenizer. +numpy +#torch == 2.1.2 +transformers >= 4.36.0 # Required for Mixtral. +#xformers == 0.0.23.post1 # Required for CUDA 12.1. +fastapi +uvicorn[standard] +pydantic == 1.10.13 # Required for OpenAI server. +aioprometheus[starlette] diff --git a/requirements.txt b/requirements.txt index 73a64a94391f0..92ba0a716c45c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,9 +5,9 @@ pandas # Required for Ray data. pyarrow # Required for Ray data. sentencepiece # Required for LLaMA tokenizer. numpy -#torch == 2.1.2 +torch == 2.1.2 transformers >= 4.36.0 # Required for Mixtral. -#xformers == 0.0.23.post1 # Required for CUDA 12.1. +xformers == 0.0.23.post1 # Required for CUDA 12.1. fastapi uvicorn[standard] pydantic == 1.10.13 # Required for OpenAI server. 
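The setup.py hunk below consumes the new requirements-hpu.txt: when neither CUDA nor ROCm is
detected at build time, the HPU requirements file is used. A condensed sketch of that selection
(hypothetical helper name; the real change lives inside get_requirements() and reuses the
_is_cuda()/_is_hip() helpers defined in setup.py):

    def _pick_requirements_file() -> str:
        if _is_hip():
            return "requirements-rocm.txt"
        if not _is_cuda() and not _is_hip():
            # no CUDA and no ROCm -> assume a Habana/HPU build
            return "requirements-hpu.txt"
        return "requirements.txt"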
diff --git a/setup.py b/setup.py index 33f9627b94f1b..f4e8f1b9545c1 100644 --- a/setup.py +++ b/setup.py @@ -285,8 +285,8 @@ def get_requirements() -> List[str]: if _is_hip(): with open(get_path("requirements-rocm.txt")) as f: requirements = f.read().strip().split("\n") - elif _is_cuda(): - with open(get_path("requirements-cuda.txt")) as f: + elif not _is_cuda() and not _is_hip(): + with open(get_path("requirements-hpu.txt")) as f: requirements = f.read().strip().split("\n") else: with open(get_path("requirements.txt")) as f: From 97d31b0d8829626f4651954d6dc2b8146188ff94 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Wed, 7 Feb 2024 12:18:56 +0200 Subject: [PATCH 27/43] Restored full functionality on NVIDIA --- vllm/entrypoints/api_server.py | 5 +- vllm/entrypoints/openai/api_server.py | 5 +- vllm/hpu/rotary_embed.py | 110 ++++++++++++++++ vllm/model_executor/layers/attention.py | 123 +++++++++++------- .../model_executor/layers/rotary_embedding.py | 118 +---------------- 5 files changed, 199 insertions(+), 162 deletions(-) create mode 100644 vllm/hpu/rotary_embed.py diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index b120210831fe5..74d18efe3c7f8 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -2,8 +2,9 @@ import json from typing import AsyncGenerator import torch -import habana_frameworks.torch.core as htcore -import habana_frameworks.torch.gpu_migration +if torch.version.cuda is None and torch.version.hip is None: + import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.gpu_migration from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse import uvicorn diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bb5b921123460..1d7272fb8b05e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -9,8 +9,9 @@ from http import HTTPStatus from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union import torch -import habana_frameworks.torch.core as htcore -import habana_frameworks.torch.gpu_migration +if torch.version.cuda is None and torch.version.hip is None: + import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.gpu_migration from aioprometheus import MetricsMiddleware from aioprometheus.asgi.starlette import metrics import fastapi diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py new file mode 100644 index 0000000000000..e4be97ff285ee --- /dev/null +++ b/vllm/hpu/rotary_embed.py @@ -0,0 +1,110 @@ +import torch +import torch.nn as nn + +def get_device_name(): + """ + Returns the name of the current device: Gaudi or Gaudi2. 
+ + Inspired by: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 + """ + import habana_frameworks.torch.utils.experimental as htexp + + device_type = htexp._get_device_type() + + if device_type == htexp.synDeviceType.synDeviceGaudi: + return "gaudi" + elif device_type == htexp.synDeviceType.synDeviceGaudi2: + return "gaudi2" + else: + raise ValueError(f"Unsupported device: the device type is {device_type}.") + +# TODO: remove this workaround when FusedRoPE properly works on Gaudi +if get_device_name() == "gaudi2": + try: + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV1 as FusedRoPE + except ImportError: + print("Not using HPU fused kernel for apply_rotary_pos_emb") + FusedRoPE = None +else: + FusedRoPE = None + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids]#.unsqueeze(unsqueeze_dim) + sin = sin[position_ids]#.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class HpuRotaryEmbedding(nn.Module): + def __init__(self, head_size, rotary_dim, max_position_embeddings=2048, base=10000, is_neox_style=None, device='cuda'): + super().__init__() + + self.head_size = head_size + self.dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): + seq_len = key.shape[-2] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) + + cos, sin = self.cos_cached[:seq_len].to(dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) + query = query.reshape((query.shape[0], query.shape[1], query.shape[2] // self.head_size, self.head_size)) + key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) + if query.device.type == "hpu" and FusedRoPE: + if len(positions[0]) == 1: + cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) + sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) + else: + cos = cos[positions].unsqueeze(2) + sin = sin[positions].unsqueeze(2) + query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) + else: + query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) + return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) \ No newline at end of file diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index f0955671bdf82..a6f6cca70480d 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -151,22 +151,35 @@ def forward( key = key.unflatten(0, (batch_size, seq_len)) value = value.unflatten(0, (batch_size, seq_len)) - cu_seq_lens = [0] - for i in range(len(input_metadata.prompt_lens)): - cu_seq_lens.append(cu_seq_lens[-1] + input_metadata.prompt_lens[i]) - input_metadata.cu_seq_lens = cu_seq_lens - out = xops.memory_efficient_attention_forward( - query, - key, - value, - cu_seq_lens=cu_seq_lens, - attn_bias=input_metadata.attn_bias, - p=0.0, - scale=self.scale, - ) - output = torch.zeros_like(query) - output[:, :out.shape[1], :, :] = out - output = output.view_as(query) + if is_hpu(): + cu_seq_lens = [0] + for i in range(len(input_metadata.prompt_lens)): + cu_seq_lens.append(cu_seq_lens[-1] + input_metadata.prompt_lens[i]) + input_metadata.cu_seq_lens = cu_seq_lens + out = xops.memory_efficient_attention_forward( + query, + key, + value, + cu_seq_lens, + attn_bias=input_metadata.attn_bias, + p=0.0, + scale=self.scale, + ) + output = torch.zeros_like(query) + output[:, :out.shape[1], :, :] = out + output = output.view_as(query) + else: + out = xops.memory_efficient_attention_forward( + query, + key, + value, + attn_bias=input_metadata.attn_bias, + p=0.0, + scale=self.scale, + op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if + (is_hip()) else None, + ) + output = out.view_as(query) else: # Decoding run. 
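The HpuRotaryEmbedding introduced above uses the standard rotate-half formulation (q * cos + rotate_half(q) * sin) with a precomputed cos/sin cache. A small self-contained sketch of that math on toy tensors, independent of any HPU kernel (all sizes are made up):

import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

dim, max_pos = 8, 16
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
t = torch.arange(max_pos, dtype=inv_freq.dtype)
freqs = torch.einsum("i,j->ij", t, inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)          # [max_pos, dim], like cos/sin cache
cos_cached, sin_cached = emb.cos(), emb.sin()

q = torch.randn(1, 4, dim)                       # [batch, seq, head_dim]
positions = torch.arange(4).unsqueeze(0)         # [batch, seq]
cos, sin = cos_cached[positions], sin_cached[positions]
q_rot = q * cos + rotate_half(q) * sin           # same formula as apply_rotary_pos_emb
assert q_rot.shape == q.shape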
if key_cache is not None and value_cache is not None: @@ -247,8 +260,8 @@ def _paged_attention( # For context len > 8192, use V2 kernel to avoid shared memory shortage. use_v1 = input_metadata.max_context_len <= 8192 and ( max_num_partitions == 1 or num_seqs * num_heads > 512) - if use_v1 or is_hpu(): - # Run PagedAttention V1. + + if is_hpu(): output = ops.paged_attention_v1( query, key_cache, @@ -262,33 +275,49 @@ def _paged_attention( alibi_slopes, ) else: - # Run PagedAttention V2. - assert _PARTITION_SIZE % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - input_metadata.block_tables, - input_metadata.context_lens, - block_size, - input_metadata.max_context_len, - alibi_slopes, - ) + if use_v1: + # Run PagedAttention V1. + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + ) + else: + # Run PagedAttention V2. + assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + ) return output diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 19d52ba0fcaff..201a5142e6466 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -30,6 +30,7 @@ from vllm.utils import is_hpu if is_hpu(): from vllm.hpu import ops + from vllm.hpu.rotary_embed import HpuRotaryEmbedding else: from vllm._C import ops @@ -47,115 +48,6 @@ def _rotate_gptj(x: torch.Tensor) -> torch.Tensor: return x.flatten(-2) -def get_device_name(): - """ - Returns the name of the current device: Gaudi or Gaudi2. 
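The dispatch above keeps HPU on the ported V1 kernel and lets the CUDA path choose between V1 and V2 with the existing heuristic. A tiny sketch of that selection logic in isolation (the 512-token partition size and thresholds are copied from the hunk above; the function name is illustrative):

_PARTITION_SIZE = 512

def use_paged_attention_v1(max_context_len: int, num_seqs: int,
                           num_heads: int, on_hpu: bool) -> bool:
    if on_hpu:
        return True  # only paged_attention_v1 is ported for HPU in this series
    max_num_partitions = (max_context_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE
    # V1 when the context fits a single partition or the batch is already large;
    # otherwise fall back to V2 for long contexts.
    return max_context_len <= 8192 and (max_num_partitions == 1
                                        or num_seqs * num_heads > 512)

print(use_paged_attention_v1(512, num_seqs=8, num_heads=32, on_hpu=False))    # True
print(use_paged_attention_v1(16384, num_seqs=8, num_heads=32, on_hpu=False))  # False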
- - Inspired by: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 - """ - import habana_frameworks.torch.utils.experimental as htexp - - device_type = htexp._get_device_type() - - if device_type == htexp.synDeviceType.synDeviceGaudi: - return "gaudi" - elif device_type == htexp.synDeviceType.synDeviceGaudi2: - return "gaudi2" - else: - raise ValueError(f"Unsupported device: the device type is {device_type}.") - -# TODO: remove this workaround when FusedRoPE properly works on Gaudi -if get_device_name() == "gaudi2": - try: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV1 as FusedRoPE - except ImportError: - print("Not using HPU fused kernel for apply_rotary_pos_emb") - FusedRoPE = None -else: - FusedRoPE = None - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos[position_ids]#.unsqueeze(unsqueeze_dim) - sin = sin[position_ids]#.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaRotaryEmbedding(nn.Module): - def __init__(self, head_size, rotary_dim, max_position_embeddings=2048, base=10000, is_neox_style=None, device='cuda'): - super().__init__() - - self.head_size = head_size - self.dim = rotary_dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): - seq_len = key.shape[-2] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) - - cos, sin = self.cos_cached[:seq_len].to(dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) - query = query.reshape((query.shape[0], query.shape[1], query.shape[2] // self.head_size, self.head_size)) - key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) - if query.device.type == "hpu" and FusedRoPE: - if len(positions[0]) == 1: - cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) - sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) - else: - cos = cos[positions].unsqueeze(2) - sin = sin[positions].unsqueeze(2) - query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) - else: - query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) - return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) - - class RotaryEmbedding(nn.Module): """Original rotary positional embedding.""" @@ -456,8 +348,12 @@ def get_rope( return _ROPE_DICT[key] if rope_scaling is None: - rotary_emb = LlamaRotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style) + if is_hpu(): + rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style) + else: + rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style) else: scaling_type = rope_scaling["type"] scaling_factor = rope_scaling["factor"] From 07671d7ebeb313acbd92366327feacd3435d047f Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Thu, 8 Feb 2024 12:40:17 +0200 Subject: [PATCH 28/43] vllm.core cleanup --- vllm/core/scheduler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index e13da6f88580a..ca28bbdc2fb95 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -137,8 +137,6 @@ def _schedule(self) -> SchedulerOutputs: # sequence groups are added to the front and the new sequence groups # are added to the back. 
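get_rope above now returns HpuRotaryEmbedding on Gaudi and the original RotaryEmbedding elsewhere. A condensed sketch of that selection with stand-in classes, so it runs without vLLM installed (everything except the constructor arguments is a placeholder):

class RotaryEmbedding:        # stand-in for the CUDA/ROCm implementation
    def __init__(self, head_size, rotary_dim, max_position, base, is_neox_style):
        self.kind = "cuda"

class HpuRotaryEmbedding:     # stand-in for vllm.hpu.rotary_embed.HpuRotaryEmbedding
    def __init__(self, head_size, rotary_dim, max_position, base, is_neox_style):
        self.kind = "hpu"

def is_hpu() -> bool:
    return False              # placeholder; the real check lives in vllm.utils

def get_rope(head_size, rotary_dim, max_position, base, is_neox_style=True):
    cls = HpuRotaryEmbedding if is_hpu() else RotaryEmbedding
    return cls(head_size, rotary_dim, max_position, base, is_neox_style)

print(get_rope(64, 64, 2048, 10000).kind)   # "cuda" with the placeholder above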
while self.waiting: - if len(scheduled) == 4: - break seq_group = self.waiting[0] assert seq_group.num_seqs() == 1, ( From 413fb6065bde5bda44409e15994a80874e4526b4 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Thu, 8 Feb 2024 12:41:11 +0200 Subject: [PATCH 29/43] vllm init cleanup --- vllm/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/__init__.py b/vllm/__init__.py index 9f25f62bd2c1a..138882d1a5a24 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,6 +1,4 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" -import habana_frameworks.torch.gpu_migration - from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine From a38686e127cf9313748ecc2a80cd14e42d6b0aa0 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 9 Feb 2024 14:09:20 +0200 Subject: [PATCH 30/43] vllm.hpu cleanup --- vllm/hpu/__init__.py | 2 +- vllm/hpu/attn_bias.py | 2 +- vllm/hpu/cache_ops.py | 2 +- vllm/hpu/cuda_utils.py | 2 +- vllm/hpu/rotary_embed.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py index ce3a3ce5d435c..3edd0d0f2dc99 100644 --- a/vllm/hpu/__init__.py +++ b/vllm/hpu/__init__.py @@ -8,4 +8,4 @@ # and is subject to the confidentiality and license agreements under which it # was provided. # -############################################################################### \ No newline at end of file +############################################################################### diff --git a/vllm/hpu/attn_bias.py b/vllm/hpu/attn_bias.py index ac3ce8e6784cc..ff508a59cc56a 100644 --- a/vllm/hpu/attn_bias.py +++ b/vllm/hpu/attn_bias.py @@ -761,4 +761,4 @@ def _create_block_mask( mask, diagonal=num_keys - num_queries - self._window_size + 1 ) mask = torch.log(mask) - return mask.to(dtype) \ No newline at end of file + return mask.to(dtype) diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 5c678587c6ff9..de1bc9909ee85 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -43,4 +43,4 @@ def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, is_promp value_cache[slot_indices[i][0], slot_indices[i][1], :, :] = value[i] index.add_(1) key_cache = key_cache.permute(0, 2, 3, 1) - value_cache = value_cache.permute(0, 2, 3, 1) \ No newline at end of file + value_cache = value_cache.permute(0, 2, 3, 1) diff --git a/vllm/hpu/cuda_utils.py b/vllm/hpu/cuda_utils.py index cb067fca13cca..f9a019431e4c5 100644 --- a/vllm/hpu/cuda_utils.py +++ b/vllm/hpu/cuda_utils.py @@ -11,4 +11,4 @@ ############################################################################### def get_device_attribute(attribute, device_id): - return 10240 # TODO: fake value now \ No newline at end of file + return 10240 # TODO: fake value now diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index e4be97ff285ee..72489c568920a 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -107,4 +107,4 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tenso query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) else: query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) - return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) \ No newline at end of file + return query.reshape((query.shape[0], 
query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) From bed7da6e56184098cc8f451de6876bb4c88a7327 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 9 Feb 2024 17:09:46 +0200 Subject: [PATCH 31/43] vllm.benchmarks cleanup --- benchmarks/benchmark_latency.py | 2 ++ benchmarks/benchmark_serving.py | 3 +++ benchmarks/benchmark_throughput.py | 6 ++---- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index e33d5fb2dc247..17b207544295b 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -6,6 +6,8 @@ import numpy as np import torch +if torch.version.cuda is None and torch.version.hip is None: + import habana_frameworks.torch as htorch from tqdm import tqdm from vllm import LLM, SamplingParams diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 3a80e679191e3..bbf59b034ac24 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -25,6 +25,9 @@ import aiohttp import numpy as np from transformers import PreTrainedTokenizerBase +import torch +if torch.version.cuda is None and torch.version.hip is None: + import habana_frameworks.torch as htorch from vllm.transformers_utils.tokenizer import get_tokenizer # (prompt len, output len, latency) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 43b368f020471..97ee9b687fb6b 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,6 +6,8 @@ from typing import List, Optional, Tuple import torch +if torch.version.cuda is None and torch.version.hip is None: + import habana_frameworks.torch as htorch from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) from tqdm import tqdm @@ -90,10 +92,6 @@ def run_vllm( dtype=dtype, max_model_len=max_model_len, enforce_eager=enforce_eager, - max_num_batched_tokens=(16 * 128), - max_num_seqs=20, - max_paddings=(16 * 128), - block_size=32, ) # Add the requests to the engine. From 0baa2ef8f1bf70962cca88c363686826c096a5be Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 9 Feb 2024 17:11:28 +0200 Subject: [PATCH 32/43] vllm.entrypoint cleanup --- setup.py | 8 +++++--- vllm/entrypoints/api_server.py | 10 ---------- vllm/entrypoints/llm.py | 15 +++++---------- 3 files changed, 10 insertions(+), 23 deletions(-) diff --git a/setup.py b/setup.py index f4e8f1b9545c1..f182a0084fae1 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"} -#SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) +# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) def _is_hip() -> bool: return torch.version.hip is not None @@ -103,7 +103,7 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version: def get_torch_arch_list() -> Set[str]: - if _is_cuda(): + if _is_cuda() or _is_hip(): # TORCH_CUDA_ARCH_LIST can have one or more architectures, # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the # compiler to additionally include PTX code that can be runtime-compiled @@ -139,7 +139,9 @@ def get_torch_arch_list() -> Set[str]: f"{valid_archs}.", stacklevel=2) return arch_list - + else: + return set() + # First, check the TORCH_CUDA_ARCH_LIST environment variable. 
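With the change above, get_torch_arch_list returns an empty set on builds that are neither CUDA nor HIP, so HPU installs skip architecture handling altogether. A minimal sketch of the same guard using only the standard library (the parsing here is illustrative, not a copy of setup.py):

import os

def get_torch_arch_list_sketch(is_cuda_or_hip: bool) -> set:
    # Non-GPU (HPU) builds return an empty set, matching the new else branch.
    if not is_cuda_or_hip:
        return set()
    raw = os.environ.get("TORCH_CUDA_ARCH_LIST", "")
    # Illustrative parsing only: accept space- or semicolon-separated entries.
    return {a for a in raw.replace(";", " ").split() if a}

print(get_torch_arch_list_sketch(False))                  # set()
os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6+PTX"
print(get_torch_arch_list_sketch(True))                   # {'8.0', '8.6+PTX'}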
compute_capabilities = get_torch_arch_list() if _is_cuda() and not compute_capabilities: diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 74d18efe3c7f8..bdb35df8878ca 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -69,16 +69,6 @@ async def stream_results() -> AsyncGenerator[bytes, None]: prompt = final_output.prompt text_outputs = [prompt + output.text for output in final_output.outputs] ret = {"text": text_outputs} - DEBUG = True - if DEBUG: - text_tokens = [output.token_ids for output in final_output.outputs] - from vllm.transformers_utils.tokenizer import get_tokenizer - tokenizer = get_tokenizer('lmsys/vicuna-7b-v1.3') - decoded_tokens = [tokenizer.decode(token_ids) for token_ids in text_tokens] - ret["DEBUG"] = { - 'tokens': text_tokens, - 'decoded_tokens': decoded_tokens, - } return JSONResponse(ret) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e399de249c9c3..8220ccd406f03 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -8,9 +8,9 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.utils import Counter +from vllm.utils import is_hpu import torch -import habana_frameworks.torch as htorch class LLM: """An LLM for generating texts from given prompts and sampling parameters. @@ -181,7 +181,8 @@ def _run_engine(self, use_tqdm: bool, profiling: bool = False) -> List[RequestOu if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() pbar = tqdm(total=num_requests, desc="Processed prompts") - if profiling: + + if profiling and is_hpu(): prof = torch.profiler.profile( schedule = torch.profiler.schedule(wait=6, warmup=0, active=2, repeat=1), activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU], @@ -196,21 +197,15 @@ def _run_engine(self, use_tqdm: bool, profiling: bool = False) -> List[RequestOu outputs: List[RequestOutput] = [] while self.llm_engine.has_unfinished_requests(): step_outputs = self.llm_engine.step() - print("vLLM completed a step") - if profiling: - count += 1 - print(f"Processing step {count}") - if count == 8: - break for output in step_outputs: if output.finished: outputs.append(output) if use_tqdm: pbar.update(1) - if profiling: + if profiling and is_hpu(): htorch.core.mark_step() prof.step() - if profiling: + if profiling and is_hpu(): htorch.hpu.synchronize() prof.stop() if use_tqdm: From 1f22aa177ab82b76ce4700951a17190278ac53a4 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Tue, 13 Feb 2024 16:53:13 +0200 Subject: [PATCH 33/43] Changed is_hpu logic --- vllm/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index 2b78f31946e24..29bb24a5f8b56 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,6 +1,7 @@ import enum import socket import uuid +import importlib from platform import uname import psutil @@ -31,7 +32,7 @@ def is_hip() -> bool: def is_hpu() -> bool: - return getattr(torch, 'hpu', None) is not None and torch.hpu.is_available() + return importlib.util.find_spec('habana_frameworks') is not None if is_hpu(): From eb2c22a46874c0e0d4011c17d6bce7ad54dfb0d1 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Tue, 13 Feb 2024 16:54:00 +0200 Subject: [PATCH 34/43] vllm.benchmark cleanup --- benchmarks/benchmark_latency.py | 3 ++- benchmarks/benchmark_serving.py | 3 ++- benchmarks/benchmark_throughput.py | 3 ++- benchmarks/run_benchmark_bloom560m.sh | 28 --------------------------- 4 files changed, 
6 insertions(+), 31 deletions(-) delete mode 100755 benchmarks/run_benchmark_bloom560m.sh diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 17b207544295b..f550aba060e38 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -6,7 +6,8 @@ import numpy as np import torch -if torch.version.cuda is None and torch.version.hip is None: +from vllm.utils import is_hpu +if is_hpu(): import habana_frameworks.torch as htorch from tqdm import tqdm diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index bbf59b034ac24..bb28d700fc321 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -26,7 +26,8 @@ import numpy as np from transformers import PreTrainedTokenizerBase import torch -if torch.version.cuda is None and torch.version.hip is None: +from vllm.utils import is_hpu +if is_hpu(): import habana_frameworks.torch as htorch from vllm.transformers_utils.tokenizer import get_tokenizer diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 97ee9b687fb6b..6b4a0ff031ee8 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,7 +6,8 @@ from typing import List, Optional, Tuple import torch -if torch.version.cuda is None and torch.version.hip is None: +from vllm.utils import is_hpu +if is_hpu(): import habana_frameworks.torch as htorch from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) diff --git a/benchmarks/run_benchmark_bloom560m.sh b/benchmarks/run_benchmark_bloom560m.sh deleted file mode 100755 index 13726bc3f46c0..0000000000000 --- a/benchmarks/run_benchmark_bloom560m.sh +++ /dev/null @@ -1,28 +0,0 @@ -cd /software/users/mdvoretckii/huda -source reset.sh -cd /software/users/mdvoretckii/habana_vllm -python -m pip install -e . -python -m pip install xformers --no-deps -cd benchmarks -#python benchmark_throughput.py --tokenizer bigscience/bloom-560m --model bigscience/bloom-560m --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100 -python benchmark_throughput.py --tokenizer lmsys/vicuna-7b-v1.3 --model lmsys/vicuna-7b-v1.3 --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100 -#curl -X POST -H "Accept: Application/json" -H "Content-Type: application/json" http://localhost:8000/generate -d '{"prompt":"Would you like a jelly baby?","use_beam_search":false,"n":1}' - - -# Missing ops: -# Bloom: alibi -# llama: RMS Norm, RoPE, fused silu, fail in sample -# --- -# GPT2: gelu_new -# Aquila: issues with external source -# Baichuan: no tokenizer -# Falcon: fail in sample -# Falcon RW: TypeError: memory_efficient_attention_forward() missing 1 required positional argument: 'cu_seq_lens' -# GPT BigCode: gated, santacoder fails in sample (not affected by CPU RoPE) -# GPT-J: gelu_new -# GPT-NeoX: gelu_fast -# InternLM: no tokenizer class -# Mistral: max_num_batched_tokens (2048) is smaller than max_model_len (32768). 
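The benchmarks above now gate the habana_frameworks import behind is_hpu(). A standalone sketch of that pattern, with the importlib-based check from vllm.utils reproduced inline (the mark_step helper is illustrative glue, not part of the patch):

import importlib.util

def is_hpu() -> bool:
    # True when the Habana SynapseAI PyTorch bridge is installed.
    return importlib.util.find_spec("habana_frameworks") is not None

if is_hpu():
    import habana_frameworks.torch as htorch   # noqa: F401
else:
    htorch = None  # CUDA/ROCm runs never touch the HPU-only helpers

def mark_step() -> None:
    # Flushes pending lazy-mode ops on HPU; skipped everywhere else.
    if htorch is not None:
        htorch.core.mark_step()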
-# MPT: TypeError: memory_efficient_attention_forward() missing 1 required positional argument: 'cu_seq_lens' -# OPT: fail in sample -# Qwen: no tokenizer class From e69fca6e11c9f9f618c4833e13f232bed12134f1 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Tue, 13 Feb 2024 16:56:20 +0200 Subject: [PATCH 35/43] Fixed importing condition --- vllm/entrypoints/api_server.py | 3 ++- vllm/entrypoints/openai/api_server.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index bdb35df8878ca..629f329a568c4 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -2,7 +2,8 @@ import json from typing import AsyncGenerator import torch -if torch.version.cuda is None and torch.version.hip is None: +from vllm.utils import is_hpu +if is_hpu(): import habana_frameworks.torch.core as htcore import habana_frameworks.torch.gpu_migration from fastapi import FastAPI, Request diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1d7272fb8b05e..d3062e9220dd8 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -9,7 +9,8 @@ from http import HTTPStatus from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union import torch -if torch.version.cuda is None and torch.version.hip is None: +from vllm.utils import is_hpu +if is_hpu(): import habana_frameworks.torch.core as htcore import habana_frameworks.torch.gpu_migration from aioprometheus import MetricsMiddleware From 38cc53bebd71e638819e1febc3fe474b4c13cdfe Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Tue, 13 Feb 2024 16:58:19 +0200 Subject: [PATCH 36/43] tests cleanup --- tests/async_engine/test_api_server.py | 2 +- tests/conftest.py | 35 +++++++--- tests/kernels/test_attention.py | 95 ++++++++++++++++----------- tests/samplers/test_beam_search.py | 2 +- tests/samplers/test_logprobs.py | 4 +- 5 files changed, 86 insertions(+), 52 deletions(-) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 2eb1b2606b80e..c61b2394cd88a 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -26,7 +26,7 @@ def api_server(): "api_server_async_engine.py").absolute() uvicorn_process = subprocess.Popen([ sys.executable, "-u", - str(script_path), "--model", "lmsys/vicuna-7b-v1.3" + str(script_path), "--model", "facebook/opt-125m", ]) yield uvicorn_process.terminate() diff --git a/tests/conftest.py b/tests/conftest.py index 7b73aaff6f6c9..fa24c667f93d3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,10 @@ import pytest import torch +from vllm.utils import is_hpu +if is_hpu(): + import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.gpu_migration from transformers import AutoModelForCausalLM from vllm import LLM, SamplingParams @@ -53,11 +57,18 @@ def __init__( ) -> None: assert dtype in _STR_DTYPE_TO_TORCH_DTYPE torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] - self.model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=torch_dtype, - trust_remote_code=True, - )#.cuda() + if is_hpu(): + self.model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ) + else: + self.model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ).cuda() if tokenizer_name is None: tokenizer_name = model_name self.tokenizer = 
get_tokenizer(tokenizer_name, trust_remote_code=True) @@ -69,9 +80,12 @@ def generate( ) -> List[Tuple[List[int], str]]: outputs: List[Tuple[List[int], str]] = [] for prompt in prompts: - input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids + if is_hpu(): + input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids + else: + input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.cuda output_ids = self.model.generate( - input_ids,#.cuda(), + input_ids, use_cache=True, **kwargs, ) @@ -125,9 +139,12 @@ def generate_greedy_logprobs( ) -> List[List[torch.Tensor]]: all_logprobs = [] for prompt in prompts: - input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids + if is_hpu(): + input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids + else: + input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.cuda() output = self.model.generate( - input_ids,#.cuda(), + input_ids, use_cache=True, do_sample=False, max_new_tokens=max_tokens, diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index f2242d7d95e49..f2054bba05a74 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -6,6 +6,8 @@ from vllm.utils import get_max_shared_memory_bytes, is_hpu if is_hpu(): + import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.gpu_migration from vllm.hpu import ops from vllm.hpu import xops from vllm.hpu.attn_bias import BlockDiagonalCausalMask @@ -21,6 +23,9 @@ NUM_BLOCKS = 40000 # Arbitrary values for testing PARTITION_SIZE = 512 +VERSION = ["v1", "v2"] +if is_hpu(): + VERSION.pop() DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_GEN_SEQS = [7] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing @@ -105,7 +110,7 @@ def ref_single_query_cached_kv_attention( output[i].copy_(out, non_blocking=True) -@pytest.mark.parametrize("version", ["v1"])#, "v2"]) +@pytest.mark.parametrize("version", VERSION) @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -169,53 +174,67 @@ def test_paged_attention( # Call the paged attention kernel. 
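The kernel test above trims the parametrized version list on HPU, where only the V1 kernel is ported. A small sketch of the same pattern with pytest (the is_hpu stub stands in for vllm.utils.is_hpu; the test body is illustrative):

import importlib.util
import pytest

def is_hpu() -> bool:
    return importlib.util.find_spec("habana_frameworks") is not None

VERSIONS = ["v1", "v2"]
if is_hpu():
    VERSIONS.remove("v2")   # paged_attention_v2 has no HPU port in this series

@pytest.mark.parametrize("version", VERSIONS)
def test_version_is_supported(version):
    assert version == "v1" or not is_hpu()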
output = torch.empty_like(query) - if version == "v1": + if is_hpu(): output = ops.paged_attention_v1( query, key_cache, value_cache, num_kv_heads, scale, - block_tables, - context_lens, - block_size, - max_context_len, - alibi_slopes, - ) - elif version == "v2": - num_partitions = ((max_context_len + PARTITION_SIZE - 1) // - PARTITION_SIZE) - assert PARTITION_SIZE % block_size == 0 - num_seqs, num_heads, head_size = output.shape - tmp_output = torch.empty( - size=(num_seqs, num_heads, num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - context_lens, + input_metadata.block_tables, + input_metadata.context_lens, block_size, - max_context_len, + input_metadata.max_context_len, alibi_slopes, ) else: - raise AssertionError(f"Unknown version: {version}") + if version == "v1": + output = ops.paged_attention_v1( + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes, + ) + elif version == "v2": + num_partitions = ((max_context_len + PARTITION_SIZE - 1) // + PARTITION_SIZE) + assert PARTITION_SIZE % block_size == 0 + num_seqs, num_heads, head_size = output.shape + tmp_output = torch.empty( + size=(num_seqs, num_heads, num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes, + ) + else: + raise AssertionError(f"Unknown version: {version}") # Run the reference implementation. ref_output = torch.empty_like(query) diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 4cf777e2b9e6f..a491ffa763505 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -10,7 +10,7 @@ # 3. Use the model "huggyllama/llama-7b". 
MAX_TOKENS = [128] BEAM_WIDTHS = [4] -MODELS = ["lmsys/vicuna-7b-v1.3"] +MODELS = ["facebook/opt-125m"] @pytest.mark.parametrize("model", MODELS) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 24b1572d9a325..b1a5e1f538a7b 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -3,7 +3,7 @@ from vllm import SamplingParams -MODELS = ["lmsys/vicuna-7b-v1.3"] +MODELS = ["facebook/opt-125m"] @pytest.mark.parametrize("model", MODELS) @@ -24,8 +24,6 @@ def test_get_prompt_logprobs( del hf_model vllm_model = vllm_runner(model, dtype=dtype) - import pdb - pdb.set_trace() vllm_sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=5, prompt_logprobs=5, From 54d499a006a23a9740f65153b74accc2b3ab27a3 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Tue, 13 Feb 2024 17:00:59 +0200 Subject: [PATCH 37/43] removed dummy printings --- vllm/model_executor/models/llama.py | 1 - vllm/worker/worker.py | 1 - 2 files changed, 2 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 7722cc140326d..b3b24ea6fea44 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -283,7 +283,6 @@ def forward( kv_caches: List[KVCache], input_metadata: InputMetadata, ) -> torch.Tensor: - print(f'Input shape: {input_ids.shape}') hidden_states = self.model(input_ids, positions, kv_caches, input_metadata) return hidden_states diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 49600689f4c6a..94cf44e5f6d6b 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -324,7 +324,6 @@ def round_up(n, multiple): elif (i + 1) * self.block_size <= context_lens[seq_id]: attn_masks[i][seq_id, :] = 1 input_metadata.attention_masks = attn_masks.to(device="cuda") - print("input token shape: ", tokens_tensor.shape) return tokens_tensor, positions_tensor, input_metadata @torch.inference_mode() From c0ea99caeb0b380867a7f838ed6c9ed33959f836 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Tue, 13 Feb 2024 16:02:48 +0100 Subject: [PATCH 38/43] Update test_api_server.py --- tests/async_engine/test_api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index c61b2394cd88a..d90ba37b27bb9 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -26,7 +26,7 @@ def api_server(): "api_server_async_engine.py").absolute() uvicorn_process = subprocess.Popen([ sys.executable, "-u", - str(script_path), "--model", "facebook/opt-125m", + str(script_path), "--model", "facebook/opt-125m" ]) yield uvicorn_process.terminate() From ea3ea4410158c41c54177310899560fddbcdc27a Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Wed, 14 Feb 2024 18:41:58 +0200 Subject: [PATCH 39/43] restored attention and logprobs tests functionality on Nvidia --- tests/kernels/test_attention.py | 41 ++++++++++++++++++++++----------- tests/samplers/test_logprobs.py | 2 +- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index f2054bba05a74..e7f2f5bb395ef 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -181,15 +181,16 @@ def test_paged_attention( value_cache, num_kv_heads, scale, - input_metadata.block_tables, - input_metadata.context_lens, + block_tables, + context_lens, block_size, - input_metadata.max_context_len, + max_context_len, 
alibi_slopes, ) else: if version == "v1": - output = ops.paged_attention_v1( + ops.paged_attention_v1( + output, query, key_cache, value_cache, @@ -331,19 +332,31 @@ def test_multi_query_kv_attention( key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens) - output = xops.memory_efficient_attention_forward( - query.unsqueeze(0), - key.unsqueeze(0), - value.unsqueeze(0), - attn_bias=attn_bias, - p=0.0, - scale=scale, - ) - output = output.squeeze(0) - cu_seq_lens = [0] for seq_len in seq_lens: cu_seq_lens.append(cu_seq_lens[-1] + seq_len) + + if is_hpu(): + output = xops.memory_efficient_attention_forward( + query.unsqueeze(0), + key.unsqueeze(0), + value.unsqueeze(0), + cu_seq_lens, + attn_bias=attn_bias, + p=0.0, + scale=scale, + ) + else: + output = xops.memory_efficient_attention_forward( + query.unsqueeze(0), + key.unsqueeze(0), + value.unsqueeze(0), + attn_bias=attn_bias, + p=0.0, + scale=scale, + ) + output = output.squeeze(0) + ref_output = ref_multi_query_kv_attention( cu_seq_lens, query, diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index b1a5e1f538a7b..1c67cc5bd7394 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -16,7 +16,7 @@ def test_get_prompt_logprobs( example_prompts, ): max_tokens = 5 - hf_model = hf_runner(model, dtype="float") + hf_model = hf_runner(model, dtype=dtype) hf_logprobs = hf_model.generate_greedy_logprobs( example_prompts, max_tokens=max_tokens, From 5543642d19229a898fdee4b36a2faf169207d1c7 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 16 Feb 2024 12:43:45 +0200 Subject: [PATCH 40/43] throughput benchmark cleanup --- benchmarks/benchmark_throughput.py | 12 +++--------- vllm/__init__.py | 1 + 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 6b4a0ff031ee8..9afb4721dd01c 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -38,16 +38,11 @@ def sample_requests( completions = [completion for _, completion in dataset] completion_token_ids = tokenizer(completions).input_ids tokenized_dataset = [] - count = 0 for i in range(len(dataset)): - count += 1 - i = i % 4 output_len = len(completion_token_ids[i]) if fixed_output_len is not None: output_len = fixed_output_len tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) - if count == num_requests: - break # Filter out too long sequences. filtered_dataset: List[Tuple[str, int, int]] = [] @@ -61,10 +56,9 @@ def sample_requests( continue filtered_dataset.append((prompt, prompt_len, output_len)) - # # Sample the requests. - # sampled_requests = random.sample(filtered_dataset, num_requests) - # return sampled_requests - return filtered_dataset + # Sample the requests. 
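The multi-query test above compares the fused op against a per-sequence reference. A compact sketch of such a reference for variable-length causal attention in plain PyTorch (the cu_seq_lens convention follows the test; shapes and names are otherwise illustrative):

import torch

def ref_varlen_causal_attention(query, key, value, cu_seq_lens, scale):
    # query/key/value: [total_tokens, num_heads, head_dim]; sequences concatenated.
    outputs = []
    for start, end in zip(cu_seq_lens[:-1], cu_seq_lens[1:]):
        q, k, v = query[start:end], key[start:end], value[start:end]
        scores = torch.einsum("qhd,khd->hqk", q, k) * scale
        causal = torch.triu(torch.ones(end - start, end - start, dtype=torch.bool), 1)
        scores = scores.masked_fill(causal, float("-inf"))
        probs = scores.softmax(dim=-1)
        outputs.append(torch.einsum("hqk,khd->qhd", probs, v))
    return torch.cat(outputs, dim=0)

q = torch.randn(10, 4, 8); k = torch.randn(10, 4, 8); v = torch.randn(10, 4, 8)
out = ref_varlen_causal_attention(q, k, v, [0, 3, 8, 10], scale=8 ** -0.5)
assert out.shape == q.shape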
+ sampled_requests = random.sample(filtered_dataset, num_requests) + return sampled_requests def run_vllm( diff --git a/vllm/__init__.py b/vllm/__init__.py index 138882d1a5a24..e5cd1c2f3334b 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,4 +1,5 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" + from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine From a2acb8699b2b6316b3e4085ae94087b3acf47ea6 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 16 Feb 2024 16:12:06 +0200 Subject: [PATCH 41/43] Changed Habana copyright header --- vllm/hpu/__init__.py | 10 +--------- vllm/hpu/cache_ops.py | 10 +--------- vllm/hpu/cuda_utils.py | 10 +--------- vllm/hpu/ops.py | 10 +--------- vllm/hpu/rotary_embed.py | 4 ++++ vllm/hpu/xops.py | 11 +---------- 6 files changed, 9 insertions(+), 46 deletions(-) diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py index 3edd0d0f2dc99..c6a95a54d3d95 100644 --- a/vllm/hpu/__init__.py +++ b/vllm/hpu/__init__.py @@ -1,11 +1,3 @@ ############################################################################### -# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index de1bc9909ee85..913fca2ce56f0 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -1,13 +1,5 @@ ############################################################################### -# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### from typing import Tuple diff --git a/vllm/hpu/cuda_utils.py b/vllm/hpu/cuda_utils.py index f9a019431e4c5..50e8c39076dc0 100644 --- a/vllm/hpu/cuda_utils.py +++ b/vllm/hpu/cuda_utils.py @@ -1,13 +1,5 @@ ############################################################################### -# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### def get_device_attribute(attribute, device_id): diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 0454814091562..9e75695b8846f 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -1,13 +1,5 @@ ############################################################################### -# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### import torch diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 72489c568920a..679acba6924b1 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -1,3 +1,7 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + import torch import torch.nn as nn diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py index 7309a1f0c1fbd..a28bd7e1dfcf2 100644 --- a/vllm/hpu/xops.py +++ b/vllm/hpu/xops.py @@ -1,16 +1,7 @@ ############################################################################### -# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### - import habana_frameworks.torch as htorch import torch import torch.nn.functional as F From 956bab7095c5f2c831b36cb3bde87f888fee7f31 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 16 Feb 2024 16:19:57 +0200 Subject: [PATCH 42/43] Restored alibi in bloom --- vllm/model_executor/models/bloom.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 3a557288c3dd1..6d1aeeed78e93 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -109,8 +109,8 @@ def __init__( scaling = self.head_dim**-0.5 self.attn = PagedAttention(self.num_heads, self.head_dim, - scaling)#, - #alibi_slopes=alibi_slopes) + scaling, + alibi_slopes=alibi_slopes) def forward( self, From 702d8a733ea8ef65970ff3bb81bcbbe5aeae4aa4 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 16 Feb 2024 16:27:17 +0200 Subject: [PATCH 43/43] Added BSD license header --- vllm/hpu/__init__.py | 3 +++ vllm/hpu/cache_ops.py | 3 +++ vllm/hpu/cuda_utils.py | 3 +++ vllm/hpu/ops.py | 3 +++ vllm/hpu/rotary_embed.py | 3 +++ vllm/hpu/xops.py | 3 +++ 6 files changed, 18 insertions(+) diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py index c6a95a54d3d95..b8e4d3aac98a7 100644 --- a/vllm/hpu/__init__.py +++ b/vllm/hpu/__init__.py @@ -1,3 +1,6 @@ ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. ############################################################################### diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 913fca2ce56f0..fb08e4167a10a 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -1,5 +1,8 @@ ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. ############################################################################### from typing import Tuple diff --git a/vllm/hpu/cuda_utils.py b/vllm/hpu/cuda_utils.py index 50e8c39076dc0..bec242cf985c2 100644 --- a/vllm/hpu/cuda_utils.py +++ b/vllm/hpu/cuda_utils.py @@ -1,5 +1,8 @@ ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. ############################################################################### def get_device_attribute(attribute, device_id): diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 9e75695b8846f..79f8f186a2b21 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -1,5 +1,8 @@ ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
############################################################################### import torch diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 679acba6924b1..3def58b11feb6 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -1,5 +1,8 @@ ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. ############################################################################### import torch diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py index a28bd7e1dfcf2..6460cb6ac4f33 100644 --- a/vllm/hpu/xops.py +++ b/vllm/hpu/xops.py @@ -1,5 +1,8 @@ ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. ############################################################################### import habana_frameworks.torch as htorch
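Patch 42 above re-enables ALiBi slopes for BLOOM's PagedAttention. For reference, a short sketch of the standard ALiBi slope computation such slopes are typically derived from (this mirrors the published ALiBi recipe, not a specific vLLM helper):

import math
import torch

def get_alibi_slopes(num_heads: int) -> torch.Tensor:
    # Geometric series of per-head slopes, with an extra interleaved series
    # when num_heads is not a power of two.
    closest_pow2 = 2 ** math.floor(math.log2(num_heads))
    base = 2 ** (-(2 ** -(math.log2(closest_pow2) - 3)))
    slopes = [base ** (i + 1) for i in range(closest_pow2)]
    if closest_pow2 != num_heads:
        extra_base = 2 ** (-(2 ** -(math.log2(2 * closest_pow2) - 3)))
        slopes.extend(extra_base ** (i + 1)
                      for i in range(0, 2 * (num_heads - closest_pow2), 2))
    return torch.tensor(slopes)

print(get_alibi_slopes(8))   # tensor([0.5000, 0.2500, ..., 0.0039])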