From e528d064d73a0a56fa3ff19585b309fb3466d726 Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Wed, 8 Nov 2023 05:59:56 +0200 Subject: [PATCH 01/43] Porting vllm to HPU --- benchmarks/benchmark_throughput.py | 23 ++++++-- requirements.txt | 4 +- setup.py | 74 ++++++++++++------------- vllm/__init__.py | 1 + vllm/core/scheduler.py | 2 + vllm/entrypoints/llm.py | 25 ++++++++- vllm/model_executor/layers/attention.py | 4 +- vllm/model_executor/models/bloom.py | 4 +- vllm/worker/cache_engine.py | 3 +- 9 files changed, 88 insertions(+), 52 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 3aac479c01bd2..ab3f2944b1e5f 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -35,11 +35,16 @@ def sample_requests( completions = [completion for _, completion in dataset] completion_token_ids = tokenizer(completions).input_ids tokenized_dataset = [] + count = 0 for i in range(len(dataset)): + count += 1 + i = i % 10 output_len = len(completion_token_ids[i]) if fixed_output_len is not None: output_len = fixed_output_len tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) + if count == num_requests: + break # Filter out too long sequences. filtered_dataset: List[Tuple[str, int, int]] = [] @@ -53,9 +58,10 @@ def sample_requests( continue filtered_dataset.append((prompt, prompt_len, output_len)) - # Sample the requests. - sampled_requests = random.sample(filtered_dataset, num_requests) - return sampled_requests + # # Sample the requests. + # sampled_requests = random.sample(filtered_dataset, num_requests) + # return sampled_requests + return filtered_dataset def run_vllm( @@ -71,6 +77,7 @@ def run_vllm( dtype: str, max_model_len: Optional[int], enforce_eager: bool, + profiling: bool = False, # For Gaudi2 ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -83,6 +90,10 @@ def run_vllm( dtype=dtype, max_model_len=max_model_len, enforce_eager=enforce_eager, + max_num_batched_tokens=(16 * 512), + max_num_seqs=256, + max_paddings=(16 * 512), + block_size=16, ) # Add the requests to the engine. @@ -104,7 +115,7 @@ def run_vllm( start = time.perf_counter() # FIXME(woosuk): Do not use internal method. - llm._run_engine(use_tqdm=True) + llm._run_engine(use_tqdm=True, profiling=profiling) end = time.perf_counter() return end - start @@ -206,7 +217,8 @@ def main(args: argparse.Namespace): args.quantization, args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, - args.max_model_len, args.enforce_eager) + args.max_model_len, args.enforce_eager, + args.profiling) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -284,6 +296,7 @@ def main(args: argparse.Namespace): parser.add_argument("--enforce-eager", action="store_true", help="enforce eager execution") + parser.add_argument("--profiling", action='store_true', help='Profiling first 4 steps') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/requirements.txt b/requirements.txt index 92ba0a716c45c..73a64a94391f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,9 +5,9 @@ pandas # Required for Ray data. pyarrow # Required for Ray data. sentencepiece # Required for LLaMA tokenizer. numpy -torch == 2.1.2 +#torch == 2.1.2 transformers >= 4.36.0 # Required for Mixtral. -xformers == 0.0.23.post1 # Required for CUDA 12.1. +#xformers == 0.0.23.post1 # Required for CUDA 12.1. 
fastapi uvicorn[standard] pydantic == 1.10.13 # Required for OpenAI server. diff --git a/setup.py b/setup.py index 45a18776798fb..da56a61fc0278 100644 --- a/setup.py +++ b/setup.py @@ -28,10 +28,10 @@ def _is_cuda() -> bool: return torch.version.cuda is not None -# Compiler flags. -CXX_FLAGS = ["-g", "-O2", "-std=c++17"] -# TODO(woosuk): Should we use -O3? -NVCC_FLAGS = ["-O2", "-std=c++17"] +# # Compiler flags. +# CXX_FLAGS = ["-g", "-O2", "-std=c++17"] +# # TODO(woosuk): Should we use -O3? +# NVCC_FLAGS = ["-O2", "-std=c++17"] if _is_hip(): if ROCM_HOME is None: @@ -210,32 +210,33 @@ def get_torch_arch_list() -> Set[str]: f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" f"amdgpu_arch_found: {amd_arch}") -ext_modules = [] - -vllm_extension_sources = [ - "csrc/cache_kernels.cu", - "csrc/attention/attention_kernels.cu", - "csrc/pos_encoding_kernels.cu", - "csrc/activation_kernels.cu", - "csrc/layernorm_kernels.cu", - "csrc/quantization/squeezellm/quant_cuda_kernel.cu", - "csrc/cuda_utils_kernels.cu", - "csrc/pybind.cpp", -] - -if _is_cuda(): - vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") - vllm_extension_sources.append("csrc/quantization/gptq/q_gemm.cu") - -vllm_extension = CUDAExtension( - name="vllm._C", - sources=vllm_extension_sources, - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS, - }, -) -ext_modules.append(vllm_extension) +# ext_modules = [] + +if _is_cuda() or _is_hip(): + vllm_extension_sources = [ + "csrc/cache_kernels.cu", + "csrc/attention/attention_kernels.cu", + "csrc/pos_encoding_kernels.cu", + "csrc/activation_kernels.cu", + "csrc/layernorm_kernels.cu", + "csrc/quantization/squeezellm/quant_cuda_kernel.cu", + "csrc/cuda_utils_kernels.cu", + "csrc/pybind.cpp", + ] + + if _is_cuda(): + vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") + vllm_extension_sources.append("csrc/quantization/gptq/q_gemm.cu") + + vllm_extension = CUDAExtension( + name="vllm._C", + sources=vllm_extension_sources, + extra_compile_args={ + "cxx": CXX_FLAGS, + "nvcc": NVCC_FLAGS, + }, + ) + ext_modules.append(vllm_extension) def get_path(*filepath) -> str: @@ -274,12 +275,8 @@ def get_vllm_version() -> str: def read_readme() -> str: - """Read the README file if present.""" - p = get_path("README.md") - if os.path.isfile(p): - return io.open(get_path("README.md"), "r", encoding="utf-8").read() - else: - return "" + """Read the README file.""" + return io.open(get_path("README.md"), "r", encoding="utf-8").read() def get_requirements() -> List[str]: @@ -319,7 +316,6 @@ def get_requirements() -> List[str]: "examples", "tests")), python_requires=">=3.8", install_requires=get_requirements(), - ext_modules=ext_modules, - cmdclass={"build_ext": BuildExtension}, - package_data={"vllm": ["py.typed"]}, + # ext_modules=ext_modules, + # cmdclass={"build_ext": BuildExtension}, ) diff --git a/vllm/__init__.py b/vllm/__init__.py index e5cd1c2f3334b..9f25f62bd2c1a 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,4 +1,5 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" +import habana_frameworks.torch.gpu_migration from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index ca28bbdc2fb95..1fc07fe85dc0b 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -137,6 +137,8 @@ def _schedule(self) -> SchedulerOutputs: # sequence groups are added to the front and the new 
sequence groups # are added to the back. while self.waiting: + if len(scheduled) == 10: + break seq_group = self.waiting[0] assert seq_group.num_seqs() == 1, ( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0700298b03a3d..1df7b1a80b1b2 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -9,6 +9,8 @@ from vllm.sampling_params import SamplingParams from vllm.utils import Counter +import torch +import habana_frameworks.torch as htorch class LLM: """An LLM for generating texts from given prompts and sampling parameters. @@ -174,20 +176,41 @@ def _add_request( self.llm_engine.add_request(request_id, prompt, sampling_params, prompt_token_ids) - def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: + def _run_engine(self, use_tqdm: bool, profiling: bool = False) -> List[RequestOutput]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() pbar = tqdm(total=num_requests, desc="Processed prompts") + if profiling: + prof = torch.profiler.profile( + schedule = torch.profiler.schedule(wait=0, warmup=0, active=4, repeat=1), + activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU], + with_stack = True, + record_shapes = False, + on_trace_ready = torch.profiler.tensorboard_trace_handler("./", use_gzip = True) + ) + prof.start() + count = 0 + # Run the engine. outputs: List[RequestOutput] = [] while self.llm_engine.has_unfinished_requests(): step_outputs = self.llm_engine.step() + if profiling: + count += 1 + if count == 4: + break for output in step_outputs: if output.finished: outputs.append(output) if use_tqdm: pbar.update(1) + if profiling: + htorch.core.mark_step() + htorch.hpu.synchronize() + prof.step() + if profiling: + prof.stop() if use_tqdm: pbar.close() # Sort the outputs by request ID. diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 6482875d1c55b..37d30bd1e3790 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -4,8 +4,8 @@ import torch import torch.nn as nn from xformers import ops as xops -from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, - LowerTriangularMaskWithTensorBias) +# from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, +# LowerTriangularMaskWithTensorBias) from vllm._C import ops from vllm._C import cache_ops diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 6d1aeeed78e93..3a557288c3dd1 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -109,8 +109,8 @@ def __init__( scaling = self.head_dim**-0.5 self.attn = PagedAttention(self.num_heads, self.head_dim, - scaling, - alibi_slopes=alibi_slopes) + scaling)#, + #alibi_slopes=alibi_slopes) def forward( self, diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 1dd0243f8f3a3..4c8a00c15126c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -89,7 +89,8 @@ def allocate_cpu_cache(self) -> List[KVCache]: cpu_cache: List[KVCache] = [] key_block_shape = self.get_key_block_shape() value_block_shape = self.get_value_block_shape() - pin_memory = not in_wsl() + # pin_memory = not in_wsl() + pin_memory = not in_wsl() and not torch.hpu.is_available() if not pin_memory: # Pinning memory in WSL is not supported. 
# https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications From d8da01f8625e37054aaebe63147f75adeda1efed Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Wed, 8 Nov 2023 08:04:22 +0200 Subject: [PATCH 02/43] add hpu cache allocate --- vllm/worker/cache_engine.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 4c8a00c15126c..d2f5d53de4938 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -41,7 +41,8 @@ def __init__( self.num_cpu_blocks = cache_config.num_cpu_blocks # Initialize the cache. - self.gpu_cache = self.allocate_gpu_cache() + # self.gpu_cache = self.allocate_gpu_cache() + self.gpu_cache = self.allocate_hpu_cache() self.cpu_cache = self.allocate_cpu_cache() # Initialize the stream for caching operations. @@ -67,6 +68,29 @@ def get_value_block_shape(self) -> Tuple[int, int, int]: self.block_size, ) + def allocate_hpu_cache(self) -> List[KVCache]: + hpu_cache: List[KVCache] = [] + kv_block_shape = ( + self.num_heads, + self.head_size, + self.block_size) + for _ in range(self.num_layers): + key_blocks = [] + value_blocks = [] + for _ in range(self.num_gpu_blocks): + key_blocks.append(torch.empty( + size=kv_block_shape, + dtype=self.dtype, + device="hpu", + )) + value_blocks.append(torch.empty( + size=kv_block_shape, + dtype=self.dtype, + device="hpu", + )) + hpu_cache.append((key_blocks, value_blocks)) + return hpu_cache + def allocate_gpu_cache(self) -> List[KVCache]: gpu_cache: List[KVCache] = [] key_block_shape = self.get_key_block_shape() From 4d1538faf369125752cd65effd6ddbed91442bdd Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Wed, 8 Nov 2023 09:25:17 +0200 Subject: [PATCH 03/43] move slot_mapping to cpu and add is_prompt in cache_ops.reshape_and_cache --- vllm/worker/worker.py | 182 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 8698b15721507..68e4fa99b79f8 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -125,6 +125,188 @@ def warm_up_model(self) -> None: # the model initialization and profiling. set_random_seed(self.model_config.seed) + def _prepare_inputs( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata]: + seq_groups: List[Tuple[List[int], SamplingParams]] = [] + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + selected_token_indices: List[int] = [] + selected_token_start_idx = 0 + categorized_sample_indices = {t: [] for t in SamplingType} + categorized_sample_indices_start_idx = 0 + + # Add prompt tokens. + prompt_lens: List[int] = [] + for seq_group_metadata in seq_group_metadata_list: + if not seq_group_metadata.is_prompt: + continue + + seq_ids = list(seq_group_metadata.seq_data.keys()) + sampling_params = seq_group_metadata.sampling_params + seq_groups.append((seq_ids, sampling_params)) + + # Use any sequence in the group. 
+ seq_id = seq_ids[0] + + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_tokens = seq_data.get_token_ids() + prompt_len = len(prompt_tokens) + prompt_lens.append(prompt_len) + + if sampling_params.prompt_logprobs is not None: + # NOTE: prompt token positions do not need sample, skip + categorized_sample_indices_start_idx += prompt_len - 1 + + categorized_sample_indices[sampling_params.sampling_type].append( + categorized_sample_indices_start_idx) + categorized_sample_indices_start_idx += 1 + + input_tokens.append(prompt_tokens) + # NOTE(woosuk): Here we assume that the first token in the prompt + # is always the first token in the sequence. + input_positions.append(list(range(prompt_len))) + + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.append([0] * prompt_len) + continue + + # Compute the slot mapping. + slot_mapping.append([]) + block_table = seq_group_metadata.block_tables[seq_id] + for i in range(prompt_len): + block_number = block_table[i // self.block_size] + block_offset = i % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping[-1].append(slot) + + # Add generation tokens. + max_context_len = 0 + max_num_blocks_per_seq = 0 + context_lens: List[int] = [] + generation_block_tables: List[List[int]] = [] + max_seq_len = max(prompt_lens) if prompt_lens else 1 + for seq_group_metadata in seq_group_metadata_list: + if seq_group_metadata.is_prompt: + # We need to do this in this loop as we need to know max_seq_len + assert len( + seq_ids) == 1, "Prompt input should have only one seq." + sampling_params = seq_group_metadata.sampling_params + if sampling_params.prompt_logprobs is not None: + selected_token_indices.extend( + range(selected_token_start_idx, + selected_token_start_idx + prompt_len - 1)) + selected_token_indices.append(selected_token_start_idx + + prompt_len - 1) + selected_token_start_idx += max_seq_len + continue + + seq_ids = list(seq_group_metadata.seq_data.keys()) + sampling_params = seq_group_metadata.sampling_params + seq_groups.append((seq_ids, sampling_params)) + + num_seqs = len(seq_ids) + selected_token_indices.extend( + range(selected_token_start_idx, + selected_token_start_idx + num_seqs)) + selected_token_start_idx += num_seqs + + categorized_sample_indices[sampling_params.sampling_type].extend( + range(categorized_sample_indices_start_idx, + categorized_sample_indices_start_idx + num_seqs)) + categorized_sample_indices_start_idx += num_seqs + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.append([generation_token]) + + context_len = seq_data.get_len() + position = context_len - 1 + if self.sliding_window is not None: + context_len = min(context_len, self.sliding_window) + input_positions.append([position]) + + block_table = seq_group_metadata.block_tables[seq_id] + + max_context_len = max(max_context_len, context_len) + max_num_blocks_per_seq = max(max_num_blocks_per_seq, + len(block_table)) + context_lens.append(context_len) + + block_number = block_table[position // self.block_size] + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.append([slot]) + + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + 
generation_block_tables.append(block_table) + + padded_input_tokens = [ + _pad_to_max(tokens, max_seq_len, pad=0) for tokens in input_tokens + ] + padded_input_positions = [ + _pad_to_max(positions, max_seq_len, pad=0) + for positions in input_positions + ] + padded_slot_mapping = [ + _pad_to_max(mapping, max_seq_len, pad=-1) + for mapping in slot_mapping + ] + padded_block_tables = [ + _pad_to_max(block_table, max_num_blocks_per_seq, pad=0) + for block_table in generation_block_tables + ] + + # Convert to tensors. + tokens_tensor = torch.tensor(padded_input_tokens, + dtype=torch.long, + device="cuda") + positions_tensor = torch.tensor(padded_input_positions, + dtype=torch.long, + device="cuda") + slot_mapping_tensor = torch.tensor(padded_slot_mapping, + dtype=torch.long, + device="cpu") + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.int, + device="cuda") + selected_token_indices = torch.tensor(selected_token_indices, + dtype=torch.long, + device="cuda") + categorized_sample_indices = { + t: torch.tensor(seq_ids, dtype=torch.int, device="cuda") + for t, seq_ids in categorized_sample_indices.items() + } + block_tables_tensor = torch.tensor(padded_block_tables, + dtype=torch.int, + device="cuda") + + seq_data: Dict[int, SequenceData] = {} + for seq_group_metadata in seq_group_metadata_list: + seq_data.update(seq_group_metadata.seq_data) + + input_metadata = InputMetadata( + seq_groups=seq_groups, + seq_data=seq_data, + prompt_lens=prompt_lens, + slot_mapping=slot_mapping_tensor, + context_lens=context_lens_tensor, + max_context_len=max_context_len, + block_tables=block_tables_tensor, + selected_token_indices=selected_token_indices, + categorized_sample_indices=categorized_sample_indices, + sliding_window=self.sliding_window, + ) + return tokens_tensor, positions_tensor, input_metadata + @torch.inference_mode() def execute_model( self, From c3368243df6d4a5bb152413d2315406765603c3a Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Wed, 8 Nov 2023 10:39:24 +0200 Subject: [PATCH 04/43] add bucket to input metadata --- vllm/worker/worker.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 68e4fa99b79f8..7faf379ceca2a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -249,6 +249,16 @@ def _prepare_inputs( block_table = block_table[-sliding_window_blocks:] generation_block_tables.append(block_table) + def round_up(n, multiple): + print(n, multiple) + return (n + multiple - 1) // multiple * multiple + + if self.block_size is not None: + if max_seq_len != 1: + max_seq_len = round_up(max_seq_len, self.block_size) + if max_num_blocks_per_seq != 0: + max_num_blocks_per_seq = round_up(max_num_blocks_per_seq, self.block_size) + padded_input_tokens = [ _pad_to_max(tokens, max_seq_len, pad=0) for tokens in input_tokens ] From 068c7484426804f606f62154bcff9b8f9fba9e97 Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Fri, 10 Nov 2023 06:14:25 +0200 Subject: [PATCH 05/43] 1. limit max block number for lazy mode (TODO) 2. 
set some inpu metadata from cuda to cpu --- vllm/engine/llm_engine.py | 4 ++-- vllm/worker/worker.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d6e388bf135b2..6ad70936dfd32 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -217,7 +217,7 @@ def _init_cache(self) -> None: # Since we use a shared centralized controller, we take the minimum # number of blocks across all workers to make sure all the memory # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) + num_gpu_blocks = min(10500, min(b[0] for b in num_blocks)) num_cpu_blocks = min(b[1] for b in num_blocks) # FIXME(woosuk): Change to debug log. logger.info(f"# GPU blocks: {num_gpu_blocks}, " @@ -425,7 +425,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, child_seqs.append((parent, parent)) for seq, _ in child_seqs: - self._decode_sequence(seq, seq_group.sampling_params) + # self._decode_sequence(seq, seq_group.sampling_params) self._check_stop(seq, seq_group.sampling_params) # Non-beam search case diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7faf379ceca2a..c948a6701e0c2 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -250,7 +250,6 @@ def _prepare_inputs( generation_block_tables.append(block_table) def round_up(n, multiple): - print(n, multiple) return (n + multiple - 1) // multiple * multiple if self.block_size is not None: @@ -287,7 +286,7 @@ def round_up(n, multiple): device="cpu") context_lens_tensor = torch.tensor(context_lens, dtype=torch.int, - device="cuda") + device="cpu") selected_token_indices = torch.tensor(selected_token_indices, dtype=torch.long, device="cuda") @@ -297,7 +296,7 @@ def round_up(n, multiple): } block_tables_tensor = torch.tensor(padded_block_tables, dtype=torch.int, - device="cuda") + device="cpu") seq_data: Dict[int, SequenceData] = {} for seq_group_metadata in seq_group_metadata_list: From 9a042f7a6be3fd48af20165c9288442f34b9d5ca Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Fri, 10 Nov 2023 15:02:23 +0200 Subject: [PATCH 06/43] remove bucket for block tables --- vllm/worker/worker.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index c948a6701e0c2..6068311307f7b 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -255,8 +255,6 @@ def round_up(n, multiple): if self.block_size is not None: if max_seq_len != 1: max_seq_len = round_up(max_seq_len, self.block_size) - if max_num_blocks_per_seq != 0: - max_num_blocks_per_seq = round_up(max_num_blocks_per_seq, self.block_size) padded_input_tokens = [ _pad_to_max(tokens, max_seq_len, pad=0) for tokens in input_tokens From 1e7e16d54360885dd2ba8bcb73248f3c628ee1c8 Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Sat, 11 Nov 2023 11:56:08 +0200 Subject: [PATCH 07/43] add run bash script and change benchmark config --- benchmarks/benchmark_throughput.py | 10 +++++----- benchmarks/run_benchmark_bloom560m.sh | 1 + vllm/core/scheduler.py | 2 +- vllm/worker/worker.py | 13 +++++++++++++ 4 files changed, 20 insertions(+), 6 deletions(-) create mode 100755 benchmarks/run_benchmark_bloom560m.sh diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index ab3f2944b1e5f..43b368f020471 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -38,7 +38,7 @@ def sample_requests( count = 0 for i in range(len(dataset)): count += 1 - i = i 
% 10 + i = i % 4 output_len = len(completion_token_ids[i]) if fixed_output_len is not None: output_len = fixed_output_len @@ -90,10 +90,10 @@ def run_vllm( dtype=dtype, max_model_len=max_model_len, enforce_eager=enforce_eager, - max_num_batched_tokens=(16 * 512), - max_num_seqs=256, - max_paddings=(16 * 512), - block_size=16, + max_num_batched_tokens=(16 * 128), + max_num_seqs=20, + max_paddings=(16 * 128), + block_size=32, ) # Add the requests to the engine. diff --git a/benchmarks/run_benchmark_bloom560m.sh b/benchmarks/run_benchmark_bloom560m.sh new file mode 100755 index 0000000000000..404860a95372d --- /dev/null +++ b/benchmarks/run_benchmark_bloom560m.sh @@ -0,0 +1 @@ +python benchmark_throughput.py --tokenizer bigscience/bloom-560m --model bigscience/bloom-560m --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100 diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 1fc07fe85dc0b..e13da6f88580a 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -137,7 +137,7 @@ def _schedule(self) -> SchedulerOutputs: # sequence groups are added to the front and the new sequence groups # are added to the back. while self.waiting: - if len(scheduled) == 10: + if len(scheduled) == 4: break seq_group = self.waiting[0] diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 6068311307f7b..16672086e5bae 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -312,6 +312,19 @@ def round_up(n, multiple): categorized_sample_indices=categorized_sample_indices, sliding_window=self.sliding_window, ) + + # Create attention mask + attn_masks = [ + torch.zeros((len(input_tokens), self.block_size), dtype=torch.int64) for _ in range(max_num_blocks_per_seq)] + for i in range(0, max_num_blocks_per_seq): + for seq_id in range(len(input_tokens)): + if (i * self.block_size) < context_lens[seq_id] and (i + 1) * self.block_size > context_lens[seq_id]: + attn_masks[i][seq_id, :context_lens[seq_id] % self.block_size] = 1 + elif (i+1) * self.block_size <= context_lens[seq_id]: + attn_masks[i][seq_id, :] = 1 + attn_masks[i] = attn_masks[i].to(device="cuda", non_blocking=True) + input_metadata.attention_masks = attn_masks + print("input token shape: ", tokens_tensor.shape) return tokens_tensor, positions_tensor, input_metadata @torch.inference_mode() From 153eb716f81d094437069e1a75f7a9699412cf6d Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Tue, 14 Nov 2023 03:34:05 +0200 Subject: [PATCH 08/43] 1. modify kv cache structure to tensors 2. 
update hpu paged attention API (for hpu graph compatibility) --- vllm/worker/cache_engine.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index d2f5d53de4938..d7b4df272523d 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -77,17 +77,16 @@ def allocate_hpu_cache(self) -> List[KVCache]: for _ in range(self.num_layers): key_blocks = [] value_blocks = [] - for _ in range(self.num_gpu_blocks): - key_blocks.append(torch.empty( - size=kv_block_shape, - dtype=self.dtype, - device="hpu", - )) - value_blocks.append(torch.empty( - size=kv_block_shape, - dtype=self.dtype, - device="hpu", - )) + key_blocks = torch.empty( + size=(self.num_gpu_blocks, *kv_block_shape), + dtype=self.dtype, + device="hpu", + ) + value_blocks = torch.empty( + size=(self.num_gpu_blocks, *kv_block_shape), + dtype=self.dtype, + device="hpu", + ) hpu_cache.append((key_blocks, value_blocks)) return hpu_cache From 9b7e0a71a7965e790d9be692fd9dfe09fa11c9c6 Mon Sep 17 00:00:00 2001 From: Xiaotong Chen Date: Thu, 16 Nov 2023 07:04:08 +0200 Subject: [PATCH 09/43] add attention mask for generation --- vllm/worker/worker.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 16672086e5bae..7e6a7769bd4a7 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -314,16 +314,15 @@ def round_up(n, multiple): ) # Create attention mask - attn_masks = [ - torch.zeros((len(input_tokens), self.block_size), dtype=torch.int64) for _ in range(max_num_blocks_per_seq)] - for i in range(0, max_num_blocks_per_seq): - for seq_id in range(len(input_tokens)): - if (i * self.block_size) < context_lens[seq_id] and (i + 1) * self.block_size > context_lens[seq_id]: - attn_masks[i][seq_id, :context_lens[seq_id] % self.block_size] = 1 - elif (i+1) * self.block_size <= context_lens[seq_id]: - attn_masks[i][seq_id, :] = 1 - attn_masks[i] = attn_masks[i].to(device="cuda", non_blocking=True) - input_metadata.attention_masks = attn_masks + if max_num_blocks_per_seq != 0: + attn_masks = torch.zeros((max_num_blocks_per_seq, len(input_tokens), self.block_size), dtype=torch.int64) + for i in range(0, max_num_blocks_per_seq): + for seq_id in range(len(input_tokens)): + if (i * self.block_size) < context_lens[seq_id] and (i + 1) * self.block_size > context_lens[seq_id]: + attn_masks[i][seq_id, :context_lens[seq_id] % self.block_size] = 1 + elif (i + 1) * self.block_size <= context_lens[seq_id]: + attn_masks[i][seq_id, :] = 1 + input_metadata.attention_masks = attn_masks.to(device="cuda") print("input token shape: ", tokens_tensor.shape) return tokens_tensor, positions_tensor, input_metadata From c99eefc1d22415e00f0c0bd7af4181cd9942117d Mon Sep 17 00:00:00 2001 From: Jinyan Chen Date: Sun, 19 Nov 2023 02:32:48 -0800 Subject: [PATCH 10/43] add multi_query_kv_attention attn_bias --- vllm/model_executor/layers/attention.py | 4 ++-- vllm/worker/worker.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 37d30bd1e3790..6482875d1c55b 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -4,8 +4,8 @@ import torch import torch.nn as nn from xformers import ops as xops -# from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, -# LowerTriangularMaskWithTensorBias) +from 
xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, + LowerTriangularMaskWithTensorBias) from vllm._C import ops from vllm._C import cache_ops diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7e6a7769bd4a7..d901035b1b2e8 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -323,6 +323,8 @@ def round_up(n, multiple): elif (i + 1) * self.block_size <= context_lens[seq_id]: attn_masks[i][seq_id, :] = 1 input_metadata.attention_masks = attn_masks.to(device="cuda") + # import pdb + # pdb.set_trace() print("input token shape: ", tokens_tensor.shape) return tokens_tensor, positions_tensor, input_metadata From 1327be851d4da3cd0a2e2d9ce8bd8b2c7f47246f Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Fri, 8 Dec 2023 10:05:21 +0000 Subject: [PATCH 11/43] Temp commit --- vllm/entrypoints/llm.py | 8 +- vllm/model_executor/layers/attention.py | 6 + .../model_executor/layers/rotary_embedding.py | 112 ++++++++++++++++++ 3 files changed, 123 insertions(+), 3 deletions(-) mode change 100644 => 100755 vllm/entrypoints/llm.py mode change 100644 => 100755 vllm/model_executor/layers/attention.py mode change 100644 => 100755 vllm/model_executor/layers/rotary_embedding.py diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py old mode 100644 new mode 100755 index 1df7b1a80b1b2..e399de249c9c3 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -183,7 +183,7 @@ def _run_engine(self, use_tqdm: bool, profiling: bool = False) -> List[RequestOu pbar = tqdm(total=num_requests, desc="Processed prompts") if profiling: prof = torch.profiler.profile( - schedule = torch.profiler.schedule(wait=0, warmup=0, active=4, repeat=1), + schedule = torch.profiler.schedule(wait=6, warmup=0, active=2, repeat=1), activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU], with_stack = True, record_shapes = False, @@ -196,9 +196,11 @@ def _run_engine(self, use_tqdm: bool, profiling: bool = False) -> List[RequestOu outputs: List[RequestOutput] = [] while self.llm_engine.has_unfinished_requests(): step_outputs = self.llm_engine.step() + print("vLLM completed a step") if profiling: count += 1 - if count == 4: + print(f"Processing step {count}") + if count == 8: break for output in step_outputs: if output.finished: @@ -207,9 +209,9 @@ def _run_engine(self, use_tqdm: bool, profiling: bool = False) -> List[RequestOu pbar.update(1) if profiling: htorch.core.mark_step() - htorch.hpu.synchronize() prof.step() if profiling: + htorch.hpu.synchronize() prof.stop() if use_tqdm: pbar.close() diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py old mode 100644 new mode 100755 index 6482875d1c55b..109569944eecb --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -10,7 +10,13 @@ from vllm._C import ops from vllm._C import cache_ops from vllm.model_executor.input_metadata import InputMetadata +<<<<<<< HEAD from vllm.utils import is_hip +======= +from vllm.model_executor.layers.rotary_embedding import ( + DynamicNTKScalingRotaryEmbedding, LinearScalingRotaryEmbedding, + LlamaRotaryEmbedding, YaRNScalingRotaryEmbedding) +>>>>>>> 0077e65 (Temp commit) _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. 
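A minimal standalone sketch of the profiling pattern added to _run_engine a few hunks above (skip the first engine steps with wait=6, trace the next two, and call mark_step()/prof.step() once per iteration). This is only an illustration, assuming a Gaudi build of PyTorch with habana_frameworks installed; run_one_step is a hypothetical stand-in for llm_engine.step():

    import torch
    import habana_frameworks.torch as htorch

    def profile_engine_steps(run_one_step, total_steps=8, wait_steps=6):
        # Trace only the last (total_steps - wait_steps) iterations, as in the patch.
        prof = torch.profiler.profile(
            schedule=torch.profiler.schedule(wait=wait_steps, warmup=0,
                                             active=total_steps - wait_steps, repeat=1),
            activities=[torch.profiler.ProfilerActivity.CPU,
                        torch.profiler.ProfilerActivity.HPU],
            with_stack=True,
            record_shapes=False,
            on_trace_ready=torch.profiler.tensorboard_trace_handler("./", use_gzip=True),
        )
        prof.start()
        for _ in range(total_steps):
            run_one_step()              # hypothetical stand-in for llm_engine.step()
            htorch.core.mark_step()     # submit the accumulated HPU graph before stepping the profiler
            prof.step()                 # advance the wait/warmup/active schedule
        htorch.hpu.synchronize()        # wait for all traced device work to finish
        prof.stop()
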
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py old mode 100644 new mode 100755 index 91c093e33e3c9..76ee077bb7b82 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -43,6 +43,118 @@ def _rotate_gptj(x: torch.Tensor) -> torch.Tensor: return x.flatten(-2) +def get_device_name(): + """ + Returns the name of the current device: Gaudi or Gaudi2. + + Inspired from: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 + """ + import habana_frameworks.torch.utils.experimental as htexp + + device_type = htexp._get_device_type() + + if device_type == htexp.synDeviceType.synDeviceGaudi: + return "gaudi" + elif device_type == htexp.synDeviceType.synDeviceGaudi2: + return "gaudi2" + else: + raise ValueError(f"Unsupported device: the device type is {device_type}.") + +# TODO: remove this workaround when FusedRoPE properly works on Gaudi +if get_device_name() == "gaudi2": + try: + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE + except ImportError: + print("Not using HPU fused kernel for apply_rotary_pos_emb") + FusedRoPE = None +else: + FusedRoPE = None + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids]#.unsqueeze(unsqueeze_dim) + sin = sin[position_ids]#.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class LlamaRotaryEmbedding(nn.Module): + def __init__(self, head_size, rotary_dim, max_position_embeddings=2048, base=10000, is_neox_style=None, device='cuda'): + super().__init__() + + self.head_size = head_size + self.dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + #import pdb + #pdb.set_trace() + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): + #import pdb + #pdb.set_trace() + seq_len = key.shape[-2] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) + + cos, sin = self.cos_cached[:seq_len].to(dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) + queries = torch.split(query, self.head_size, dim=-1) + keys = torch.split(key, self.head_size, dim=-1) + qs = [] + ks = [] + for i in range(len(keys)): + if query.device.type == "hpu" and FusedRoPE: + q, k = FusedRoPE.apply(queries[i], cos, sin, positions), FusedRoPE.apply(keys[i], cos, sin, positions) + else: + q, k = apply_rotary_pos_emb(queries[i], keys[i], cos, sin, positions) + qs.append(q) + ks.append(k) + return torch.cat(qs, dim=-1), torch.cat(ks, dim=-1) + + class RotaryEmbedding(nn.Module): """Original rotary positional embedding.""" From de7799fce09fe7adb20d009c80f568306096de20 Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Mon, 18 Dec 2023 19:17:59 +0200 Subject: [PATCH 12/43] Integrate fused kernels for RMSNorm and RoPE --- vllm/model_executor/layers/attention.py | 6 ----- vllm/model_executor/layers/layernorm.py | 11 ++++++++ .../model_executor/layers/rotary_embedding.py | 25 +++++++++---------- 3 files changed, 23 insertions(+), 19 deletions(-) mode change 100644 => 100755 vllm/model_executor/layers/layernorm.py diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 109569944eecb..6482875d1c55b 100755 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -10,13 +10,7 @@ from vllm._C import ops from vllm._C import cache_ops from vllm.model_executor.input_metadata import InputMetadata -<<<<<<< HEAD from vllm.utils import is_hip -======= -from vllm.model_executor.layers.rotary_embedding import ( - DynamicNTKScalingRotaryEmbedding, LinearScalingRotaryEmbedding, - LlamaRotaryEmbedding, YaRNScalingRotaryEmbedding) ->>>>>>> 0077e65 (Temp commit) _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 
256] # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py old mode 100644 new mode 100755 index cb3cee2bad5ad..bf63d905b4378 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -6,6 +6,11 @@ from vllm._C import ops +try: + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm +except ImportError: + print("Not using HPU fused kernel for RMSNorm") + FusedRMSNorm = None class RMSNorm(nn.Module): """Root mean square normalization. @@ -56,6 +61,12 @@ def forward( self.variance_epsilon, ) return x, residual + + if x.device.type == "hpu" and FusedRMSNorm: + orig_dtype = x.dtype + x = FusedRMSNorm.apply(x.float(), self.weight.float(), self.variance_epsilon) + return x.to(orig_dtype) + out = torch.empty_like(x) ops.rms_norm( out, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 76ee077bb7b82..40d61e2e91240 100755 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -63,7 +63,7 @@ def get_device_name(): # TODO: remove this workaround when FusedRoPE properly works on Gaudi if get_device_name() == "gaudi2": try: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV2 as FusedRoPE + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV1 as FusedRoPE except ImportError: print("Not using HPU fused kernel for apply_rotary_pos_emb") FusedRoPE = None @@ -141,18 +141,17 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tenso self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) cos, sin = self.cos_cached[:seq_len].to(dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) - queries = torch.split(query, self.head_size, dim=-1) - keys = torch.split(key, self.head_size, dim=-1) - qs = [] - ks = [] - for i in range(len(keys)): - if query.device.type == "hpu" and FusedRoPE: - q, k = FusedRoPE.apply(queries[i], cos, sin, positions), FusedRoPE.apply(keys[i], cos, sin, positions) - else: - q, k = apply_rotary_pos_emb(queries[i], keys[i], cos, sin, positions) - qs.append(q) - ks.append(k) - return torch.cat(qs, dim=-1), torch.cat(ks, dim=-1) + query = query.reshape((query.shape[0], query.shape[1], query.shape[2] // self.head_size, self.head_size)) + key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) + if query.device.type == "hpu" and FusedRoPE: + #print('using FusedRoPE') + cos = cos[positions].unsqueeze(2) + sin = sin[positions].unsqueeze(2) + query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) + else: + #print('using torch RoPE') + query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) + return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) class RotaryEmbedding(nn.Module): From b8391810cb5baeca5cd1628b33efa2b286d5518b Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Thu, 21 Dec 2023 13:47:25 +0200 Subject: [PATCH 13/43] Resolve merge conflicts --- benchmarks/run_benchmark_bloom560m.sh | 29 +- setup.py | 430 +++++++++--------- tests/async_engine/test_api_server.py | 2 +- tests/conftest.py | 6 +- tests/kernels/test_attention.py | 5 +- tests/samplers/test_beam_search.py | 2 +- 
tests/samplers/test_logprobs.py | 6 +- vllm/entrypoints/llm.py | 0 vllm/model_executor/layers/attention.py | 14 +- vllm/model_executor/layers/layernorm.py | 6 + .../model_executor/layers/rotary_embedding.py | 4 +- vllm/model_executor/models/llama.py | 1 + vllm/model_executor/sampling_metadata.py | 2 +- vllm/worker/worker.py | 1 + 14 files changed, 274 insertions(+), 234 deletions(-) mode change 100755 => 100644 vllm/entrypoints/llm.py mode change 100755 => 100644 vllm/model_executor/layers/attention.py mode change 100755 => 100644 vllm/model_executor/layers/layernorm.py mode change 100755 => 100644 vllm/model_executor/layers/rotary_embedding.py diff --git a/benchmarks/run_benchmark_bloom560m.sh b/benchmarks/run_benchmark_bloom560m.sh index 404860a95372d..13726bc3f46c0 100755 --- a/benchmarks/run_benchmark_bloom560m.sh +++ b/benchmarks/run_benchmark_bloom560m.sh @@ -1 +1,28 @@ -python benchmark_throughput.py --tokenizer bigscience/bloom-560m --model bigscience/bloom-560m --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100 +cd /software/users/mdvoretckii/huda +source reset.sh +cd /software/users/mdvoretckii/habana_vllm +python -m pip install -e . +python -m pip install xformers --no-deps +cd benchmarks +#python benchmark_throughput.py --tokenizer bigscience/bloom-560m --model bigscience/bloom-560m --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100 +python benchmark_throughput.py --tokenizer lmsys/vicuna-7b-v1.3 --model lmsys/vicuna-7b-v1.3 --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100 +#curl -X POST -H "Accept: Application/json" -H "Content-Type: application/json" http://localhost:8000/generate -d '{"prompt":"Would you like a jelly baby?","use_beam_search":false,"n":1}' + + +# Missing ops: +# Bloom: alibi +# llama: RMS Norm, RoPE, fused silu, fail in sample +# --- +# GPT2: gelu_new +# Aquila: issues with external source +# Baichuan: no tokenizer +# Falcon: fail in sample +# Falcon RW: TypeError: memory_efficient_attention_forward() missing 1 required positional argument: 'cu_seq_lens' +# GPT BigCode: gated, santacoder fails in sample (not affected by CPU RoPE) +# GPT-J: gelu_new +# GPT-NeoX: gelu_fast +# InternLM: no tokenizer class +# Mistral: max_num_batched_tokens (2048) is smaller than max_model_len (32768). +# MPT: TypeError: memory_efficient_attention_forward() missing 1 required positional argument: 'cu_seq_lens' +# OPT: fail in sample +# Qwen: no tokenizer class diff --git a/setup.py b/setup.py index da56a61fc0278..57d02e9229022 100644 --- a/setup.py +++ b/setup.py @@ -33,210 +33,210 @@ def _is_cuda() -> bool: # # TODO(woosuk): Should we use -O3? # NVCC_FLAGS = ["-O2", "-std=c++17"] -if _is_hip(): - if ROCM_HOME is None: - raise RuntimeError( - "Cannot find ROCM_HOME. ROCm must be available to build the package." - ) - NVCC_FLAGS += ["-DUSE_ROCM"] - -if _is_cuda() and CUDA_HOME is None: - raise RuntimeError( - "Cannot find CUDA_HOME. 
CUDA must be available to build the package.") - -ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 -CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] - - -def get_amdgpu_offload_arch(): - command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" - try: - output = subprocess.check_output([command]) - return output.decode('utf-8').strip() - except subprocess.CalledProcessError as e: - error_message = f"Error: {e}" - raise RuntimeError(error_message) from e - except FileNotFoundError as e: - # If the command is not found, print an error message - error_message = f"The command {command} was not found." - raise RuntimeError(error_message) from e - - return None - - -def get_hipcc_rocm_version(): - # Run the hipcc --version command - result = subprocess.run(['hipcc', '--version'], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True) - - # Check if the command was executed successfully - if result.returncode != 0: - print("Error running 'hipcc --version'") - return None - - # Extract the version using a regular expression - match = re.search(r'HIP version: (\S+)', result.stdout) - if match: - # Return the version string - return match.group(1) - else: - print("Could not find HIP version in the output") - return None - - -def get_nvcc_cuda_version(cuda_dir: str) -> Version: - """Get the CUDA version from nvcc. - - Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py - """ - nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], - universal_newlines=True) - output = nvcc_output.split() - release_idx = output.index("release") + 1 - nvcc_cuda_version = parse(output[release_idx].split(",")[0]) - return nvcc_cuda_version - - -def get_torch_arch_list() -> Set[str]: - # TORCH_CUDA_ARCH_LIST can have one or more architectures, - # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the - # compiler to additionally include PTX code that can be runtime-compiled - # and executed on the 8.6 or newer architectures. While the PTX code will - # not give the best performance on the newer architectures, it provides - # forward compatibility. - env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) - if env_arch_list is None: - return set() - - # List are separated by ; or space. - torch_arch_list = set(env_arch_list.replace(" ", ";").split(";")) - if not torch_arch_list: - return set() - - # Filter out the invalid architectures and print a warning. - valid_archs = NVIDIA_SUPPORTED_ARCHS.union( - {s + "+PTX" - for s in NVIDIA_SUPPORTED_ARCHS}) - arch_list = torch_arch_list.intersection(valid_archs) - # If none of the specified architectures are valid, raise an error. - if not arch_list: - raise RuntimeError( - "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " - f"variable ({env_arch_list}) is supported. " - f"Supported CUDA/ROCM architectures are: {valid_archs}.") - invalid_arch_list = torch_arch_list - valid_archs - if invalid_arch_list: - warnings.warn( - f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " - "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " - f"({env_arch_list}). Supported CUDA/ROCM architectures are: " - f"{valid_archs}.", - stacklevel=2) - return arch_list - - -# First, check the TORCH_CUDA_ARCH_LIST environment variable. -compute_capabilities = get_torch_arch_list() -if _is_cuda() and not compute_capabilities: - # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available - # GPUs on the current machine. 
- device_count = torch.cuda.device_count() - for i in range(device_count): - major, minor = torch.cuda.get_device_capability(i) - if major < 7: - raise RuntimeError( - "GPUs with compute capability below 7.0 are not supported.") - compute_capabilities.add(f"{major}.{minor}") - -if _is_cuda(): - nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) - if not compute_capabilities: - # If no GPU is specified nor available, add all supported architectures - # based on the NVCC CUDA version. - compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() - if nvcc_cuda_version < Version("11.1"): - compute_capabilities.remove("8.6") - if nvcc_cuda_version < Version("11.8"): - compute_capabilities.remove("8.9") - compute_capabilities.remove("9.0") - # Validate the NVCC CUDA version. - if nvcc_cuda_version < Version("11.0"): - raise RuntimeError( - "CUDA 11.0 or higher is required to build the package.") - if (nvcc_cuda_version < Version("11.1") - and any(cc.startswith("8.6") for cc in compute_capabilities)): - raise RuntimeError( - "CUDA 11.1 or higher is required for compute capability 8.6.") - if nvcc_cuda_version < Version("11.8"): - if any(cc.startswith("8.9") for cc in compute_capabilities): - # CUDA 11.8 is required to generate the code targeting compute capability 8.9. - # However, GPUs with compute capability 8.9 can also run the code generated by - # the previous versions of CUDA 11 and targeting compute capability 8.0. - # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0 - # instead of 8.9. - warnings.warn( - "CUDA 11.8 or higher is required for compute capability 8.9. " - "Targeting compute capability 8.0 instead.", - stacklevel=2) - compute_capabilities = set(cc for cc in compute_capabilities - if not cc.startswith("8.9")) - compute_capabilities.add("8.0+PTX") - if any(cc.startswith("9.0") for cc in compute_capabilities): - raise RuntimeError( - "CUDA 11.8 or higher is required for compute capability 9.0.") - - # Add target compute capabilities to NVCC flags. - for capability in compute_capabilities: - num = capability[0] + capability[2] - NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] - if capability.endswith("+PTX"): - NVCC_FLAGS += [ - "-gencode", f"arch=compute_{num},code=compute_{num}" - ] - - # Use NVCC threads to parallelize the build. - if nvcc_cuda_version >= Version("11.2"): - nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) - num_threads = min(os.cpu_count(), nvcc_threads) - NVCC_FLAGS += ["--threads", str(num_threads)] - -elif _is_hip(): - amd_arch = get_amdgpu_offload_arch() - if amd_arch not in ROCM_SUPPORTED_ARCHS: - raise RuntimeError( - f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" - f"amdgpu_arch_found: {amd_arch}") - -# ext_modules = [] - -if _is_cuda() or _is_hip(): - vllm_extension_sources = [ - "csrc/cache_kernels.cu", - "csrc/attention/attention_kernels.cu", - "csrc/pos_encoding_kernels.cu", - "csrc/activation_kernels.cu", - "csrc/layernorm_kernels.cu", - "csrc/quantization/squeezellm/quant_cuda_kernel.cu", - "csrc/cuda_utils_kernels.cu", - "csrc/pybind.cpp", - ] - - if _is_cuda(): - vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") - vllm_extension_sources.append("csrc/quantization/gptq/q_gemm.cu") - - vllm_extension = CUDAExtension( - name="vllm._C", - sources=vllm_extension_sources, - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS, - }, - ) - ext_modules.append(vllm_extension) +# if _is_hip(): +# if ROCM_HOME is None: +# raise RuntimeError( +# "Cannot find ROCM_HOME. 
ROCm must be available to build the package." +# ) +# NVCC_FLAGS += ["-DUSE_ROCM"] + +# if _is_cuda() and CUDA_HOME is None: +# raise RuntimeError( +# "Cannot find CUDA_HOME. CUDA must be available to build the package.") + +# ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 +# CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] +# NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] + + +# def get_amdgpu_offload_arch(): +# command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" +# try: +# output = subprocess.check_output([command]) +# return output.decode('utf-8').strip() +# except subprocess.CalledProcessError as e: +# error_message = f"Error: {e}" +# raise RuntimeError(error_message) from e +# except FileNotFoundError as e: +# # If the command is not found, print an error message +# error_message = f"The command {command} was not found." +# raise RuntimeError(error_message) from e + +# return None + + +# def get_hipcc_rocm_version(): +# # Run the hipcc --version command +# result = subprocess.run(['hipcc', '--version'], +# stdout=subprocess.PIPE, +# stderr=subprocess.STDOUT, +# text=True) + +# # Check if the command was executed successfully +# if result.returncode != 0: +# print("Error running 'hipcc --version'") +# return None + +# # Extract the version using a regular expression +# match = re.search(r'HIP version: (\S+)', result.stdout) +# if match: +# # Return the version string +# return match.group(1) +# else: +# print("Could not find HIP version in the output") +# return None + + +# def get_nvcc_cuda_version(cuda_dir: str) -> Version: +# """Get the CUDA version from nvcc. + +# Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py +# """ +# nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], +# universal_newlines=True) +# output = nvcc_output.split() +# release_idx = output.index("release") + 1 +# nvcc_cuda_version = parse(output[release_idx].split(",")[0]) +# return nvcc_cuda_version + + +# def get_torch_arch_list() -> Set[str]: +# # TORCH_CUDA_ARCH_LIST can have one or more architectures, +# # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the +# # compiler to additionally include PTX code that can be runtime-compiled +# # and executed on the 8.6 or newer architectures. While the PTX code will +# # not give the best performance on the newer architectures, it provides +# # forward compatibility. +# env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) +# if env_arch_list is None: +# return set() + +# # List are separated by ; or space. +# torch_arch_list = set(env_arch_list.replace(" ", ";").split(";")) +# if not torch_arch_list: +# return set() + +# # Filter out the invalid architectures and print a warning. +# valid_archs = NVIDIA_SUPPORTED_ARCHS.union( +# {s + "+PTX" +# for s in NVIDIA_SUPPORTED_ARCHS}) +# arch_list = torch_arch_list.intersection(valid_archs) +# # If none of the specified architectures are valid, raise an error. +# if not arch_list: +# raise RuntimeError( +# "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " +# f"variable ({env_arch_list}) is supported. " +# f"Supported CUDA/ROCM architectures are: {valid_archs}.") +# invalid_arch_list = torch_arch_list - valid_archs +# if invalid_arch_list: +# warnings.warn( +# f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " +# "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " +# f"({env_arch_list}). 
Supported CUDA/ROCM architectures are: " +# f"{valid_archs}.", +# stacklevel=2) +# return arch_list + + +# # First, check the TORCH_CUDA_ARCH_LIST environment variable. +# compute_capabilities = get_torch_arch_list() +# if _is_cuda() and not compute_capabilities: +# # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available +# # GPUs on the current machine. +# device_count = torch.cuda.device_count() +# for i in range(device_count): +# major, minor = torch.cuda.get_device_capability(i) +# if major < 7: +# raise RuntimeError( +# "GPUs with compute capability below 7.0 are not supported.") +# compute_capabilities.add(f"{major}.{minor}") + +# if _is_cuda(): +# nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) +# if not compute_capabilities: +# # If no GPU is specified nor available, add all supported architectures +# # based on the NVCC CUDA version. +# compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() +# if nvcc_cuda_version < Version("11.1"): +# compute_capabilities.remove("8.6") +# if nvcc_cuda_version < Version("11.8"): +# compute_capabilities.remove("8.9") +# compute_capabilities.remove("9.0") +# # Validate the NVCC CUDA version. +# if nvcc_cuda_version < Version("11.0"): +# raise RuntimeError( +# "CUDA 11.0 or higher is required to build the package.") +# if (nvcc_cuda_version < Version("11.1") +# and any(cc.startswith("8.6") for cc in compute_capabilities)): +# raise RuntimeError( +# "CUDA 11.1 or higher is required for compute capability 8.6.") +# if nvcc_cuda_version < Version("11.8"): +# if any(cc.startswith("8.9") for cc in compute_capabilities): +# # CUDA 11.8 is required to generate the code targeting compute capability 8.9. +# # However, GPUs with compute capability 8.9 can also run the code generated by +# # the previous versions of CUDA 11 and targeting compute capability 8.0. +# # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0 +# # instead of 8.9. +# warnings.warn( +# "CUDA 11.8 or higher is required for compute capability 8.9. " +# "Targeting compute capability 8.0 instead.", +# stacklevel=2) +# compute_capabilities = set(cc for cc in compute_capabilities +# if not cc.startswith("8.9")) +# compute_capabilities.add("8.0+PTX") +# if any(cc.startswith("9.0") for cc in compute_capabilities): +# raise RuntimeError( +# "CUDA 11.8 or higher is required for compute capability 9.0.") + +# # Add target compute capabilities to NVCC flags. +# for capability in compute_capabilities: +# num = capability[0] + capability[2] +# NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] +# if capability.endswith("+PTX"): +# NVCC_FLAGS += [ +# "-gencode", f"arch=compute_{num},code=compute_{num}" +# ] + +# # Use NVCC threads to parallelize the build. 
+# if nvcc_cuda_version >= Version("11.2"): +# nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) +# num_threads = min(os.cpu_count(), nvcc_threads) +# NVCC_FLAGS += ["--threads", str(num_threads)] + +# elif _is_hip(): +# amd_arch = get_amdgpu_offload_arch() +# if amd_arch not in ROCM_SUPPORTED_ARCHS: +# raise RuntimeError( +# f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" +# f"amdgpu_arch_found: {amd_arch}") + +# # ext_modules = [] + +# if _is_cuda() or _is_hip(): +# vllm_extension_sources = [ +# "csrc/cache_kernels.cu", +# "csrc/attention/attention_kernels.cu", +# "csrc/pos_encoding_kernels.cu", +# "csrc/activation_kernels.cu", +# "csrc/layernorm_kernels.cu", +# "csrc/quantization/squeezellm/quant_cuda_kernel.cu", +# "csrc/cuda_utils_kernels.cu", +# "csrc/pybind.cpp", +# ] + +# if _is_cuda(): +# vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") +# vllm_extension_sources.append("csrc/quantization/gptq/q_gemm.cu") + +# vllm_extension = CUDAExtension( +# name="vllm._C", +# sources=vllm_extension_sources, +# extra_compile_args={ +# "cxx": CXX_FLAGS, +# "nvcc": NVCC_FLAGS, +# }, +# ) +# ext_modules.append(vllm_extension) def get_path(*filepath) -> str: @@ -259,17 +259,17 @@ def find_version(filepath: str) -> str: def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) - if _is_hip(): - # Get the HIP version - hipcc_version = get_hipcc_rocm_version() - if hipcc_version != MAIN_CUDA_VERSION: - rocm_version_str = hipcc_version.replace(".", "")[:3] - version += f"+rocm{rocm_version_str}" - else: - cuda_version = str(nvcc_cuda_version) - if cuda_version != MAIN_CUDA_VERSION: - cuda_version_str = cuda_version.replace(".", "")[:3] - version += f"+cu{cuda_version_str}" + # if _is_hip(): + # # Get the HIP version + # hipcc_version = get_hipcc_rocm_version() + # if hipcc_version != MAIN_CUDA_VERSION: + # rocm_version_str = hipcc_version.replace(".", "")[:3] + # version += f"+rocm{rocm_version_str}" + # else: + # cuda_version = str(nvcc_cuda_version) + # if cuda_version != MAIN_CUDA_VERSION: + # cuda_version_str = cuda_version.replace(".", "")[:3] + # version += f"+cu{cuda_version_str}" return version diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index d90ba37b27bb9..2eb1b2606b80e 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -26,7 +26,7 @@ def api_server(): "api_server_async_engine.py").absolute() uvicorn_process = subprocess.Popen([ sys.executable, "-u", - str(script_path), "--model", "facebook/opt-125m" + str(script_path), "--model", "lmsys/vicuna-7b-v1.3" ]) yield uvicorn_process.terminate() diff --git a/tests/conftest.py b/tests/conftest.py index 16c04e01d703c..7b73aaff6f6c9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -57,7 +57,7 @@ def __init__( model_name, torch_dtype=torch_dtype, trust_remote_code=True, - ).cuda() + )#.cuda() if tokenizer_name is None: tokenizer_name = model_name self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True) @@ -71,7 +71,7 @@ def generate( for prompt in prompts: input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids output_ids = self.model.generate( - input_ids.cuda(), + input_ids,#.cuda(), use_cache=True, **kwargs, ) @@ -127,7 +127,7 @@ def generate_greedy_logprobs( for prompt in prompts: input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids output = self.model.generate( - input_ids.cuda(), + input_ids,#.cuda(), use_cache=True, do_sample=False, 
max_new_tokens=max_tokens, diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 614b65f82ccbd..c6755b4d8f34e 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -97,7 +97,7 @@ def ref_single_query_cached_kv_attention( output[i].copy_(out, non_blocking=True) -@pytest.mark.parametrize("version", ["v1", "v2"]) +@pytest.mark.parametrize("version", ["v1"])#, "v2"]) @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -162,8 +162,7 @@ def test_paged_attention( # Call the paged attention kernel. output = torch.empty_like(query) if version == "v1": - ops.paged_attention_v1( - output, + output = ops.paged_attention_v1( query, key_cache, value_cache, diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index a491ffa763505..4cf777e2b9e6f 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -10,7 +10,7 @@ # 3. Use the model "huggyllama/llama-7b". MAX_TOKENS = [128] BEAM_WIDTHS = [4] -MODELS = ["facebook/opt-125m"] +MODELS = ["lmsys/vicuna-7b-v1.3"] @pytest.mark.parametrize("model", MODELS) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 1c67cc5bd7394..24b1572d9a325 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -3,7 +3,7 @@ from vllm import SamplingParams -MODELS = ["facebook/opt-125m"] +MODELS = ["lmsys/vicuna-7b-v1.3"] @pytest.mark.parametrize("model", MODELS) @@ -16,7 +16,7 @@ def test_get_prompt_logprobs( example_prompts, ): max_tokens = 5 - hf_model = hf_runner(model, dtype=dtype) + hf_model = hf_runner(model, dtype="float") hf_logprobs = hf_model.generate_greedy_logprobs( example_prompts, max_tokens=max_tokens, @@ -24,6 +24,8 @@ def test_get_prompt_logprobs( del hf_model vllm_model = vllm_runner(model, dtype=dtype) + import pdb + pdb.set_trace() vllm_sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=5, prompt_logprobs=5, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py old mode 100755 new mode 100644 diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py old mode 100755 new mode 100644 index 6482875d1c55b..1da061a6a52c3 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -143,17 +143,22 @@ def forward( key = key.unflatten(0, (batch_size, seq_len)) value = value.unflatten(0, (batch_size, seq_len)) + cu_seq_lens = [0] + for i in range(len(input_metadata.prompt_lens)): + cu_seq_lens.append(cu_seq_lens[-1] + input_metadata.prompt_lens[i]) + input_metadata.cu_seq_lens = cu_seq_lens out = xops.memory_efficient_attention_forward( query, key, value, + cu_seq_lens=cu_seq_lens, attn_bias=input_metadata.attn_bias, p=0.0, scale=self.scale, - op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if - (is_hip()) else None, ) - output = out.view_as(query) + output = torch.zeros_like(query) + output[:, :out.shape[1], :, :] = out + output = output.view_as(query) else: # Decoding run. if key_cache is not None and value_cache is not None: @@ -236,8 +241,7 @@ def _paged_attention( max_num_partitions == 1 or num_seqs * num_heads > 512) if use_v1: # Run PagedAttention V1. 
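# A minimal sketch, not taken from the diff, of the calling-convention change
# around paged_attention_v1 in this patch set: the CUDA kernel fills a
# caller-allocated `output` buffer in place, while the Python-level HPU op in
# vllm/hpu/ops.py builds and returns a fresh tensor shaped like `query`, so
# call sites have to capture the return value. The wrapper name below is
# hypothetical.
from vllm.utils import is_hpu

if is_hpu():
    from vllm.hpu import ops
else:
    from vllm._C import ops

def run_paged_attention_v1(output, query, *args):
    if is_hpu():
        # HPU path: the op allocates and returns the attention output.
        return ops.paged_attention_v1(query, *args)
    # CUDA path: the kernel writes into `output` and returns None.
    ops.paged_attention_v1(output, query, *args)
    return output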
- ops.paged_attention_v1( - output, + output = ops.paged_attention_v1( query, key_cache, value_cache, diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py old mode 100755 new mode 100644 index bf63d905b4378..ccfd55806d5c6 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -54,6 +54,12 @@ def forward( residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if residual is not None: + if x.device.type == "hpu" and FusedRMSNorm: + orig_dtype = x.dtype + residual += x + x = FusedRMSNorm.apply(residual.float(), self.weight.float(), self.variance_epsilon) + return x.to(orig_dtype), residual + ops.fused_add_rms_norm( x, residual, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py old mode 100755 new mode 100644 index 40d61e2e91240..ecd78bd008d25 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -454,8 +454,8 @@ def get_rope( return _ROPE_DICT[key] if rope_scaling is None: - rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style) + rotary_emb = LlamaRotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style) else: scaling_type = rope_scaling["type"] scaling_factor = rope_scaling["factor"] diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index b3b24ea6fea44..7722cc140326d 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -283,6 +283,7 @@ def forward( kv_caches: List[KVCache], input_metadata: InputMetadata, ) -> torch.Tensor: + print(f'Input shape: {input_ids.shape}') hidden_states = self.model(input_ids, positions, kv_caches, input_metadata) return hidden_states diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 49013ec273787..388e55ba92e67 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -146,7 +146,7 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], dtype: torch.dtype) -> "SamplingTensors": # Note that the performance will be very bad without # pinned memory. 
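# A short illustrative note, not part of the diff: pinned (page-locked) host
# memory only speeds up asynchronous host-to-device copies for CUDA devices,
# and combining it with the Habana bridge produced the segfaults addressed by
# the "Fix post-merge pinned memory segfaults" commit, which is why pin_memory
# is gated on the target device here and in model_runner/cache_engine. A
# device-aware helper along these lines (the name is hypothetical) captures
# the recurring pattern:
import torch
from vllm.utils import in_wsl

def want_pinned_memory(device: torch.device) -> bool:
    # Pin host buffers only for CUDA targets outside WSL; skip HPU and CPU.
    return device.type == "cuda" and not in_wsl()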
- pin_memory = not in_wsl() + pin_memory = not in_wsl() and not device.type == "hpu" prompt_max_len = max(len(tokens) for tokens in prompt_tokens) prompt_padded_tokens = [ tokens + [vocab_size] * (prompt_max_len - len(tokens)) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d901035b1b2e8..22c53005aa7f5 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -8,6 +8,7 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.model_executor import set_random_seed +from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.parallel_utils.parallel_state import ( initialize_model_parallel) from vllm.sequence import SamplerOutput, SequenceGroupMetadata From 00df4867965c9921c1ab130e1ed807064bab161e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 21 Dec 2023 17:09:16 +0000 Subject: [PATCH 14/43] Minor Gaudi workarounds, add debugging to stock vLLM API server --- vllm/entrypoints/api_server.py | 14 +++++++++++++- vllm/entrypoints/openai/api_server.py | 4 +++- vllm/worker/model_runner.py | 2 +- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 6910b3265dfd2..b120210831fe5 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -1,7 +1,9 @@ import argparse import json from typing import AsyncGenerator - +import torch +import habana_frameworks.torch.core as htcore +import habana_frameworks.torch.gpu_migration from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse import uvicorn @@ -66,6 +68,16 @@ async def stream_results() -> AsyncGenerator[bytes, None]: prompt = final_output.prompt text_outputs = [prompt + output.text for output in final_output.outputs] ret = {"text": text_outputs} + DEBUG = True + if DEBUG: + text_tokens = [output.token_ids for output in final_output.outputs] + from vllm.transformers_utils.tokenizer import get_tokenizer + tokenizer = get_tokenizer('lmsys/vicuna-7b-v1.3') + decoded_tokens = [tokenizer.decode(token_ids) for token_ids in text_tokens] + ret["DEBUG"] = { + 'tokens': text_tokens, + 'decoded_tokens': decoded_tokens, + } return JSONResponse(ret) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index be5f4190e633f..bb5b921123460 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -8,7 +8,9 @@ import time from http import HTTPStatus from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union - +import torch +import habana_frameworks.torch.core as htcore +import habana_frameworks.torch.gpu_migration from aioprometheus import MetricsMiddleware from aioprometheus.asgi.starlette import metrics import fastapi diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 276ef0708847a..44f447420295a 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -402,7 +402,7 @@ def capture_model(self, kv_caches: List[KVCache]) -> None: input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda() input_positions = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda() - slot_mapping = torch.empty(max_batch_size, 1, dtype=torch.long).cuda() + slot_mapping = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda() # FIXME (kzawora): revert this to torch.empty after bridge bug is fixed slot_mapping.fill_(_PAD_SLOT_ID) context_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda() 
block_tables = torch.from_numpy(self.graph_block_tables).cuda() From 16b55577160e38b64bc79b1a65369abe2a3460a8 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 21 Dec 2023 17:34:09 +0000 Subject: [PATCH 15/43] Fix post-merge pinned memory segfaults --- vllm/utils.py | 4 ++++ vllm/worker/model_runner.py | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index eff5d10fd4ee0..b1af1c740e9b9 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -59,3 +59,7 @@ def get_open_port(): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", 0)) return s.getsockname()[1] + + +def is_hpu() -> bool: + return getattr(torch, 'hpu', None) is not None and torch.hpu.is_available() \ No newline at end of file diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 489cd16dabcfb..89dd1f054a258 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -10,7 +10,7 @@ from vllm.model_executor import get_model, InputMetadata, SamplingMetadata from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata -from vllm.utils import in_wsl +from vllm.utils import in_wsl, is_hpu logger = init_logger(__name__) @@ -307,9 +307,9 @@ def _prepare_sample( selected_token_indices = _async_h2d(selected_token_indices, dtype=torch.long, - pin_memory=not self.in_wsl) + pin_memory=not is_hpu() and not self.in_wsl) categorized_sample_indices = { - t: _async_h2d(seq_ids, dtype=torch.int, pin_memory=not self.in_wsl) + t: _async_h2d(seq_ids, dtype=torch.int, pin_memory=not is_hpu() and not self.in_wsl) for t, seq_ids in categorized_sample_indices.items() } From 2b6ec4e3438b131cbc211e743aa427a655cb9bb7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 21 Dec 2023 17:43:01 +0000 Subject: [PATCH 16/43] Re-enable sequence decode --- vllm/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6ad70936dfd32..481ba1a17c808 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -425,7 +425,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, child_seqs.append((parent, parent)) for seq, _ in child_seqs: - # self._decode_sequence(seq, seq_group.sampling_params) + self._decode_sequence(seq, seq_group.sampling_params) self._check_stop(seq, seq_group.sampling_params) # Non-beam search case From 9d4bd9f7d55c88cd3b887d9be1ecd69e97bbb716 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 22 Dec 2023 14:58:12 +0200 Subject: [PATCH 17/43] Maintain GPU compatibility in cache_engine --- vllm/worker/cache_engine.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index d7b4df272523d..0eaa3fa28f246 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,7 +6,7 @@ from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import in_wsl +from vllm.utils import in_wsl, is_hpu logger = init_logger(__name__) @@ -41,8 +41,10 @@ def __init__( self.num_cpu_blocks = cache_config.num_cpu_blocks # Initialize the cache. 
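# An illustrative aside, not part of the diff: the is_hpu() helper added to
# vllm/utils.py above checks getattr(torch, 'hpu', None) before calling
# torch.hpu.is_available(), because the torch.hpu namespace only exists once
# habana_frameworks has been imported (as the API-server changes above do via
# habana_frameworks.torch.gpu_migration); on CUDA-only installs it therefore
# returns False instead of raising AttributeError. Typical usage, with a
# hypothetical helper name:
from vllm.utils import is_hpu

def default_device_type() -> str:
    # Prefer Gaudi when it is actually visible, otherwise keep stock CUDA
    # behaviour so the same build still serves GPUs.
    return "hpu" if is_hpu() else "cuda"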
- # self.gpu_cache = self.allocate_gpu_cache() - self.gpu_cache = self.allocate_hpu_cache() + if is_hpu(): + self.gpu_cache = self.allocate_hpu_cache() + else: + self.gpu_cache = self.allocate_gpu_cache() self.cpu_cache = self.allocate_cpu_cache() # Initialize the stream for caching operations. @@ -113,7 +115,7 @@ def allocate_cpu_cache(self) -> List[KVCache]: key_block_shape = self.get_key_block_shape() value_block_shape = self.get_value_block_shape() # pin_memory = not in_wsl() - pin_memory = not in_wsl() and not torch.hpu.is_available() + pin_memory = not in_wsl() and not is_hpu() if not pin_memory: # Pinning memory in WSL is not supported. # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications From 7a0337ab69527a375220dbaa5a7c1699717b690e Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Wed, 10 Jan 2024 15:48:29 +0200 Subject: [PATCH 18/43] Adjust HPU RoPE for non-query runs --- vllm/model_executor/layers/rotary_embedding.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index ecd78bd008d25..94aa842f71ff2 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -145,8 +145,12 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tenso key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) if query.device.type == "hpu" and FusedRoPE: #print('using FusedRoPE') - cos = cos[positions].unsqueeze(2) - sin = sin[positions].unsqueeze(2) + if len(positions[0]) == 1: + cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) + sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) + else: + cos = cos[positions].unsqueeze(2) + sin = sin[positions].unsqueeze(2) query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) else: #print('using torch RoPE') From 6351d4115727d79f7ee5e5bba3a5dd94045cb3fa Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Tue, 23 Jan 2024 15:13:15 +0200 Subject: [PATCH 19/43] Integrate HPU primitive implementations --- .../kernels/benchmark_paged_attention.py | 6 +- tests/kernels/conftest.py | 6 +- tests/kernels/test_attention.py | 17 +- tests/kernels/test_cache.py | 6 +- vllm/hpu/__init__.py | 11 + vllm/hpu/attn_bias.py | 764 ++++++++++++++++++ vllm/hpu/cache_ops.py | 155 ++++ vllm/hpu/cuda_utils.py | 14 + vllm/hpu/ops.py | 172 ++++ vllm/model_executor/layers/activation.py | 6 +- vllm/model_executor/layers/attention.py | 19 +- vllm/model_executor/layers/layernorm.py | 6 +- .../model_executor/layers/quantization/awq.py | 6 +- .../layers/quantization/gptq.py | 6 +- .../layers/quantization/squeezellm.py | 7 +- .../model_executor/layers/rotary_embedding.py | 6 +- vllm/utils.py | 16 +- vllm/worker/cache_engine.py | 6 +- 18 files changed, 1201 insertions(+), 28 deletions(-) create mode 100644 vllm/hpu/__init__.py create mode 100644 vllm/hpu/attn_bias.py create mode 100644 vllm/hpu/cache_ops.py create mode 100644 vllm/hpu/cuda_utils.py create mode 100644 vllm/hpu/ops.py diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 935393e9942ce..f22acca3b7909 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -4,7 +4,11 @@ import torch -from vllm._C import ops +from vllm.utils import is_hpu() +if is_hpu(): + 
from vllm.hpu import ops +else: + from vllm._C import ops NUM_BLOCKS = 1024 PARTITION_SIZE = 512 diff --git a/tests/kernels/conftest.py b/tests/kernels/conftest.py index 97516bd3052cf..80b62e4e0ef7d 100644 --- a/tests/kernels/conftest.py +++ b/tests/kernels/conftest.py @@ -2,6 +2,7 @@ import pytest import torch +from vllm.utils import is_hpu def create_kv_caches( @@ -18,7 +19,10 @@ def create_kv_caches( scale = head_size**-0.5 x = 16 // torch.tensor([], dtype=dtype).element_size() - key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) + if is_hpu(): + key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) + else: + key_cache_shape = (num_blocks, num_heads, head_size, block_size) key_caches = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index c6755b4d8f34e..8ff15aceac542 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -4,10 +4,14 @@ import pytest import torch from xformers import ops as xops -from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask -from vllm._C import ops -from vllm.utils import get_max_shared_memory_bytes +from vllm.utils import get_max_shared_memory_bytes, is_hpu +if is_hpu(): + from vllm.hpu import ops + from vllm.hpu.attn_bias import BlockDiagonalCausalMask +else: + from vllm._C import ops + from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. @@ -71,8 +75,11 @@ def ref_single_query_cached_kv_attention( block_number = int(block_table[j // block_size]) block_offset = j % block_size - k = key_cache[block_number, :, :, block_offset, :] - k = k.reshape(num_kv_heads, head_size) + if is_hpu(): + k = key_cache[block_number, :, :, block_offset] + else: + k = key_cache[block_number, :, :, block_offset, :] + k = k.reshape(num_kv_heads, head_size) keys.append(k) v = value_cache[block_number, :, :, block_offset] diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 9b5d7687a3fec..bdef59b3b86b1 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -3,7 +3,11 @@ import pytest import torch -from vllm._C import cache_ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import cache_ops +else: + from vllm._C import cache_ops DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [83] # Arbitrary values for testing diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py new file mode 100644 index 0000000000000..ce3a3ce5d435c --- /dev/null +++ b/vllm/hpu/__init__.py @@ -0,0 +1,11 @@ +############################################################################### +# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. +# +############################################################################### \ No newline at end of file diff --git a/vllm/hpu/attn_bias.py b/vllm/hpu/attn_bias.py new file mode 100644 index 0000000000000..ac3ce8e6784cc --- /dev/null +++ b/vllm/hpu/attn_bias.py @@ -0,0 +1,764 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 
+# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + + +import math +from dataclasses import dataclass +from typing import Any, Iterable, List, Optional, Sequence, Tuple, Union + +import torch + + +class AttentionBias: + """Base class for a custom bias that can be applied \ + as the attn_bias argument in + :attr:`xformers.ops.memory_efficient_attention`. + + That function has the ability to add a tensor, the + attention bias, to the QK^T matrix before it is used + in the softmax part of the attention calculation. + The attention bias tensor with shape + (B or 1, n_queries, number of keys) + can be given as the attn_bias input. + The most common use case is for an attention bias is + to contain only zeros and negative infinities, which forms + a mask so that some queries only attend to some keys. + + Children of this class define alternative things which can + be used as the attn_bias input to define an attention bias which + forms such a mask, for some common cases. + + When using an :attr:`xformers.ops.AttentionBias` + instead of a :attr:`torch.Tensor`, the mask matrix does + not need to be materialized, and can be + hardcoded into some kernels for better performance. + + See: + + - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMask` + - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias` + - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask` + - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask` + + """ + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + """ + Materializes the bias as a `torch.Tensor`. This is very slow + and we don't attempt to make it fast. Only use for debugging/testing. + + Shape should be like `[*, q_seqlen, k_seqlen]` + """ + raise NotImplementedError() + + +class LowerTriangularMask(AttentionBias): + """ + A lower-triangular (aka causal) mask + + A query Q cannot attend to a key which is farther from the + initial key than Q is from the initial query. + """ + + def __init__(self, *tensor_args, **tensor_kwargs) -> None: + # NOTE: Unused arguments, we keep them for backward compatibility + super().__init__() + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=float("-inf"), + device=device, + ) + return torch.triu(tensor, diagonal=1).to(dtype) # type: ignore + + def add_bias(self, bias: torch.Tensor) -> "LowerTriangularMaskWithTensorBias": + return LowerTriangularMaskWithTensorBias(bias) + + +class LowerTriangularMaskWithTensorBias(LowerTriangularMask): + """A lower-triangular (aka causal) mask with an additive bias""" + + def __init__(self, bias: torch.Tensor) -> None: + self._bias = bias + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + return super().materialize(shape, dtype=dtype, device=device) + self._bias + + +@dataclass +class _SeqLenInfo: + """ + (Internal) Represents the division of a dimension into blocks. + + For example, to represents a dimension of length 7 divided into + three blocks of lengths 2, 3 and 2, use `from_seqlength([2, 3, 2])`. 
+ The members will be: + max_seqlen: 3 + min_seqlen: 2 + seqstart_py: [0, 2, 5, 7] + seqstart: torch.IntTensor([0, 2, 5, 7]) + """ + + seqstart: torch.Tensor + max_seqlen: int + min_seqlen: int + seqstart_py: List[int] + + def to(self, device: torch.device) -> None: + self.seqstart = self.seqstart.to(device, non_blocking=True) + + def intervals(self) -> Iterable[Tuple[int, int]]: + yield from zip(self.seqstart_py, self.seqstart_py[1:]) + + @classmethod + def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo": + """ + Input tensors are assumed to be in shape [B, M, *] + """ + assert not isinstance(seqlens, torch.Tensor) + seqstart_py = [0] + max_seqlen = -1 + min_seqlen = -1 + for seqlen in seqlens: + min_seqlen = min(min_seqlen, seqlen) if min_seqlen != -1 else seqlen + max_seqlen = max(max_seqlen, seqlen) + seqstart_py.append(seqstart_py[len(seqstart_py) - 1] + seqlen) + seqstart = torch.tensor(seqstart_py, dtype=torch.int32) + return cls( + max_seqlen=max_seqlen, + min_seqlen=min_seqlen, + seqstart=seqstart, + seqstart_py=seqstart_py, + ) + + def split( + self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None + ) -> List[torch.Tensor]: + if self.seqstart_py[-1] != x.shape[1] or x.shape[0] != 1: + raise ValueError( + f"Invalid `torch.Tensor` of shape {x.shape}, expected format " + f"(B, M, *) with B=1 and M={self.seqstart_py[-1]}\n" + f" seqstart: {self.seqstart_py}" + ) + if batch_sizes is None: + batch_sizes = [1] * (len(self.seqstart_py) - 1) + split_chunks = [] + it = 0 + for batch_size in batch_sizes: + split_chunks.append( + self.seqstart_py[it + batch_size] - self.seqstart_py[it] + ) + it += batch_size + return [ + tensor.reshape([bs, -1, *tensor.shape[2:]]) + for bs, tensor in zip(batch_sizes, x.split(split_chunks, dim=1)) + ] + + +@dataclass +class _PaddedSeqLenInfo(_SeqLenInfo): + """ + (Internal) Represents the division of a dimension into blocks which are + padded out to the same total length. + + For example, to represent a dimension of length 12 with space for + three blocks of length 4, but where the occupied lengths are + 2, 3 and 2, use `from_seqlens_padded([2, 3, 2], 4)`. 
+ + The layout along the dimension is + + 0 ─► block 0 + block 0 + + + 4 ─► block 1 + block 1 + block 1 + + 8 ─► block 2 + block 2 + + + 12 ─► + + The members will be: + max_seqlen: 3 + min_seqlen: 2 + seqstart_py: [0, 4, 8, 12] + seqstart: torch.IntTensor([0, 4, 8, 12]) + seqlen_py: [2, 3, 2] + seqlen: torch.IntTensor([2, 3, 2]) + padding: 4 + """ + + seqlen: torch.Tensor + seqlen_py: Sequence[int] + padding: int + # From parent: seqstart[i] contains the start position + # of the i-th sequence + # seqstart: torch.Tensor + + def __post_init__(self) -> None: + assert len(self.seqstart_py) == len(self.seqlen_py) + 1 + + def to(self, device: torch.device) -> None: + self.seqlen = self.seqlen.to(device, non_blocking=True) + super().to(device) + + def intervals(self) -> Iterable[Tuple[int, int]]: + for (start, _), length in zip(super().intervals(), self.seqlen_py): + yield start, start + length + + @classmethod + def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo": + raise RuntimeError( + "Use either `_SeqLenInfo.from_seqlens` or `_PaddedSeqLenInfo.from_seqlens_padded`" + ) + + @classmethod + def from_seqlens_padded( + cls, seqlens: Sequence[int], padding: int + ) -> "_PaddedSeqLenInfo": + """ + Input tensors are assumed to be in shape [B, M, *] + seqstart = padding * torch.arange(batch_size) + """ + assert not isinstance(seqlens, torch.Tensor) + assert all(seqlen <= padding for seqlen in seqlens) + seqstart_py = list(range(0, len(seqlens) * padding + 1, padding)) + return cls( + seqlen=torch.tensor(seqlens, dtype=torch.int32), + seqlen_py=seqlens, + max_seqlen=max(seqlens), + min_seqlen=min(seqlens), + seqstart=torch.tensor(seqstart_py, dtype=torch.int32), + seqstart_py=seqstart_py, + padding=padding, + ) + + def split( + self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None + ) -> List[torch.Tensor]: + raise NotImplementedError("_PaddedSeqLenInfo.split") + + +@dataclass +class BlockDiagonalMask(AttentionBias): + """ + A block-diagonal mask that can be passed as ``attn_bias`` + argument to :attr:`xformers.ops.memory_efficient_attention`. + + Queries and Keys are each divided into the same number of blocks. + Queries in block i only attend to keys in block i. + + .. figure:: /_static/block_diag_bias.png + + This bias can be used to handle a batch of sequences of + different lengths, via :attr:`BlockDiagonalMask.from_tensor_list` + + :Example: + + .. 
code-block:: python + + import torch + from xformers.ops import fmha + + K = 16 + dtype = torch.float16 + device = "cuda" + list_x = [ + torch.randn([1, 3, 1, K], dtype=dtype, device=device), + torch.randn([1, 6, 1, K], dtype=dtype, device=device), + torch.randn([1, 2, 1, K], dtype=dtype, device=device), + ] + attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x) + linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype) + + q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2) + out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias) + list_out = attn_bias.split(out) + print(list_out[0].shape) # [1, 3, 1, K] + assert tuple(list_out[0].shape) == (1, 3, 1, K) + + """ + + q_seqinfo: _SeqLenInfo + k_seqinfo: _SeqLenInfo + _batch_sizes: Optional[Sequence[int]] = None + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + return torch.zeros( + shape, + dtype=dtype, + device=device, + ) + + def materialize( + self, + shape: Optional[Tuple[int, ...]] = None, + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + """Materialize the attention bias - for debugging & testing""" + if shape is None: + shape = (self.q_seqinfo.seqstart_py[-1], + self.k_seqinfo.seqstart_py[-1]) + assert shape[-1] == self.k_seqinfo.seqstart_py[-1], ( + shape[-1], + self.k_seqinfo.seqstart_py[-1], + ) + assert shape[-2] == self.q_seqinfo.seqstart_py[-1], ( + shape[-2], + self.q_seqinfo.seqstart_py[-1], + ) + mask = torch.empty(shape[-2:], dtype=dtype, device=device) + mask.fill_(-math.inf) + for i, ((q_start, q_end), (k_start, k_end)) in enumerate( + zip( + self.q_seqinfo.intervals(), + self.k_seqinfo.intervals(), + ) + ): + mask[q_start:q_end, k_start:k_end] = self._create_block_mask( + (q_end - q_start, k_end - k_start), + dtype=dtype, + device=device, + ) + for _ in range(len(shape) - 2): + mask = mask.unsqueeze(0) + return mask.expand(shape) + + @classmethod + def from_seqlens( + cls, + q_seqlen: Sequence[int], + kv_seqlen: Optional[Sequence[int]] = None, + ) -> "BlockDiagonalMask": + """Creates a :attr:`BlockDiagonalMask` from a list of tensors lengths for query and key/value. + + Args: + q_seqlen (Union[Sequence[int], torch.Tensor]): List or tensor of sequence lengths for query tensors + kv_seqlen (Union[Sequence[int], torch.Tensor], optional): List or tensor of sequence lengths for key/value. + (Defaults to ``q_seqlen``.) + Returns: + BlockDiagonalMask + """ + assert kv_seqlen is None or len(q_seqlen) == len(kv_seqlen) + q_seqinfo = _SeqLenInfo.from_seqlens(q_seqlen) + if kv_seqlen is None or q_seqlen == kv_seqlen: + k_seqinfo = q_seqinfo + else: + k_seqinfo = _SeqLenInfo.from_seqlens(kv_seqlen) + return cls(q_seqinfo=q_seqinfo, k_seqinfo=k_seqinfo) + + @classmethod + def from_tensor_list( + cls, + tensors: Sequence[torch.Tensor], + ) -> Tuple["BlockDiagonalMask", torch.Tensor]: + """Creates a :attr:`BlockDiagonalMask` from a list of tensors, and returns the tensors + concatenated on the sequence length dimension + + .. figure:: /_static/block_diag_cat_split.png + + See also :attr:`BlockDiagonalMask.split` to split the returned + :attr:`torch.Tensor` back to a list of tensors of varying sequence length + + Args: + tensors (Sequence[torch.Tensor]): A list of tensors of shape ``[B, M_i, *]``. + All tensors should have the same dimension and the same batch size ``B``, but + they can have different sequence length ``M``. 
+ + Returns: + Tuple[BlockDiagonalMask, torch.Tensor]: The corresponding bias for the attention + along with `tensors` concatenated on the sequence length dimension, with shape ``[1, sum_i{M_i}, *]`` + """ + batch_sizes = [tensor.shape[0] for tensor in tensors] + seqlens = [] + for x in tensors: + for _ in range(x.shape[0]): + seqlens.append(x.shape[1]) + block_diag = cls.from_seqlens(seqlens) + block_diag._batch_sizes = batch_sizes + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in tensors) + concat_tensors = torch.cat(tensors_bs1, dim=1) + return block_diag, concat_tensors + + @classmethod + def from_tensor_lists_qkv( + cls, + tensors_q: Sequence[torch.Tensor], + tensors_k: Sequence[torch.Tensor], + tensors_v: Optional[Sequence[torch.Tensor]] = None, + ) -> Tuple["BlockDiagonalMask", torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + assert len(tensors_q) == len(tensors_k) + assert tensors_v is None or len(tensors_v) == len(tensors_q) + batch_sizes = [tensor.shape[0] for tensor in tensors_q] + q_seqlens, kv_seqlens = [], [] + for i, (q, k) in enumerate(zip(tensors_q, tensors_k)): + assert q.shape[0] == k.shape[0] + q_seqlens += [q.shape[1]] * q.shape[0] + kv_seqlens += [k.shape[1]] * k.shape[0] + assert tensors_v is None or tensors_v[i].shape[:2] == k.shape[:2] + block_diag = cls.from_seqlens(q_seqlens, kv_seqlens) + block_diag._batch_sizes = batch_sizes + return ( + block_diag, + torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_q], dim=1), + torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_k], dim=1), + torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_v], dim=1) + if tensors_v is not None + else None, + ) + + def split_queries(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: + return self.q_seqinfo.split(tensor, self._batch_sizes) + + def split_kv(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: + return self.k_seqinfo.split(tensor, self._batch_sizes) + + def split(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: + """The inverse operation of :attr:`BlockDiagonalCausalMask.from_tensor_list` + + Args: + tensor (torch.Tensor): Tensor of tokens of shape ``[1, sum_i{M_i}, *]`` + + Returns: + Sequence[torch.Tensor]: A list of tokens with possibly different sequence lengths + """ + assert self.q_seqinfo is self.k_seqinfo + return self.q_seqinfo.split(tensor, self._batch_sizes) + + def make_causal(self) -> "BlockDiagonalCausalMask": + """Makes each block causal""" + return BlockDiagonalCausalMask( + q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + ) + + def make_causal_from_bottomright(self) -> "BlockDiagonalCausalFromBottomRightMask": + """Makes each block causal with a possible non-causal prefix""" + return BlockDiagonalCausalFromBottomRightMask( + q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + ) + + def make_local_attention( + self, window_size: int + ) -> "BlockDiagonalCausalLocalAttentionMask": + """Experimental: Makes each block causal with local attention""" + return BlockDiagonalCausalLocalAttentionMask( + q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + _window_size=window_size, + ) + + def make_local_attention_from_bottomright( + self, window_size: int + ) -> "BlockDiagonalCausalLocalAttentionFromBottomRightMask": + """Experimental: Makes each block causal with local attention, start from bottom right""" + return BlockDiagonalCausalLocalAttentionFromBottomRightMask( + 
q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + _window_size=window_size, + ) + + +@dataclass +class BlockDiagonalCausalMask(BlockDiagonalMask): + """ + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal. + + Queries and Keys are each divided into the same number of blocks. + A query Q in block i cannot attend to a key which is not in block i, + nor one which is farther from the initial key in block i than Q + is from the initial query in block i. + """ + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + return LowerTriangularMask().materialize( + shape, + dtype=dtype, + device=device, + ) + + +@dataclass +class BlockDiagonalCausalFromBottomRightMask(BlockDiagonalMask): + """ + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal. + This mask allows for a non-causal prefix + NOTE: Each block should have `num_keys >= num_queries` otherwise the forward pass is not + defined (softmax of vector of `-inf` in the attention) + + Queries and keys are each divided into the same number of blocks. + A query Q in block i cannot attend to a key which is not in block i, + nor one which nearer the final key in block i than Q is to the + final query in block i. + """ + + def __post_init__(self) -> None: + for i, ((q_start, q_end), (k_start, k_end)) in enumerate( + zip( + self.q_seqinfo.intervals(), + self.k_seqinfo.intervals(), + ) + ): + num_queries = q_end - q_start + num_keys = k_end - k_start + if num_keys < num_queries: + raise ValueError( + f"Block #{i} has num_keys={num_keys} and num_queries={num_queries}." + " Expected `num_keys >= num_queries`" + ) + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=float("-inf"), + device=device, + ) + num_queries, num_keys = shape[-2:] + return torch.triu(tensor, diagonal=num_keys - num_queries + 1).to(dtype) # type: ignore + + +@dataclass +class BlockDiagonalCausalWithOffsetPaddedKeysMask(AttentionBias): + """ + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`, + except an offset on causality is allowed for each block and we support padding for k/v + + The keys and values are divided into blocks which are padded out to + the same total length. + For example, if there is space for 12 keys, for three blocks of + max length 4, but we only want to use the first 2, 3 and 2 + of each block, use `kv_padding=4` and `kv_seqlens=[2, 3, 2]`. + The queries are divided into blocks, without padding, of lengths given by + q_seqlen. + + A query Q in block i cannot attend to a key which is not in block i, + nor one which is not in use (i.e. in the padded area), + nor one which is nearer to the final key in block i + than Q is to the final query in block i. + """ + + q_seqinfo: _SeqLenInfo + k_seqinfo: _PaddedSeqLenInfo + causal_diagonal: Any = None # unused. Exists for BC only. 
+ + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=float("-inf"), + device=device, + ) + num_queries, num_keys = shape[-2:] + return torch.triu(tensor, diagonal=1 + num_keys - num_queries).to(dtype) # type: ignore + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + """Materialize the attention bias - for debugging & testing""" + if shape[-1] != self.k_seqinfo.seqstart_py[-1]: + raise ValueError("k shapes wrong") + if shape[-2] != self.q_seqinfo.seqstart_py[-1]: + raise ValueError("q shapes wrong") + mask = torch.empty(shape[-2:], dtype=dtype, device=device) + mask.fill_(-math.inf) + for i, ((q_start, q_end), (k_start, k_end)) in enumerate( + zip( + self.q_seqinfo.intervals(), + self.k_seqinfo.intervals(), + ) + ): + mask[q_start:q_end, k_start:k_end] = self._create_block_mask( + (q_end - q_start, k_end - k_start), + dtype=dtype, + device=device, + ) + for _ in range(len(shape) - 2): + mask = mask.unsqueeze(0) + return mask.expand(shape) + + @classmethod + def from_seqlens( + cls, + q_seqlen: Sequence[int], + kv_padding: int, + kv_seqlen: Sequence[int], + causal_diagonal: Any = None, + ) -> "BlockDiagonalCausalWithOffsetPaddedKeysMask": + """Creates a :attr:`BlockDiagonalCausalWithOffsetPaddedKeysMask` from a list of tensor + lengths for query and key/value. + + Args: + q_seqlen (Sequence[int]): List or tensor of sequence lengths for query tensors + kv_padding (int): Padding for k/v - also an upperbound on each individual key length + kv_seqlen (Sequence[int]): List or tensor of sequence lengths for key/value. + causal_diagonal: unused, for BC only + Returns: + BlockDiagonalCausalWithOffsetPaddedKeysMask + """ + assert kv_seqlen is None or len(q_seqlen) == len(kv_seqlen), ( + q_seqlen, + kv_seqlen, + ) + q_seqinfo = _SeqLenInfo.from_seqlens(q_seqlen) + k_seqinfo = _PaddedSeqLenInfo.from_seqlens_padded(kv_seqlen, kv_padding) + return cls(q_seqinfo=q_seqinfo, k_seqinfo=k_seqinfo) + + +@dataclass +class BlockDiagonalCausalLocalAttentionMask(BlockDiagonalCausalMask): + """ + (Experimental feature) + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`. + This makes the mask "local" and the attention pattern banded. + + Query i only attends to keys in its block and cannot attend keys further than "window_size" + from it. + """ + + _window_size: int = 0 # forced due to inheritance and default arguments + + def __post_init__(self): + if self._window_size <= 0: + raise ValueError( + f"Expected `window_size > 0`, but window_size={self._window_size}" + ) + q_seqlen = [ + y - x + for x, y in zip( + self.q_seqinfo.seqstart_py[:-1], self.q_seqinfo.seqstart_py[1:] + ) + ] + kv_seqlen = [ + y - x + for x, y in zip( + self.k_seqinfo.seqstart_py[:-1], self.k_seqinfo.seqstart_py[1:] + ) + ] + for q, k in zip(q_seqlen, kv_seqlen): + if q - self._window_size >= k: + # Each query only attends to keys no further than window_size back. + # When q > k + window_size, there will be a query for which the window doesn't reach any key. 
+ raise RuntimeError( + f"No keys are attended in q_seqlen {q} k_seqlen {k} with sliding window {self._window_size}" + ) + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=1, + device=device, + ) + + num_queries, num_keys = shape[-2:] + mask = torch.tril(tensor, diagonal=0).to(dtype) # type: ignore + if self._window_size is not None and self._window_size > 0: + mask = torch.triu(mask, diagonal=-self._window_size + 1) + mask = torch.log(mask) + return mask.to(dtype) + + +@dataclass +class BlockDiagonalCausalLocalAttentionFromBottomRightMask( + BlockDiagonalCausalFromBottomRightMask +): + """ + (Experimental feature) + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`. + This makes the mask "local" and the attention pattern banded. + + Query i only attends to keys in its block and cannot attend keys further than "window_size" + from it. + """ + + _window_size: int = 0 # forced due to inheritance and default arguments + + def __post_init__(self): + super().__post_init__() + if self._window_size <= 0: + raise ValueError( + f"Expected `window_size > 0`, but window_size={self._window_size}" + ) + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=1, + device=device, + ) + num_queries, num_keys = shape[-2:] + mask = torch.tril(tensor, diagonal=num_keys - num_queries).to(dtype) # type: ignore + if self._window_size is not None: + mask = torch.triu( + mask, diagonal=num_keys - num_queries - self._window_size + 1 + ) + mask = torch.log(mask) + return mask.to(dtype) \ No newline at end of file diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py new file mode 100644 index 0000000000000..7de5ac6f84093 --- /dev/null +++ b/vllm/hpu/cache_ops.py @@ -0,0 +1,155 @@ +############################################################################### +# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. 
+# +############################################################################### + +from typing import Tuple +import torch +import habana_frameworks.torch as htorch + + +def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, is_prompt=False): + """ + key: [num_tokens, num_heads, head_size] + value: [num_tokens, num_heads, head_size] + key_cache: [num_heads, head_size, block_size] * num_blocks + value_cache: [num_heads, head_size, block_size] * num_blocks + slot_mapping: [num_tokens] + """ + num_tokens = key.shape[0] + block_size = key_cache.shape[-1] + slot_mapping = slot_mapping.to(key.device) + # block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + if is_prompt: + # indices = torch.tensor([i for i in range(0, block_size)], device=key.device) + for i in range(0, num_tokens, block_size): + # if block_idx_list[i] < 0: + # # indices.add_(block_size) + # continue + key_cache.index_put_([block_indices[i]], key[i:i+block_size].transpose(0,1).transpose(1,2)) + value_cache.index_put_([block_indices[i]], value[i:i+block_size].transpose(0,1).transpose(1,2)) + # key_cache.index_put_([block_indices[i]], key.index_select(0, indices).transpose(0,1).transpose(1,2)) + # value_cache.index_put_([block_indices[i]], value.index_select(0, indices).transpose(0,1).transpose(1,2)) + # indices.add_(block_size) + else: + # print(key_cache.data_ptr(), key_cache.shape) + # print(key_cache[2, :, :, 2]) + key_cache = key_cache.permute(0, 3, 1, 2) + value_cache = value_cache.permute(0, 3, 1, 2) + # print(key_cache.data_ptr(), key_cache.shape) + # print(key_cache[2, 2, :, :]) + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_offsets = torch.fmod(slot_mapping, block_size) + slot_indices = torch.stack([block_indices, block_offsets], dim=-1) + index = torch.tensor(0, device=key.device) + for i in range(num_tokens): + key_cache[slot_indices[i][0], slot_indices[i][1], :, :] = key[i] # key.index_select(0, index) + value_cache[slot_indices[i][0], slot_indices[i][1], :, :] = value[i] # value.index_select(0, index) + # key_cache.index_put_([slot_indices[i]], key[i]) + # value_cache.index_put_([slot_indices[i]], value[i]) + # key_cache.index_put_([slot_indices[i]], key.index_select(0, index)) + # value_cache.index_put_([slot_indices[i]], value.index_select(0, index)) + index.add_(1) + # print(key_cache.data_ptr(), key_cache.shape) + key_cache = key_cache.permute(0, 2, 3, 1) + value_cache = value_cache.permute(0, 2, 3, 1) + # print(key_cache.data_ptr(), key_cache.shape) + + + +''' +def create_cache_view( + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_idx: int, +) -> Tuple[torch.Tensor, torch.Tensor]: + _, num_heads, head_size, block_size = key_cache.shape + cache_stride = key_cache.stride() + cache_offset = key_cache.storage_offset() + block_shape = (1, num_heads, head_size, block_size) + block_offset = block_idx * (cache_stride[-1] * cache_stride[-2] * cache_stride[-3]) + key_block = torch.as_strided(key_cache, + block_shape, + cache_stride, + cache_offset+block_offset).squeeze(0) + value_block = torch.as_strided(value_cache, + block_shape, + cache_stride, + cache_offset+block_offset).squeeze(0) + return key_block, value_block + + +def reshape_and_cache_backup1(key, value, key_cache, value_cache, slot_mapping, is_prompt=False): + """ + key: [num_tokens, num_heads, head_size] + value: [num_tokens, num_heads, head_size] + 
key_cache: [num_heads, head_size, block_size] * num_blocks + value_cache: [num_heads, head_size, block_size] * num_blocks + slot_mapping: [num_tokens] + """ + block_size = key_cache[0].shape[2] + block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + if is_prompt: + indices = torch.tensor([i for i in range(0, block_size)], device=key.device) + for i in range(0, len(block_idx_list), block_size): # for i in range(0, block_indices.shape[0], block_size): + if block_idx_list[i] < 0: + continue + block_idx_tensor = block_indices.index_select(0, torch.tensor(i, device=key.device)) + key_cache.index_put_([block_idx_tensor], key.index_select(0, indices).transpose(0,1).transpose(1,2)) + value_cache.index_put_([block_idx_tensor], value.index_select(0, indices).transpose(0,1).transpose(1,2)) + indices.add_(block_size) + else: + block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] + block_offset_list = [slot_idx % block_size for slot_idx in slot_mapping.tolist()] + index = torch.tensor(0, device=key.device) + for block_idx, block_offset in zip(block_idx_list, block_offset_list): + key_block, value_block = create_cache_view(key_cache, value_cache, block_idx) + slot_idx = torch.tensor(block_offset, device=key.device) + key_block.index_copy_(-1, slot_idx, key.index_select(0, index).transpose(0,1).transpose(1,2)) + value_block.index_copy_(-1, slot_idx, value.index_select(0, index).transpose(0,1).transpose(1,2)) + index.add_(1) + + +def reshape_and_cache_backup2(key, value, key_cache, value_cache, slot_mapping, is_prompt=False): + """ + key: [num_tokens, num_heads, head_size] + value: [num_tokens, num_heads, head_size] + key_cache: [num_heads, head_size, block_size] * num_blocks + value_cache: [num_heads, head_size, block_size] * num_blocks + slot_mapping: [num_tokens] + """ + block_size = key_cache[0].shape[2] + block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] + if is_prompt: + cached_set = set() + indices = torch.tensor([i for i in range(0, block_size)], device=key.device) + for block_idx in block_idx_list: + if block_idx in cached_set or block_idx < 0: + continue + else: + cached_set.add(block_idx) + key_block, value_block = create_cache_view(key_cache, value_cache, block_idx) + key_block.copy_(key.index_select(0, indices).transpose(0,1).transpose(1,2)) + value_block.copy_(value.index_select(0, indices).transpose(0,1).transpose(1,2)) + indices.add_(block_size) + else: + block_offset_list = [slot_idx % block_size for slot_idx in slot_mapping.tolist()] + index = torch.tensor(0, device=key.device) + # slot_idx = torch.tensor(0, device=key.device) + for block_idx, block_offset in zip(block_idx_list, block_offset_list): + key_block, value_block = create_cache_view(key_cache, value_cache, block_idx) + # slot_idx.copy_(block_offset) + slot_idx = torch.tensor(block_offset, device=key.device) + key_block.index_copy_(-1, slot_idx, key.index_select(0, index).transpose(0,1).transpose(1,2)) + value_block.index_copy_(-1, slot_idx, value.index_select(0, index).transpose(0,1).transpose(1,2)) + index.add_(1) +''' \ No newline at end of file diff --git a/vllm/hpu/cuda_utils.py b/vllm/hpu/cuda_utils.py new file mode 100644 index 0000000000000..cb067fca13cca --- /dev/null +++ b/vllm/hpu/cuda_utils.py @@ -0,0 +1,14 @@ 
+############################################################################### +# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. +# +############################################################################### + +def get_device_attribute(attribute, device_id): + return 10240 # TODO: fake value now \ No newline at end of file diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py new file mode 100644 index 0000000000000..82556ebf78b20 --- /dev/null +++ b/vllm/hpu/ops.py @@ -0,0 +1,172 @@ +############################################################################### +# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. +# +############################################################################### + +import torch +import torch.nn as nn +import torch.nn.functional as F +import habana_frameworks.torch as htorch +from typing import List, Optional, Tuple + +def silu_and_mul(output, input): + htorch.core.mark_step() + d = input.shape[-1] // 2 + silu = torch.nn.SiLU().to(input.device) + x, y = torch.split(input, d, dim=-1) + output.copy_(silu(x) * y) + htorch.core.mark_step() + +def gelu_new(output, input): + raise NotImplementedError + +def gelu_fast(output, input): + raise NotImplementedError + +def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, max_context_len, alibi_slopes, attn_masks=None) -> None: + num_kv_heads = value_cache[0].shape[0] + head_size = value_cache[0].shape[1] + block_size = value_cache[0].shape[2] + num_seqs = query.shape[0] + num_query_heads = query.shape[1] + max_num_blocks_per_seq = block_tables.shape[1] + + if alibi_slopes or num_query_heads != num_kv_heads: #or attn_masks is None: + import pdb + pdb.set_trace() + raise NotImplementedError + + attn_weights_blocks = [] + value_blocks = [] + seq_index = torch.tensor([0], dtype=torch.int64, device="hpu") + + for i in range(0, max_num_blocks_per_seq): + # hard override for filler. 
These blocks would contribute nothing to the output due to zero attention_probs and will clog up compute resources + if (i - 2) * block_size > torch.max(context_lens): + break + attn_weights = torch.full((num_seqs, num_query_heads, 1, block_size), torch.finfo(query.dtype).min, dtype=query.dtype, device="hpu") + values = torch.zeros((num_seqs, num_query_heads, head_size, block_size), dtype=query.dtype, device="hpu") + for seq_id in range(num_seqs): + seq_index.fill_(seq_id) + if i * block_size < context_lens[seq_id]: + + q = torch.index_select(query, 0, seq_index).transpose(0, 1) + key = torch.index_select(key_cache, 0, block_tables[seq_id][i]).squeeze(0) + attn_weight = scale * torch.matmul(q, key) + + if attn_masks is not None: + attn_mask = torch.index_select(attn_masks[i], 0, seq_index) + attn_weight = torch.masked_fill(attn_weight, ~(attn_mask.unsqueeze(0).to(torch.bool)), torch.finfo(attn_weight.dtype).min) + + if context_lens[seq_id] < (i + 1) * block_size: + if context_lens[seq_id] - i*block_size < 0: + attn_weight = torch.finfo(query.dtype).min + else: + attn_weight[:, :, context_lens[seq_id] - i*block_size:] = torch.finfo(query.dtype).min + attn_weights.index_copy_(0, seq_index, attn_weight.unsqueeze(0)) + #attn_weights[attn_weights == 0.0] = torch.finfo(query.dtype).min + #if (i - 2) * block_size < max_context_len: + value = torch.index_select(value_cache, 0, block_tables[seq_id][i]) + value = torch.nan_to_num(value) + value[value < -1.0e+30] = 0.0 + values.index_copy_(0, seq_index, value) + torch.hpu.synchronize() + + attn_weights_blocks.append(attn_weights.reshape(num_seqs * num_query_heads, 1, block_size)) + value_blocks.append(values.reshape(num_seqs * num_query_heads, head_size, block_size).transpose(1, 2)) + + exp_sum = torch.zeros((*attn_weights_blocks[0].shape[:2], 1), dtype=attn_weights_blocks[0].dtype, device="hpu") + for x in attn_weights_blocks: + exp_sum.add_(torch.exp(x).sum(dim=-1, keepdim=True)) + + output = torch.zeros_like(query) + for i in range(len(attn_weights_blocks)): + attention_probs = torch.exp(attn_weights_blocks[i]) / exp_sum + value = value_blocks[i] + out = torch.matmul(attention_probs.to(value.dtype), value).reshape(num_seqs, num_query_heads, head_size) + output.add_(out) + htorch.core.mark_step() + return output + +def rms_norm(out, hidden_states, weight, eps): + htorch.core.mark_step() + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + eps) + out.copy_(weight * hidden_states.to(input_dtype)) + htorch.core.mark_step() + +def rotate_neox(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def rotate_gptj(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) + + +def apply_rope( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + is_neox_style: bool, +) -> Tuple[torch.Tensor, torch.Tensor]: + rotate_fn = rotate_neox if is_neox_style else rotate_gptj + q_embed = (q * cos) + (rotate_fn(q) * sin) + k_embed = (k * cos) + (rotate_fn(k) * sin) + return q_embed, k_embed + + +def rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox_style): + raise NotImplementedError + # update query and key in-place + num_tokens = query.shape[0] + num_heads = query.shape[-1] // head_size + query = 
query.view(num_tokens, num_heads, head_size) + key = key.view(num_tokens, num_heads, head_size) + cos, sin = torch.split(cos_sin_cache, cos_sin_cache.shape[-1] // 2, dim=-1) + if is_neox_style: + sin = torch.cat((sin, sin), dim=-1) + cos = torch.cat((cos, cos), dim=-1) + else: + sin = torch.repeat_interleave(sin, 2, -1) + cos = torch.repeat_interleave(cos, 2, -1) + + query_rot = query[..., :head_size] + query_pass = query[..., head_size:] + key_rot = key[..., :head_size] + key_pass = key[..., head_size:] + + query_rot = query_rot.transpose(0, 1) + key_rot = key_rot.transpose(0, 1) + cos = F.embedding(positions, cos) + sin = F.embedding(positions, sin) + + query_rot, key_rot = apply_rope(query_rot, key_rot, cos, sin, + is_neox_style) + query_rot = query_rot.transpose(0, 1).contiguous() + key_rot = key_rot.transpose(0, 1).contiguous() + + query.copy_(torch.cat((query_rot, query_pass), dim=-1)) + key.copy_(torch.cat((key_rot, key_pass), dim=-1)) + htorch.core.mark_step() + + # Output query/key shape: [num_tokens, num_tokens, head_size] + return query, key + #raise NotImplementedError + +def awq_gemm(*args): + raise NotImplementedError diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 1af120d13cd4b..2bdd3a62b8b39 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -6,7 +6,11 @@ import torch.nn as nn import torch.nn.functional as F -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 1da061a6a52c3..d7b49e8d5a52c 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -4,13 +4,20 @@ import torch import torch.nn as nn from xformers import ops as xops -from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, - LowerTriangularMaskWithTensorBias) -from vllm._C import ops -from vllm._C import cache_ops from vllm.model_executor.input_metadata import InputMetadata -from vllm.utils import is_hip +from vllm.utils import is_hip, is_hpu + +if is_hpu(): + from vllm.hpu import ops + from vllm.hpu import cache_ops + from vllm.hpu.attn_bias import (BlockDiagonalCausalMask, + LowerTriangularMaskWithTensorBias) +else: + from vllm._C import ops + from vllm._C import cache_ops + from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, + LowerTriangularMaskWithTensorBias) _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. @@ -239,7 +246,7 @@ def _paged_attention( # For context len > 8192, use V2 kernel to avoid shared memory shortage. use_v1 = input_metadata.max_context_len <= 8192 and ( max_num_partitions == 1 or num_seqs * num_heads > 512) - if use_v1: + if use_v1 or is_hpu(): # Run PagedAttention V1. 
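            # Sketch (not part of this patch): the HPU ops.paged_attention_v1 added earlier in
            # this series walks the KV cache block by block and normalizes with a shared exp-sum
            # instead of calling a fused kernel. A minimal self-contained equivalent of that
            # normalization, with an illustrative helper name (it mirrors the code above, which
            # does not do the usual max-subtraction):
            #
            #     import torch
            #
            #     def blockwise_softmax_matmul(score_blocks, value_blocks):
            #         # score_blocks[i]: [..., 1, block_size]; value_blocks[i]: [..., block_size, head_size]
            #         exp_sum = sum(torch.exp(s).sum(dim=-1, keepdim=True) for s in score_blocks)
            #         out = None
            #         for s, v in zip(score_blocks, value_blocks):
            #             contrib = (torch.exp(s) / exp_sum) @ v  # this block's share of softmax(QK^T) @ V
            #             out = contrib if out is None else out + contrib
            #         return out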
output = ops.paged_attention_v1( query, diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index ccfd55806d5c6..57e65c04e4019 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -4,7 +4,11 @@ import torch import torch.nn as nn -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops try: from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 831576b1d7cd7..4e0a0ec51beb6 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -3,7 +3,11 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import QuantizationConfig diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 8fe96e7ddb98d..716c6f88d9e62 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -5,7 +5,11 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 1932bd145076b..c3d71e4309dbb 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -3,11 +3,14 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import QuantizationConfig -from vllm.utils import is_hip +from vllm.utils import is_hip, is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops class SqueezeLLMConfig(QuantizationConfig): diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 94aa842f71ff2..b2efae7d85540 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -27,7 +27,11 @@ import torch import torch.nn as nn -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops def _rotate_neox(x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/utils.py b/vllm/utils.py index b1af1c740e9b9..2b78f31946e24 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -6,8 +6,6 @@ import psutil import torch -from vllm._C import cuda_utils - class Device(enum.Enum): GPU = enum.auto() @@ -32,6 +30,16 @@ def is_hip() -> bool: return torch.version.hip is not None +def is_hpu() -> bool: + return getattr(torch, 'hpu', None) is not None and torch.hpu.is_available() + + +if is_hpu(): + from vllm.hpu import cuda_utils +else: + from vllm._C import cuda_utils + + def 
get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html @@ -59,7 +67,3 @@ def get_open_port(): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", 0)) return s.getsockname()[1] - - -def is_hpu() -> bool: - return getattr(torch, 'hpu', None) is not None and torch.hpu.is_available() \ No newline at end of file diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 0eaa3fa28f246..64680ac5f8fc5 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -3,11 +3,15 @@ import torch -from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger from vllm.utils import in_wsl, is_hpu +if is_hpu(): + from vllm.hpu import cache_ops +else: + from vllm._C import cache_ops + logger = init_logger(__name__) KVCache = Tuple[torch.Tensor, torch.Tensor] From c0d3c69ce125f4bb3ee5db31bff3dec96c170c2c Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Tue, 23 Jan 2024 16:00:37 +0200 Subject: [PATCH 20/43] Add xops bindings --- tests/kernels/test_attention.py | 3 +- vllm/hpu/xops.py | 103 ++++++++++++++++++++++++ vllm/model_executor/layers/attention.py | 3 +- 3 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 vllm/hpu/xops.py diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 8ff15aceac542..f2242d7d95e49 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -3,14 +3,15 @@ import pytest import torch -from xformers import ops as xops from vllm.utils import get_max_shared_memory_bytes, is_hpu if is_hpu(): from vllm.hpu import ops + from vllm.hpu import xops from vllm.hpu.attn_bias import BlockDiagonalCausalMask else: from vllm._C import ops + from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py new file mode 100644 index 0000000000000..691d30b0fba90 --- /dev/null +++ b/vllm/hpu/xops.py @@ -0,0 +1,103 @@ +############################################################################### +# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company +# All Rights Reserved. +# +# Unauthorized copying of this file or any element(s) within it, via any medium +# is strictly prohibited. +# This file contains Habana Labs, Ltd. proprietary and confidential information +# and is subject to the confidentiality and license agreements under which it +# was provided. 
+# +############################################################################### + + +import habana_frameworks.torch as htorch +import torch +import torch.nn.functional as F +from typing import List, Optional, Tuple, Union +from .attn_bias import AttentionBias + + +# # xops.memory_efficient_attention_forward +# def memory_efficient_attention_forward( +# query: torch.Tensor, +# key: torch.Tensor, +# value: torch.Tensor, +# attn_bias = None, +# p: float = 0.0, +# scale: Optional[float] = None +# ) -> torch.Tensor: +# # scale = 1 / query.shape[-1] ** 0.5 +# query = query * scale +# attn = query @ key.transpose(-2, -1) +# if attn_bias is not None: +# shape=(query.shape[0], query.shape[1], query.shape[-2], query.shape[-2]) +# attn_mask = torch.full(shape, dtype=query.dtype, fill_value=float("-inf"), device=query.device) +# attn_mask = torch.triu(attn_mask, diagonal=1).to(query.dtype) +# attn = attn + attn_mask +# attn = attn.softmax(-1) +# attn = torch.nn.functional.dropout(attn, p) +# return attn @ value + + +def block_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, + attn_mask: Optional[torch.Tensor] = None, +) -> torch.Tensor: + query = query * scale + attn = query.transpose(0,1) @ key.transpose(0, 1).transpose(1, 2) + if attn_mask is not None: + attn = attn + attn_mask.to(attn.dtype) + attn = attn.softmax(-1) + out = attn @ value.transpose(0, 1) + out = out.transpose(0, 1) + return out + + +def memory_efficient_attention_forward( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seq_lens: List[int], + attn_bias: Optional[torch.Tensor] = None, + p: float = 0.0, + scale: Optional[float] = None, +) -> torch.Tensor: + dim = query.dim() + if dim == 4: + query, key, value = query.squeeze(0), key.squeeze(0), value.squeeze(0) + num_seqs = len(cu_seq_lens) - 1 + outputs = [] + for i in range(num_seqs): + start_idx = cu_seq_lens[i] + end_idx = cu_seq_lens[i + 1] + seq_len = end_idx - start_idx + mask_start_idx = i * seq_len + mask_end_idx = (i + 1) * seq_len + + # # Create attention mask. 
+ # attn_mask = torch.ones(seq_len, seq_len, dtype=query.dtype) + # attn_mask[:seq_lens[i],:seq_lens[i]] = torch.triu( + # attn_mask[:seq_lens[i],:seq_lens[i]], + # diagonal=1 + # ) + # attn_mask = attn_mask * -10000.0 # torch.finfo(query.dtype).min + # attn_mask = attn_mask.to(dtype=query.dtype, device=query.device) + + attn_mask = attn_bias.materialize(device=query.device) + output = block_masked_attention( + query[start_idx:end_idx], + key[start_idx:end_idx], + value[start_idx:end_idx], + scale, + attn_mask=attn_mask[mask_start_idx:mask_end_idx, + mask_start_idx:mask_end_idx], # attn_mask=attn_mask, + ) + outputs.append(output) + out = torch.cat(outputs, dim=0) + if dim == 4: + out = out.unsqueeze(0) + return out diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index d7b49e8d5a52c..f0955671bdf82 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -3,7 +3,6 @@ import torch import torch.nn as nn -from xformers import ops as xops from vllm.model_executor.input_metadata import InputMetadata from vllm.utils import is_hip, is_hpu @@ -11,11 +10,13 @@ if is_hpu(): from vllm.hpu import ops from vllm.hpu import cache_ops + from vllm.hpu import xops from vllm.hpu.attn_bias import (BlockDiagonalCausalMask, LowerTriangularMaskWithTensorBias) else: from vllm._C import ops from vllm._C import cache_ops + from xformers import ops as xops from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, LowerTriangularMaskWithTensorBias) From 48b26d1912c75ea96f7e6559ed01c84c7f4cc863 Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Tue, 23 Jan 2024 17:20:16 +0200 Subject: [PATCH 21/43] Cast paged attention inputs to bfloat16 --- benchmarks/kernels/benchmark_paged_attention.py | 2 +- tests/kernels/conftest.py | 4 ++-- vllm/hpu/ops.py | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index f22acca3b7909..e47a5313c444c 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -4,7 +4,7 @@ import torch -from vllm.utils import is_hpu() +from vllm.utils import is_hpu if is_hpu(): from vllm.hpu import ops else: diff --git a/tests/kernels/conftest.py b/tests/kernels/conftest.py index 80b62e4e0ef7d..17af2f5c3868d 100644 --- a/tests/kernels/conftest.py +++ b/tests/kernels/conftest.py @@ -20,9 +20,9 @@ def create_kv_caches( scale = head_size**-0.5 x = 16 // torch.tensor([], dtype=dtype).element_size() if is_hpu(): - key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) - else: key_cache_shape = (num_blocks, num_heads, head_size, block_size) + else: + key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) key_caches = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 82556ebf78b20..30ee852f7d367 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -30,7 +30,10 @@ def gelu_new(output, input): def gelu_fast(output, input): raise NotImplementedError -def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, max_context_len, alibi_slopes, attn_masks=None) -> None: +def paged_attention_v1(query_in, key_cache_in, value_cache_in, head_mapping, scale, block_tables, context_lens, block_size, max_context_len, alibi_slopes, attn_masks=None) -> None: + query = 
query_in.bfloat16() + key_cache = key_cache_in.bfloat16() + value_cache = value_cache_in.bfloat16() num_kv_heads = value_cache[0].shape[0] head_size = value_cache[0].shape[1] block_size = value_cache[0].shape[2] @@ -93,7 +96,7 @@ def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block out = torch.matmul(attention_probs.to(value.dtype), value).reshape(num_seqs, num_query_heads, head_size) output.add_(out) htorch.core.mark_step() - return output + return output.to(dtype=query_in.dtype) def rms_norm(out, hidden_states, weight, eps): htorch.core.mark_step() From aefa573b0fcf0a8c1bdfab65e93c3489f9b2b40a Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Fri, 26 Jan 2024 12:08:30 +0000 Subject: [PATCH 22/43] Remove leftover debug calls --- vllm/hpu/ops.py | 2 -- vllm/model_executor/layers/rotary_embedding.py | 6 ------ vllm/worker/worker.py | 2 -- 3 files changed, 10 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 30ee852f7d367..cfbe3c4ab8eac 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -42,8 +42,6 @@ def paged_attention_v1(query_in, key_cache_in, value_cache_in, head_mapping, sca max_num_blocks_per_seq = block_tables.shape[1] if alibi_slopes or num_query_heads != num_kv_heads: #or attn_masks is None: - import pdb - pdb.set_trace() raise NotImplementedError attn_weights_blocks = [] diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index b2efae7d85540..dd64434f64f20 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -126,8 +126,6 @@ def __init__(self, head_size, rotary_dim, max_position_embeddings=2048, base=100 ) def _set_cos_sin_cache(self, seq_len, device, dtype): - #import pdb - #pdb.set_trace() self.max_seq_len_cached = seq_len t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) @@ -138,8 +136,6 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): - #import pdb - #pdb.set_trace() seq_len = key.shape[-2] if seq_len > self.max_seq_len_cached: self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) @@ -148,7 +144,6 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tenso query = query.reshape((query.shape[0], query.shape[1], query.shape[2] // self.head_size, self.head_size)) key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) if query.device.type == "hpu" and FusedRoPE: - #print('using FusedRoPE') if len(positions[0]) == 1: cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) @@ -157,7 +152,6 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tenso sin = sin[positions].unsqueeze(2) query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) else: - #print('using torch RoPE') query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 22c53005aa7f5..49600689f4c6a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -324,8 +324,6 @@ def round_up(n, multiple): 
elif (i + 1) * self.block_size <= context_lens[seq_id]: attn_masks[i][seq_id, :] = 1 input_metadata.attention_masks = attn_masks.to(device="cuda") - # import pdb - # pdb.set_trace() print("input token shape: ", tokens_tensor.shape) return tokens_tensor, positions_tensor, input_metadata From c49b68e3fed7410b672492cd9bb740cc067ce634 Mon Sep 17 00:00:00 2001 From: Mikhail Dvoretckii Date: Fri, 26 Jan 2024 12:23:09 +0000 Subject: [PATCH 23/43] Update comments on HPU ops --- vllm/hpu/ops.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index cfbe3c4ab8eac..f3dd9c2c575a3 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -49,7 +49,8 @@ def paged_attention_v1(query_in, key_cache_in, value_cache_in, head_mapping, sca seq_index = torch.tensor([0], dtype=torch.int64, device="hpu") for i in range(0, max_num_blocks_per_seq): - # hard override for filler. These blocks would contribute nothing to the output due to zero attention_probs and will clog up compute resources + # FIXME: dynamic hard override for filler. These blocks would contribute nothing to the output due to zero attention_probs and + # will clog up compute resources. The override itself makes the code unsuitable for graph precompilation if (i - 2) * block_size > torch.max(context_lens): break attn_weights = torch.full((num_seqs, num_query_heads, 1, block_size), torch.finfo(query.dtype).min, dtype=query.dtype, device="hpu") @@ -66,15 +67,16 @@ def paged_attention_v1(query_in, key_cache_in, value_cache_in, head_mapping, sca attn_mask = torch.index_select(attn_masks[i], 0, seq_index) attn_weight = torch.masked_fill(attn_weight, ~(attn_mask.unsqueeze(0).to(torch.bool)), torch.finfo(attn_weight.dtype).min) + # FIXME: these dynamic checks serve to ensure the -inf default value is not overwritten with fillers that would cause errors + # in logsoftmax computation. A change to custom block multiplication code is required to avoid incurring extra costs here if context_lens[seq_id] < (i + 1) * block_size: if context_lens[seq_id] - i*block_size < 0: attn_weight = torch.finfo(query.dtype).min else: attn_weight[:, :, context_lens[seq_id] - i*block_size:] = torch.finfo(query.dtype).min attn_weights.index_copy_(0, seq_index, attn_weight.unsqueeze(0)) - #attn_weights[attn_weights == 0.0] = torch.finfo(query.dtype).min - #if (i - 2) * block_size < max_context_len: value = torch.index_select(value_cache, 0, block_tables[seq_id][i]) + # FIXME: these checks concern filler values in the V cache and should be removed once the underlying issue is addressed value = torch.nan_to_num(value) value[value < -1.0e+30] = 0.0 values.index_copy_(0, seq_index, value) @@ -132,6 +134,8 @@ def apply_rope( def rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox_style): + # FIXME: the below code is unused legacy code not meant to be used. 
Use FusedRoPE + # on HPU and delete this once coverage is verified raise NotImplementedError # update query and key in-place num_tokens = query.shape[0] @@ -167,7 +171,6 @@ def rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox_st # Output query/key shape: [num_tokens, num_tokens, head_size] return query, key - #raise NotImplementedError def awq_gemm(*args): raise NotImplementedError From c5c2a9967e163c8235e65483883a7bd0fca6b5c6 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 2 Feb 2024 16:52:44 +0200 Subject: [PATCH 24/43] Restoring NVIDIA compatibility in setup.py --- setup.py | 457 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 231 insertions(+), 226 deletions(-) diff --git a/setup.py b/setup.py index 17cb874a0955b..33f9627b94f1b 100644 --- a/setup.py +++ b/setup.py @@ -17,225 +17,227 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"} -# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) - +#SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) def _is_hip() -> bool: return torch.version.hip is not None def _is_cuda() -> bool: - return torch.version.cuda is not None - - -# # Compiler flags. -# CXX_FLAGS = ["-g", "-O2", "-std=c++17"] -# # TODO(woosuk): Should we use -O3? -# NVCC_FLAGS = ["-O2", "-std=c++17"] - -# if _is_hip(): -# if ROCM_HOME is None: -# raise RuntimeError( -# "Cannot find ROCM_HOME. ROCm must be available to build the package." -# ) -# NVCC_FLAGS += ["-DUSE_ROCM"] - -# if _is_cuda() and CUDA_HOME is None: -# raise RuntimeError( -# "Cannot find CUDA_HOME. CUDA must be available to build the package.") - -# ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 -# CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -# NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] - - -# def get_amdgpu_offload_arch(): -# command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" -# try: -# output = subprocess.check_output([command]) -# return output.decode('utf-8').strip() -# except subprocess.CalledProcessError as e: -# error_message = f"Error: {e}" -# raise RuntimeError(error_message) from e -# except FileNotFoundError as e: -# # If the command is not found, print an error message -# error_message = f"The command {command} was not found." -# raise RuntimeError(error_message) from e - -# return None - - -# def get_hipcc_rocm_version(): -# # Run the hipcc --version command -# result = subprocess.run(['hipcc', '--version'], -# stdout=subprocess.PIPE, -# stderr=subprocess.STDOUT, -# text=True) - -# # Check if the command was executed successfully -# if result.returncode != 0: -# print("Error running 'hipcc --version'") -# return None - -# # Extract the version using a regular expression -# match = re.search(r'HIP version: (\S+)', result.stdout) -# if match: -# # Return the version string -# return match.group(1) -# else: -# print("Could not find HIP version in the output") -# return None - - -# def get_nvcc_cuda_version(cuda_dir: str) -> Version: -# """Get the CUDA version from nvcc. 
- -# Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py -# """ -# nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], -# universal_newlines=True) -# output = nvcc_output.split() -# release_idx = output.index("release") + 1 -# nvcc_cuda_version = parse(output[release_idx].split(",")[0]) -# return nvcc_cuda_version - - -# def get_torch_arch_list() -> Set[str]: -# # TORCH_CUDA_ARCH_LIST can have one or more architectures, -# # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the -# # compiler to additionally include PTX code that can be runtime-compiled -# # and executed on the 8.6 or newer architectures. While the PTX code will -# # not give the best performance on the newer architectures, it provides -# # forward compatibility. -# env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) -# if env_arch_list is None: -# return set() - -# # List are separated by ; or space. -# torch_arch_list = set(env_arch_list.replace(" ", ";").split(";")) -# if not torch_arch_list: -# return set() - -# # Filter out the invalid architectures and print a warning. -# valid_archs = NVIDIA_SUPPORTED_ARCHS.union( -# {s + "+PTX" -# for s in NVIDIA_SUPPORTED_ARCHS}) -# arch_list = torch_arch_list.intersection(valid_archs) -# # If none of the specified architectures are valid, raise an error. -# if not arch_list: -# raise RuntimeError( -# "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " -# f"variable ({env_arch_list}) is supported. " -# f"Supported CUDA/ROCM architectures are: {valid_archs}.") -# invalid_arch_list = torch_arch_list - valid_archs -# if invalid_arch_list: -# warnings.warn( -# f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " -# "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " -# f"({env_arch_list}). Supported CUDA/ROCM architectures are: " -# f"{valid_archs}.", -# stacklevel=2) -# return arch_list - - -# # First, check the TORCH_CUDA_ARCH_LIST environment variable. -# compute_capabilities = get_torch_arch_list() -# if _is_cuda() and not compute_capabilities: -# # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available -# # GPUs on the current machine. -# device_count = torch.cuda.device_count() -# for i in range(device_count): -# major, minor = torch.cuda.get_device_capability(i) -# if major < 7: -# raise RuntimeError( -# "GPUs with compute capability below 7.0 are not supported.") -# compute_capabilities.add(f"{major}.{minor}") - -# if _is_cuda(): -# nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) -# if not compute_capabilities: -# # If no GPU is specified nor available, add all supported architectures -# # based on the NVCC CUDA version. -# compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() -# if nvcc_cuda_version < Version("11.1"): -# compute_capabilities.remove("8.6") -# if nvcc_cuda_version < Version("11.8"): -# compute_capabilities.remove("8.9") -# compute_capabilities.remove("9.0") -# # Validate the NVCC CUDA version. -# if nvcc_cuda_version < Version("11.0"): -# raise RuntimeError( -# "CUDA 11.0 or higher is required to build the package.") -# if (nvcc_cuda_version < Version("11.1") -# and any(cc.startswith("8.6") for cc in compute_capabilities)): -# raise RuntimeError( -# "CUDA 11.1 or higher is required for compute capability 8.6.") -# if nvcc_cuda_version < Version("11.8"): -# if any(cc.startswith("8.9") for cc in compute_capabilities): -# # CUDA 11.8 is required to generate the code targeting compute capability 8.9. 
-# # However, GPUs with compute capability 8.9 can also run the code generated by -# # the previous versions of CUDA 11 and targeting compute capability 8.0. -# # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0 -# # instead of 8.9. -# warnings.warn( -# "CUDA 11.8 or higher is required for compute capability 8.9. " -# "Targeting compute capability 8.0 instead.", -# stacklevel=2) -# compute_capabilities = set(cc for cc in compute_capabilities -# if not cc.startswith("8.9")) -# compute_capabilities.add("8.0+PTX") -# if any(cc.startswith("9.0") for cc in compute_capabilities): -# raise RuntimeError( -# "CUDA 11.8 or higher is required for compute capability 9.0.") - -# # Add target compute capabilities to NVCC flags. -# for capability in compute_capabilities: -# num = capability[0] + capability[2] -# NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] -# if capability.endswith("+PTX"): -# NVCC_FLAGS += [ -# "-gencode", f"arch=compute_{num},code=compute_{num}" -# ] - -# # Use NVCC threads to parallelize the build. -# if nvcc_cuda_version >= Version("11.2"): -# nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) -# num_threads = min(os.cpu_count(), nvcc_threads) -# NVCC_FLAGS += ["--threads", str(num_threads)] - -# elif _is_hip(): -# amd_arch = get_amdgpu_offload_arch() -# if amd_arch not in ROCM_SUPPORTED_ARCHS: -# raise RuntimeError( -# f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" -# f"amdgpu_arch_found: {amd_arch}") - -# # ext_modules = [] - -# vllm_extension_sources = [ -# "csrc/cache_kernels.cu", -# "csrc/attention/attention_kernels.cu", -# "csrc/pos_encoding_kernels.cu", -# "csrc/activation_kernels.cu", -# "csrc/layernorm_kernels.cu", -# "csrc/quantization/squeezellm/quant_cuda_kernel.cu", -# "csrc/quantization/gptq/q_gemm.cu", -# "csrc/cuda_utils_kernels.cu", -# "csrc/pybind.cpp", -# ] -# -# if _is_cuda(): -# vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") - -# vllm_extension = CUDAExtension( -# name="vllm._C", -# sources=vllm_extension_sources, -# extra_compile_args={ -# "cxx": CXX_FLAGS, -# "nvcc": NVCC_FLAGS, -# }, -# ) -# ext_modules.append(vllm_extension) + return torch.version.cuda is not None and torch.cuda.is_available() + + +# Compiler flags. +CXX_FLAGS = [] +# TODO(woosuk): Should we use -O3? +NVCC_FLAGS = [] + +if _is_cuda() or _is_hip(): + CXX_FLAGS = ["-g", "-O2", "-std=c++17"] + NVCC_FLAGS = ["-O2", "-std=c++17"] + + ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 + CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] + NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] + +if _is_hip(): + if ROCM_HOME is None: + raise RuntimeError( + "Cannot find ROCM_HOME. ROCm must be available to build the package." + ) + NVCC_FLAGS += ["-DUSE_ROCM"] + +if _is_cuda() and CUDA_HOME is None: + raise RuntimeError( + "Cannot find CUDA_HOME. CUDA must be available to build the package.") + +def get_amdgpu_offload_arch(): + command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" + try: + output = subprocess.check_output([command]) + return output.decode('utf-8').strip() + except subprocess.CalledProcessError as e: + error_message = f"Error: {e}" + raise RuntimeError(error_message) from e + except FileNotFoundError as e: + # If the command is not found, print an error message + error_message = f"The command {command} was not found." 
+ raise RuntimeError(error_message) from e + + return None + + +def get_hipcc_rocm_version(): + # Run the hipcc --version command + result = subprocess.run(['hipcc', '--version'], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True) + + # Check if the command was executed successfully + if result.returncode != 0: + print("Error running 'hipcc --version'") + return None + + # Extract the version using a regular expression + match = re.search(r'HIP version: (\S+)', result.stdout) + if match: + # Return the version string + return match.group(1) + else: + print("Could not find HIP version in the output") + return None + + +def get_nvcc_cuda_version(cuda_dir: str) -> Version: + """Get the CUDA version from nvcc. + + Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py + """ + nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], + universal_newlines=True) + output = nvcc_output.split() + release_idx = output.index("release") + 1 + nvcc_cuda_version = parse(output[release_idx].split(",")[0]) + return nvcc_cuda_version + + +def get_torch_arch_list() -> Set[str]: + if _is_cuda(): + # TORCH_CUDA_ARCH_LIST can have one or more architectures, + # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the + # compiler to additionally include PTX code that can be runtime-compiled + # and executed on the 8.6 or newer architectures. While the PTX code will + # not give the best performance on the newer architectures, it provides + # forward compatibility. + env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) + if env_arch_list is None: + return set() + + # List are separated by ; or space. + torch_arch_list = set(env_arch_list.replace(" ", ";").split(";")) + if not torch_arch_list: + return set() + + # Filter out the invalid architectures and print a warning. + valid_archs = NVIDIA_SUPPORTED_ARCHS.union( + {s + "+PTX" + for s in NVIDIA_SUPPORTED_ARCHS}) + arch_list = torch_arch_list.intersection(valid_archs) + # If none of the specified architectures are valid, raise an error. + if not arch_list: + raise RuntimeError( + "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " + f"variable ({env_arch_list}) is supported. " + f"Supported CUDA/ROCM architectures are: {valid_archs}.") + invalid_arch_list = torch_arch_list - valid_archs + if invalid_arch_list: + warnings.warn( + f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " + "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " + f"({env_arch_list}). Supported CUDA/ROCM architectures are: " + f"{valid_archs}.", + stacklevel=2) + return arch_list + +# First, check the TORCH_CUDA_ARCH_LIST environment variable. +compute_capabilities = get_torch_arch_list() +if _is_cuda() and not compute_capabilities: + # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available + # GPUs on the current machine. + device_count = torch.cuda.device_count() + for i in range(device_count): + major, minor = torch.cuda.get_device_capability(i) + if major < 7: + raise RuntimeError( + "GPUs with compute capability below 7.0 are not supported.") + compute_capabilities.add(f"{major}.{minor}") + +if _is_cuda(): + nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) + if not compute_capabilities: + # If no GPU is specified nor available, add all supported architectures + # based on the NVCC CUDA version. 
+ compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() + if nvcc_cuda_version < Version("11.1"): + compute_capabilities.remove("8.6") + if nvcc_cuda_version < Version("11.8"): + compute_capabilities.remove("8.9") + compute_capabilities.remove("9.0") + # Validate the NVCC CUDA version. + if nvcc_cuda_version < Version("11.0"): + raise RuntimeError( + "CUDA 11.0 or higher is required to build the package.") + if (nvcc_cuda_version < Version("11.1") + and any(cc.startswith("8.6") for cc in compute_capabilities)): + raise RuntimeError( + "CUDA 11.1 or higher is required for compute capability 8.6.") + if nvcc_cuda_version < Version("11.8"): + if any(cc.startswith("8.9") for cc in compute_capabilities): + # CUDA 11.8 is required to generate the code targeting compute capability 8.9. + # However, GPUs with compute capability 8.9 can also run the code generated by + # the previous versions of CUDA 11 and targeting compute capability 8.0. + # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0 + # instead of 8.9. + warnings.warn( + "CUDA 11.8 or higher is required for compute capability 8.9. " + "Targeting compute capability 8.0 instead.", + stacklevel=2) + compute_capabilities = set(cc for cc in compute_capabilities + if not cc.startswith("8.9")) + compute_capabilities.add("8.0+PTX") + if any(cc.startswith("9.0") for cc in compute_capabilities): + raise RuntimeError( + "CUDA 11.8 or higher is required for compute capability 9.0.") + + # Add target compute capabilities to NVCC flags. + for capability in compute_capabilities: + num = capability[0] + capability[2] + NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] + if capability.endswith("+PTX"): + NVCC_FLAGS += [ + "-gencode", f"arch=compute_{num},code=compute_{num}" + ] + + # Use NVCC threads to parallelize the build. 
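    # Worked example for the -gencode loop a few lines above (illustrative capability, not
    # taken from this patch): for capability "8.6+PTX", num = "8" + "6" = "86", so NVCC_FLAGS
    # gains
    #     -gencode arch=compute_86,code=sm_86
    # and, because of the "+PTX" suffix, additionally
    #     -gencode arch=compute_86,code=compute_86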
+ if nvcc_cuda_version >= Version("11.2"): + nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) + num_threads = min(os.cpu_count(), nvcc_threads) + NVCC_FLAGS += ["--threads", str(num_threads)] + +elif _is_hip(): + amd_arch = get_amdgpu_offload_arch() + if amd_arch not in ROCM_SUPPORTED_ARCHS: + raise RuntimeError( + f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" + f"amdgpu_arch_found: {amd_arch}") + +ext_modules = [] + +vllm_extension_sources = [ + "csrc/cache_kernels.cu", + "csrc/attention/attention_kernels.cu", + "csrc/pos_encoding_kernels.cu", + "csrc/activation_kernels.cu", + "csrc/layernorm_kernels.cu", + "csrc/quantization/squeezellm/quant_cuda_kernel.cu", + "csrc/quantization/gptq/q_gemm.cu", + "csrc/cuda_utils_kernels.cu", + "csrc/pybind.cpp", +] + +if _is_cuda(): + vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") + + vllm_extension = CUDAExtension( + name="vllm._C", + sources=vllm_extension_sources, + extra_compile_args={ + "cxx": CXX_FLAGS, + "nvcc": NVCC_FLAGS, + }, + ) + ext_modules.append(vllm_extension) def get_path(*filepath) -> str: @@ -258,17 +260,17 @@ def find_version(filepath: str) -> str: def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) - # if _is_hip(): - # # Get the HIP version - # hipcc_version = get_hipcc_rocm_version() - # if hipcc_version != MAIN_CUDA_VERSION: - # rocm_version_str = hipcc_version.replace(".", "")[:3] - # version += f"+rocm{rocm_version_str}" - # else: - # cuda_version = str(nvcc_cuda_version) - # if cuda_version != MAIN_CUDA_VERSION: - # cuda_version_str = cuda_version.replace(".", "")[:3] - # version += f"+cu{cuda_version_str}" + if _is_hip(): + # Get the HIP version + hipcc_version = get_hipcc_rocm_version() + if hipcc_version != MAIN_CUDA_VERSION: + rocm_version_str = hipcc_version.replace(".", "")[:3] + version += f"+rocm{rocm_version_str}" + elif _is_cuda(): + cuda_version = str(nvcc_cuda_version) + if cuda_version != MAIN_CUDA_VERSION: + cuda_version_str = cuda_version.replace(".", "")[:3] + version += f"+cu{cuda_version_str}" return version @@ -283,6 +285,9 @@ def get_requirements() -> List[str]: if _is_hip(): with open(get_path("requirements-rocm.txt")) as f: requirements = f.read().strip().split("\n") + elif _is_cuda(): + with open(get_path("requirements-cuda.txt")) as f: + requirements = f.read().strip().split("\n") else: with open(get_path("requirements.txt")) as f: requirements = f.read().strip().split("\n") @@ -315,6 +320,6 @@ def get_requirements() -> List[str]: "examples", "tests")), python_requires=">=3.8", install_requires=get_requirements(), - # ext_modules=ext_modules, - # cmdclass={"build_ext": BuildExtension}, + ext_modules=ext_modules, + cmdclass={"build_ext": BuildExtension} if _is_cuda() or _is_hip() else {}, ) From 1c669080dab477a9f71015db74136aa31391822f Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 5 Feb 2024 13:46:10 +0200 Subject: [PATCH 25/43] vllm.hpu cleanup --- vllm/hpu/cache_ops.py | 115 +----------------- vllm/hpu/ops.py | 34 ------ vllm/hpu/xops.py | 34 +----- .../model_executor/layers/rotary_embedding.py | 2 +- 4 files changed, 6 insertions(+), 179 deletions(-) diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 7de5ac6f84093..5c678587c6ff9 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -26,130 +26,21 @@ def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, is_promp num_tokens = key.shape[0] block_size = key_cache.shape[-1] slot_mapping = slot_mapping.to(key.device) - # 
block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") if is_prompt: - # indices = torch.tensor([i for i in range(0, block_size)], device=key.device) for i in range(0, num_tokens, block_size): - # if block_idx_list[i] < 0: - # # indices.add_(block_size) - # continue key_cache.index_put_([block_indices[i]], key[i:i+block_size].transpose(0,1).transpose(1,2)) value_cache.index_put_([block_indices[i]], value[i:i+block_size].transpose(0,1).transpose(1,2)) - # key_cache.index_put_([block_indices[i]], key.index_select(0, indices).transpose(0,1).transpose(1,2)) - # value_cache.index_put_([block_indices[i]], value.index_select(0, indices).transpose(0,1).transpose(1,2)) - # indices.add_(block_size) else: - # print(key_cache.data_ptr(), key_cache.shape) - # print(key_cache[2, :, :, 2]) key_cache = key_cache.permute(0, 3, 1, 2) value_cache = value_cache.permute(0, 3, 1, 2) - # print(key_cache.data_ptr(), key_cache.shape) - # print(key_cache[2, 2, :, :]) block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") block_offsets = torch.fmod(slot_mapping, block_size) slot_indices = torch.stack([block_indices, block_offsets], dim=-1) index = torch.tensor(0, device=key.device) for i in range(num_tokens): - key_cache[slot_indices[i][0], slot_indices[i][1], :, :] = key[i] # key.index_select(0, index) - value_cache[slot_indices[i][0], slot_indices[i][1], :, :] = value[i] # value.index_select(0, index) - # key_cache.index_put_([slot_indices[i]], key[i]) - # value_cache.index_put_([slot_indices[i]], value[i]) - # key_cache.index_put_([slot_indices[i]], key.index_select(0, index)) - # value_cache.index_put_([slot_indices[i]], value.index_select(0, index)) + key_cache[slot_indices[i][0], slot_indices[i][1], :, :] = key[i] + value_cache[slot_indices[i][0], slot_indices[i][1], :, :] = value[i] index.add_(1) - # print(key_cache.data_ptr(), key_cache.shape) key_cache = key_cache.permute(0, 2, 3, 1) - value_cache = value_cache.permute(0, 2, 3, 1) - # print(key_cache.data_ptr(), key_cache.shape) - - - -''' -def create_cache_view( - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_idx: int, -) -> Tuple[torch.Tensor, torch.Tensor]: - _, num_heads, head_size, block_size = key_cache.shape - cache_stride = key_cache.stride() - cache_offset = key_cache.storage_offset() - block_shape = (1, num_heads, head_size, block_size) - block_offset = block_idx * (cache_stride[-1] * cache_stride[-2] * cache_stride[-3]) - key_block = torch.as_strided(key_cache, - block_shape, - cache_stride, - cache_offset+block_offset).squeeze(0) - value_block = torch.as_strided(value_cache, - block_shape, - cache_stride, - cache_offset+block_offset).squeeze(0) - return key_block, value_block - - -def reshape_and_cache_backup1(key, value, key_cache, value_cache, slot_mapping, is_prompt=False): - """ - key: [num_tokens, num_heads, head_size] - value: [num_tokens, num_heads, head_size] - key_cache: [num_heads, head_size, block_size] * num_blocks - value_cache: [num_heads, head_size, block_size] * num_blocks - slot_mapping: [num_tokens] - """ - block_size = key_cache[0].shape[2] - block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] - block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - if is_prompt: - indices = torch.tensor([i for i in range(0, block_size)], device=key.device) - for i in range(0, 
len(block_idx_list), block_size): # for i in range(0, block_indices.shape[0], block_size): - if block_idx_list[i] < 0: - continue - block_idx_tensor = block_indices.index_select(0, torch.tensor(i, device=key.device)) - key_cache.index_put_([block_idx_tensor], key.index_select(0, indices).transpose(0,1).transpose(1,2)) - value_cache.index_put_([block_idx_tensor], value.index_select(0, indices).transpose(0,1).transpose(1,2)) - indices.add_(block_size) - else: - block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] - block_offset_list = [slot_idx % block_size for slot_idx in slot_mapping.tolist()] - index = torch.tensor(0, device=key.device) - for block_idx, block_offset in zip(block_idx_list, block_offset_list): - key_block, value_block = create_cache_view(key_cache, value_cache, block_idx) - slot_idx = torch.tensor(block_offset, device=key.device) - key_block.index_copy_(-1, slot_idx, key.index_select(0, index).transpose(0,1).transpose(1,2)) - value_block.index_copy_(-1, slot_idx, value.index_select(0, index).transpose(0,1).transpose(1,2)) - index.add_(1) - - -def reshape_and_cache_backup2(key, value, key_cache, value_cache, slot_mapping, is_prompt=False): - """ - key: [num_tokens, num_heads, head_size] - value: [num_tokens, num_heads, head_size] - key_cache: [num_heads, head_size, block_size] * num_blocks - value_cache: [num_heads, head_size, block_size] * num_blocks - slot_mapping: [num_tokens] - """ - block_size = key_cache[0].shape[2] - block_idx_list = [int(slot_idx / block_size) if slot_idx > 0 else slot_idx for slot_idx in slot_mapping.tolist()] - if is_prompt: - cached_set = set() - indices = torch.tensor([i for i in range(0, block_size)], device=key.device) - for block_idx in block_idx_list: - if block_idx in cached_set or block_idx < 0: - continue - else: - cached_set.add(block_idx) - key_block, value_block = create_cache_view(key_cache, value_cache, block_idx) - key_block.copy_(key.index_select(0, indices).transpose(0,1).transpose(1,2)) - value_block.copy_(value.index_select(0, indices).transpose(0,1).transpose(1,2)) - indices.add_(block_size) - else: - block_offset_list = [slot_idx % block_size for slot_idx in slot_mapping.tolist()] - index = torch.tensor(0, device=key.device) - # slot_idx = torch.tensor(0, device=key.device) - for block_idx, block_offset in zip(block_idx_list, block_offset_list): - key_block, value_block = create_cache_view(key_cache, value_cache, block_idx) - # slot_idx.copy_(block_offset) - slot_idx = torch.tensor(block_offset, device=key.device) - key_block.index_copy_(-1, slot_idx, key.index_select(0, index).transpose(0,1).transpose(1,2)) - value_block.index_copy_(-1, slot_idx, value.index_select(0, index).transpose(0,1).transpose(1,2)) - index.add_(1) -''' \ No newline at end of file + value_cache = value_cache.permute(0, 2, 3, 1) \ No newline at end of file diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index f3dd9c2c575a3..0454814091562 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -137,40 +137,6 @@ def rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox_st # FIXME: the below code is unused legacy code not meant to be used. 
Use FusedRoPE # on HPU and delete this once coverage is verified raise NotImplementedError - # update query and key in-place - num_tokens = query.shape[0] - num_heads = query.shape[-1] // head_size - query = query.view(num_tokens, num_heads, head_size) - key = key.view(num_tokens, num_heads, head_size) - cos, sin = torch.split(cos_sin_cache, cos_sin_cache.shape[-1] // 2, dim=-1) - if is_neox_style: - sin = torch.cat((sin, sin), dim=-1) - cos = torch.cat((cos, cos), dim=-1) - else: - sin = torch.repeat_interleave(sin, 2, -1) - cos = torch.repeat_interleave(cos, 2, -1) - - query_rot = query[..., :head_size] - query_pass = query[..., head_size:] - key_rot = key[..., :head_size] - key_pass = key[..., head_size:] - - query_rot = query_rot.transpose(0, 1) - key_rot = key_rot.transpose(0, 1) - cos = F.embedding(positions, cos) - sin = F.embedding(positions, sin) - - query_rot, key_rot = apply_rope(query_rot, key_rot, cos, sin, - is_neox_style) - query_rot = query_rot.transpose(0, 1).contiguous() - key_rot = key_rot.transpose(0, 1).contiguous() - - query.copy_(torch.cat((query_rot, query_pass), dim=-1)) - key.copy_(torch.cat((key_rot, key_pass), dim=-1)) - htorch.core.mark_step() - - # Output query/key shape: [num_tokens, num_tokens, head_size] - return query, key def awq_gemm(*args): raise NotImplementedError diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py index 691d30b0fba90..7309a1f0c1fbd 100644 --- a/vllm/hpu/xops.py +++ b/vllm/hpu/xops.py @@ -18,28 +18,6 @@ from .attn_bias import AttentionBias -# # xops.memory_efficient_attention_forward -# def memory_efficient_attention_forward( -# query: torch.Tensor, -# key: torch.Tensor, -# value: torch.Tensor, -# attn_bias = None, -# p: float = 0.0, -# scale: Optional[float] = None -# ) -> torch.Tensor: -# # scale = 1 / query.shape[-1] ** 0.5 -# query = query * scale -# attn = query @ key.transpose(-2, -1) -# if attn_bias is not None: -# shape=(query.shape[0], query.shape[1], query.shape[-2], query.shape[-2]) -# attn_mask = torch.full(shape, dtype=query.dtype, fill_value=float("-inf"), device=query.device) -# attn_mask = torch.triu(attn_mask, diagonal=1).to(query.dtype) -# attn = attn + attn_mask -# attn = attn.softmax(-1) -# attn = torch.nn.functional.dropout(attn, p) -# return attn @ value - - def block_masked_attention( query: torch.Tensor, key: torch.Tensor, @@ -78,15 +56,7 @@ def memory_efficient_attention_forward( mask_start_idx = i * seq_len mask_end_idx = (i + 1) * seq_len - # # Create attention mask. - # attn_mask = torch.ones(seq_len, seq_len, dtype=query.dtype) - # attn_mask[:seq_lens[i],:seq_lens[i]] = torch.triu( - # attn_mask[:seq_lens[i],:seq_lens[i]], - # diagonal=1 - # ) - # attn_mask = attn_mask * -10000.0 # torch.finfo(query.dtype).min - # attn_mask = attn_mask.to(dtype=query.dtype, device=query.device) - + # Create attention mask. 
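            # Sketch of what the bias materializes to (hedged; the exact layout comes from
            # vllm/hpu/attn_bias.py): for the common block-diagonal causal case the materialized
            # mask is additive, 0 where a token may attend and -inf above the causal diagonal or
            # across sequences, so the per-sequence slice taken below behaves like:
            #
            #     causal = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
            #     attn_mask_i = torch.zeros(seq_len, seq_len).masked_fill(causal, float("-inf"))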
attn_mask = attn_bias.materialize(device=query.device) output = block_masked_attention( query[start_idx:end_idx], @@ -94,7 +64,7 @@ def memory_efficient_attention_forward( value[start_idx:end_idx], scale, attn_mask=attn_mask[mask_start_idx:mask_end_idx, - mask_start_idx:mask_end_idx], # attn_mask=attn_mask, + mask_start_idx:mask_end_idx], ) outputs.append(output) out = torch.cat(outputs, dim=0) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index dd64434f64f20..19d52ba0fcaff 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -51,7 +51,7 @@ def get_device_name(): """ Returns the name of the current device: Gaudi or Gaudi2. - Inspired from: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 + Inspired by: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 """ import habana_frameworks.torch.utils.experimental as htexp From 5725b31ae187f26ed89956075a54160e016eaf39 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Wed, 7 Feb 2024 12:03:53 +0200 Subject: [PATCH 26/43] Added HPU-specific requirements --- requirements-hpu.txt | 14 ++++++++++++++ requirements.txt | 4 ++-- setup.py | 4 ++-- 3 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 requirements-hpu.txt diff --git a/requirements-hpu.txt b/requirements-hpu.txt new file mode 100644 index 0000000000000..73a64a94391f0 --- /dev/null +++ b/requirements-hpu.txt @@ -0,0 +1,14 @@ +ninja # For faster builds. +psutil +ray >= 2.5.1 +pandas # Required for Ray data. +pyarrow # Required for Ray data. +sentencepiece # Required for LLaMA tokenizer. +numpy +#torch == 2.1.2 +transformers >= 4.36.0 # Required for Mixtral. +#xformers == 0.0.23.post1 # Required for CUDA 12.1. +fastapi +uvicorn[standard] +pydantic == 1.10.13 # Required for OpenAI server. +aioprometheus[starlette] diff --git a/requirements.txt b/requirements.txt index 73a64a94391f0..92ba0a716c45c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,9 +5,9 @@ pandas # Required for Ray data. pyarrow # Required for Ray data. sentencepiece # Required for LLaMA tokenizer. numpy -#torch == 2.1.2 +torch == 2.1.2 transformers >= 4.36.0 # Required for Mixtral. -#xformers == 0.0.23.post1 # Required for CUDA 12.1. +xformers == 0.0.23.post1 # Required for CUDA 12.1. fastapi uvicorn[standard] pydantic == 1.10.13 # Required for OpenAI server. 
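The setup.py hunk below consumes the new requirements-hpu.txt: when neither CUDA nor ROCm is
detected at build time, the HPU requirements file is used. A condensed sketch of that selection
(hypothetical helper name; the real change lives inside get_requirements() and reuses the
_is_cuda()/_is_hip() helpers defined in setup.py):

    def _pick_requirements_file() -> str:
        if _is_hip():
            return "requirements-rocm.txt"
        if not _is_cuda() and not _is_hip():
            # no CUDA and no ROCm -> assume a Habana/HPU build
            return "requirements-hpu.txt"
        return "requirements.txt"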
diff --git a/setup.py b/setup.py index 33f9627b94f1b..f4e8f1b9545c1 100644 --- a/setup.py +++ b/setup.py @@ -285,8 +285,8 @@ def get_requirements() -> List[str]: if _is_hip(): with open(get_path("requirements-rocm.txt")) as f: requirements = f.read().strip().split("\n") - elif _is_cuda(): - with open(get_path("requirements-cuda.txt")) as f: + elif not _is_cuda() and not _is_hip(): + with open(get_path("requirements-hpu.txt")) as f: requirements = f.read().strip().split("\n") else: with open(get_path("requirements.txt")) as f: From 97d31b0d8829626f4651954d6dc2b8146188ff94 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Wed, 7 Feb 2024 12:18:56 +0200 Subject: [PATCH 27/43] Restored full functionality on NVIDIA --- vllm/entrypoints/api_server.py | 5 +- vllm/entrypoints/openai/api_server.py | 5 +- vllm/hpu/rotary_embed.py | 110 ++++++++++++++++ vllm/model_executor/layers/attention.py | 123 +++++++++++------- .../model_executor/layers/rotary_embedding.py | 118 +---------------- 5 files changed, 199 insertions(+), 162 deletions(-) create mode 100644 vllm/hpu/rotary_embed.py diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index b120210831fe5..74d18efe3c7f8 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -2,8 +2,9 @@ import json from typing import AsyncGenerator import torch -import habana_frameworks.torch.core as htcore -import habana_frameworks.torch.gpu_migration +if torch.version.cuda is None and torch.version.hip is None: + import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.gpu_migration from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse import uvicorn diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bb5b921123460..1d7272fb8b05e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -9,8 +9,9 @@ from http import HTTPStatus from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union import torch -import habana_frameworks.torch.core as htcore -import habana_frameworks.torch.gpu_migration +if torch.version.cuda is None and torch.version.hip is None: + import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.gpu_migration from aioprometheus import MetricsMiddleware from aioprometheus.asgi.starlette import metrics import fastapi diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py new file mode 100644 index 0000000000000..e4be97ff285ee --- /dev/null +++ b/vllm/hpu/rotary_embed.py @@ -0,0 +1,110 @@ +import torch +import torch.nn as nn + +def get_device_name(): + """ + Returns the name of the current device: Gaudi or Gaudi2. 
+ + Inspired by: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 + """ + import habana_frameworks.torch.utils.experimental as htexp + + device_type = htexp._get_device_type() + + if device_type == htexp.synDeviceType.synDeviceGaudi: + return "gaudi" + elif device_type == htexp.synDeviceType.synDeviceGaudi2: + return "gaudi2" + else: + raise ValueError(f"Unsupported device: the device type is {device_type}.") + +# TODO: remove this workaround when FusedRoPE properly works on Gaudi +if get_device_name() == "gaudi2": + try: + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV1 as FusedRoPE + except ImportError: + print("Not using HPU fused kernel for apply_rotary_pos_emb") + FusedRoPE = None +else: + FusedRoPE = None + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids]#.unsqueeze(unsqueeze_dim) + sin = sin[position_ids]#.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class HpuRotaryEmbedding(nn.Module): + def __init__(self, head_size, rotary_dim, max_position_embeddings=2048, base=10000, is_neox_style=None, device='cuda'): + super().__init__() + + self.head_size = head_size + self.dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): + seq_len = key.shape[-2] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) + + cos, sin = self.cos_cached[:seq_len].to(dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) + query = query.reshape((query.shape[0], query.shape[1], query.shape[2] // self.head_size, self.head_size)) + key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) + if query.device.type == "hpu" and FusedRoPE: + if len(positions[0]) == 1: + cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) + sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) + else: + cos = cos[positions].unsqueeze(2) + sin = sin[positions].unsqueeze(2) + query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) + else: + query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) + return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) \ No newline at end of file diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index f0955671bdf82..a6f6cca70480d 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -151,22 +151,35 @@ def forward( key = key.unflatten(0, (batch_size, seq_len)) value = value.unflatten(0, (batch_size, seq_len)) - cu_seq_lens = [0] - for i in range(len(input_metadata.prompt_lens)): - cu_seq_lens.append(cu_seq_lens[-1] + input_metadata.prompt_lens[i]) - input_metadata.cu_seq_lens = cu_seq_lens - out = xops.memory_efficient_attention_forward( - query, - key, - value, - cu_seq_lens=cu_seq_lens, - attn_bias=input_metadata.attn_bias, - p=0.0, - scale=self.scale, - ) - output = torch.zeros_like(query) - output[:, :out.shape[1], :, :] = out - output = output.view_as(query) + if is_hpu(): + cu_seq_lens = [0] + for i in range(len(input_metadata.prompt_lens)): + cu_seq_lens.append(cu_seq_lens[-1] + input_metadata.prompt_lens[i]) + input_metadata.cu_seq_lens = cu_seq_lens + out = xops.memory_efficient_attention_forward( + query, + key, + value, + cu_seq_lens, + attn_bias=input_metadata.attn_bias, + p=0.0, + scale=self.scale, + ) + output = torch.zeros_like(query) + output[:, :out.shape[1], :, :] = out + output = output.view_as(query) + else: + out = xops.memory_efficient_attention_forward( + query, + key, + value, + attn_bias=input_metadata.attn_bias, + p=0.0, + scale=self.scale, + op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if + (is_hip()) else None, + ) + output = out.view_as(query) else: # Decoding run. 
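The HpuRotaryEmbedding introduced above uses the standard rotate-half formulation (q * cos + rotate_half(q) * sin) with a precomputed cos/sin cache. A small self-contained sketch of that math on toy tensors, independent of any HPU kernel (all sizes are made up):

import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

dim, max_pos = 8, 16
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
t = torch.arange(max_pos, dtype=inv_freq.dtype)
freqs = torch.einsum("i,j->ij", t, inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)          # [max_pos, dim], like cos/sin cache
cos_cached, sin_cached = emb.cos(), emb.sin()

q = torch.randn(1, 4, dim)                       # [batch, seq, head_dim]
positions = torch.arange(4).unsqueeze(0)         # [batch, seq]
cos, sin = cos_cached[positions], sin_cached[positions]
q_rot = q * cos + rotate_half(q) * sin           # same formula as apply_rotary_pos_emb
assert q_rot.shape == q.shape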
if key_cache is not None and value_cache is not None: @@ -247,8 +260,8 @@ def _paged_attention( # For context len > 8192, use V2 kernel to avoid shared memory shortage. use_v1 = input_metadata.max_context_len <= 8192 and ( max_num_partitions == 1 or num_seqs * num_heads > 512) - if use_v1 or is_hpu(): - # Run PagedAttention V1. + + if is_hpu(): output = ops.paged_attention_v1( query, key_cache, @@ -262,33 +275,49 @@ def _paged_attention( alibi_slopes, ) else: - # Run PagedAttention V2. - assert _PARTITION_SIZE % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - input_metadata.block_tables, - input_metadata.context_lens, - block_size, - input_metadata.max_context_len, - alibi_slopes, - ) + if use_v1: + # Run PagedAttention V1. + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + ) + else: + # Run PagedAttention V2. + assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + ) return output diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 19d52ba0fcaff..201a5142e6466 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -30,6 +30,7 @@ from vllm.utils import is_hpu if is_hpu(): from vllm.hpu import ops + from vllm.hpu.rotary_embed import HpuRotaryEmbedding else: from vllm._C import ops @@ -47,115 +48,6 @@ def _rotate_gptj(x: torch.Tensor) -> torch.Tensor: return x.flatten(-2) -def get_device_name(): - """ - Returns the name of the current device: Gaudi or Gaudi2. 
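The dispatch above keeps HPU on the ported V1 kernel and lets the CUDA path choose between V1 and V2 with the existing heuristic. A tiny sketch of that selection logic in isolation (the 512-token partition size and thresholds are copied from the hunk above; the function name is illustrative):

_PARTITION_SIZE = 512

def use_paged_attention_v1(max_context_len: int, num_seqs: int,
                           num_heads: int, on_hpu: bool) -> bool:
    if on_hpu:
        return True  # only paged_attention_v1 is ported for HPU in this series
    max_num_partitions = (max_context_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE
    # V1 when the context fits a single partition or the batch is already large;
    # otherwise fall back to V2 for long contexts.
    return max_context_len <= 8192 and (max_num_partitions == 1
                                        or num_seqs * num_heads > 512)

print(use_paged_attention_v1(512, num_seqs=8, num_heads=32, on_hpu=False))    # True
print(use_paged_attention_v1(16384, num_seqs=8, num_heads=32, on_hpu=False))  # False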
- - Inspired by: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 - """ - import habana_frameworks.torch.utils.experimental as htexp - - device_type = htexp._get_device_type() - - if device_type == htexp.synDeviceType.synDeviceGaudi: - return "gaudi" - elif device_type == htexp.synDeviceType.synDeviceGaudi2: - return "gaudi2" - else: - raise ValueError(f"Unsupported device: the device type is {device_type}.") - -# TODO: remove this workaround when FusedRoPE properly works on Gaudi -if get_device_name() == "gaudi2": - try: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV1 as FusedRoPE - except ImportError: - print("Not using HPU fused kernel for apply_rotary_pos_emb") - FusedRoPE = None -else: - FusedRoPE = None - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos[position_ids]#.unsqueeze(unsqueeze_dim) - sin = sin[position_ids]#.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaRotaryEmbedding(nn.Module): - def __init__(self, head_size, rotary_dim, max_position_embeddings=2048, base=10000, is_neox_style=None, device='cuda'): - super().__init__() - - self.head_size = head_size - self.dim = rotary_dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): - seq_len = key.shape[-2] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) - - cos, sin = self.cos_cached[:seq_len].to(dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) - query = query.reshape((query.shape[0], query.shape[1], query.shape[2] // self.head_size, self.head_size)) - key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) - if query.device.type == "hpu" and FusedRoPE: - if len(positions[0]) == 1: - cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) - sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) - else: - cos = cos[positions].unsqueeze(2) - sin = sin[positions].unsqueeze(2) - query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) - else: - query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) - return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) - - class RotaryEmbedding(nn.Module): """Original rotary positional embedding.""" @@ -456,8 +348,12 @@ def get_rope( return _ROPE_DICT[key] if rope_scaling is None: - rotary_emb = LlamaRotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style) + if is_hpu(): + rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style) + else: + rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style) else: scaling_type = rope_scaling["type"] scaling_factor = rope_scaling["factor"] From 07671d7ebeb313acbd92366327feacd3435d047f Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Thu, 8 Feb 2024 12:40:17 +0200 Subject: [PATCH 28/43] vllm.core cleanup --- vllm/core/scheduler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index e13da6f88580a..ca28bbdc2fb95 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -137,8 +137,6 @@ def _schedule(self) -> SchedulerOutputs: # sequence groups are added to the front and the new sequence groups # are added to the back. 
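get_rope above now returns HpuRotaryEmbedding on Gaudi and the original RotaryEmbedding elsewhere. A condensed sketch of that selection with stand-in classes, so it runs without vLLM installed (everything except the constructor arguments is a placeholder):

class RotaryEmbedding:        # stand-in for the CUDA/ROCm implementation
    def __init__(self, head_size, rotary_dim, max_position, base, is_neox_style):
        self.kind = "cuda"

class HpuRotaryEmbedding:     # stand-in for vllm.hpu.rotary_embed.HpuRotaryEmbedding
    def __init__(self, head_size, rotary_dim, max_position, base, is_neox_style):
        self.kind = "hpu"

def is_hpu() -> bool:
    return False              # placeholder; the real check lives in vllm.utils

def get_rope(head_size, rotary_dim, max_position, base, is_neox_style=True):
    cls = HpuRotaryEmbedding if is_hpu() else RotaryEmbedding
    return cls(head_size, rotary_dim, max_position, base, is_neox_style)

print(get_rope(64, 64, 2048, 10000).kind)   # "cuda" with the placeholder above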
while self.waiting: - if len(scheduled) == 4: - break seq_group = self.waiting[0] assert seq_group.num_seqs() == 1, ( From 413fb6065bde5bda44409e15994a80874e4526b4 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Thu, 8 Feb 2024 12:41:11 +0200 Subject: [PATCH 29/43] vllm init cleanup --- vllm/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/__init__.py b/vllm/__init__.py index 9f25f62bd2c1a..138882d1a5a24 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,6 +1,4 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" -import habana_frameworks.torch.gpu_migration - from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine From a38686e127cf9313748ecc2a80cd14e42d6b0aa0 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 9 Feb 2024 14:09:20 +0200 Subject: [PATCH 30/43] vllm.hpu cleanup --- vllm/hpu/__init__.py | 2 +- vllm/hpu/attn_bias.py | 2 +- vllm/hpu/cache_ops.py | 2 +- vllm/hpu/cuda_utils.py | 2 +- vllm/hpu/rotary_embed.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py index ce3a3ce5d435c..3edd0d0f2dc99 100644 --- a/vllm/hpu/__init__.py +++ b/vllm/hpu/__init__.py @@ -8,4 +8,4 @@ # and is subject to the confidentiality and license agreements under which it # was provided. # -############################################################################### \ No newline at end of file +############################################################################### diff --git a/vllm/hpu/attn_bias.py b/vllm/hpu/attn_bias.py index ac3ce8e6784cc..ff508a59cc56a 100644 --- a/vllm/hpu/attn_bias.py +++ b/vllm/hpu/attn_bias.py @@ -761,4 +761,4 @@ def _create_block_mask( mask, diagonal=num_keys - num_queries - self._window_size + 1 ) mask = torch.log(mask) - return mask.to(dtype) \ No newline at end of file + return mask.to(dtype) diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 5c678587c6ff9..de1bc9909ee85 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -43,4 +43,4 @@ def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, is_promp value_cache[slot_indices[i][0], slot_indices[i][1], :, :] = value[i] index.add_(1) key_cache = key_cache.permute(0, 2, 3, 1) - value_cache = value_cache.permute(0, 2, 3, 1) \ No newline at end of file + value_cache = value_cache.permute(0, 2, 3, 1) diff --git a/vllm/hpu/cuda_utils.py b/vllm/hpu/cuda_utils.py index cb067fca13cca..f9a019431e4c5 100644 --- a/vllm/hpu/cuda_utils.py +++ b/vllm/hpu/cuda_utils.py @@ -11,4 +11,4 @@ ############################################################################### def get_device_attribute(attribute, device_id): - return 10240 # TODO: fake value now \ No newline at end of file + return 10240 # TODO: fake value now diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index e4be97ff285ee..72489c568920a 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -107,4 +107,4 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tenso query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) else: query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) - return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) \ No newline at end of file + return query.reshape((query.shape[0], 
query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) From bed7da6e56184098cc8f451de6876bb4c88a7327 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 9 Feb 2024 17:09:46 +0200 Subject: [PATCH 31/43] vllm.benchmarks cleanup --- benchmarks/benchmark_latency.py | 2 ++ benchmarks/benchmark_serving.py | 3 +++ benchmarks/benchmark_throughput.py | 6 ++---- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index e33d5fb2dc247..17b207544295b 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -6,6 +6,8 @@ import numpy as np import torch +if torch.version.cuda is None and torch.version.hip is None: + import habana_frameworks.torch as htorch from tqdm import tqdm from vllm import LLM, SamplingParams diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 3a80e679191e3..bbf59b034ac24 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -25,6 +25,9 @@ import aiohttp import numpy as np from transformers import PreTrainedTokenizerBase +import torch +if torch.version.cuda is None and torch.version.hip is None: + import habana_frameworks.torch as htorch from vllm.transformers_utils.tokenizer import get_tokenizer # (prompt len, output len, latency) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 43b368f020471..97ee9b687fb6b 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,6 +6,8 @@ from typing import List, Optional, Tuple import torch +if torch.version.cuda is None and torch.version.hip is None: + import habana_frameworks.torch as htorch from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) from tqdm import tqdm @@ -90,10 +92,6 @@ def run_vllm( dtype=dtype, max_model_len=max_model_len, enforce_eager=enforce_eager, - max_num_batched_tokens=(16 * 128), - max_num_seqs=20, - max_paddings=(16 * 128), - block_size=32, ) # Add the requests to the engine. From 0baa2ef8f1bf70962cca88c363686826c096a5be Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 9 Feb 2024 17:11:28 +0200 Subject: [PATCH 32/43] vllm.entrypoint cleanup --- setup.py | 8 +++++--- vllm/entrypoints/api_server.py | 10 ---------- vllm/entrypoints/llm.py | 15 +++++---------- 3 files changed, 10 insertions(+), 23 deletions(-) diff --git a/setup.py b/setup.py index f4e8f1b9545c1..f182a0084fae1 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"} -#SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) +# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) def _is_hip() -> bool: return torch.version.hip is not None @@ -103,7 +103,7 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version: def get_torch_arch_list() -> Set[str]: - if _is_cuda(): + if _is_cuda() or _is_hip(): # TORCH_CUDA_ARCH_LIST can have one or more architectures, # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the # compiler to additionally include PTX code that can be runtime-compiled @@ -139,7 +139,9 @@ def get_torch_arch_list() -> Set[str]: f"{valid_archs}.", stacklevel=2) return arch_list - + else: + return set() + # First, check the TORCH_CUDA_ARCH_LIST environment variable. 
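With the change above, get_torch_arch_list returns an empty set on builds that are neither CUDA nor HIP, so HPU installs skip architecture handling altogether. A minimal sketch of the same guard using only the standard library (the parsing here is illustrative, not a copy of setup.py):

import os

def get_torch_arch_list_sketch(is_cuda_or_hip: bool) -> set:
    # Non-GPU (HPU) builds return an empty set, matching the new else branch.
    if not is_cuda_or_hip:
        return set()
    raw = os.environ.get("TORCH_CUDA_ARCH_LIST", "")
    # Illustrative parsing only: accept space- or semicolon-separated entries.
    return {a for a in raw.replace(";", " ").split() if a}

print(get_torch_arch_list_sketch(False))                  # set()
os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6+PTX"
print(get_torch_arch_list_sketch(True))                   # {'8.0', '8.6+PTX'}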
compute_capabilities = get_torch_arch_list() if _is_cuda() and not compute_capabilities: diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 74d18efe3c7f8..bdb35df8878ca 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -69,16 +69,6 @@ async def stream_results() -> AsyncGenerator[bytes, None]: prompt = final_output.prompt text_outputs = [prompt + output.text for output in final_output.outputs] ret = {"text": text_outputs} - DEBUG = True - if DEBUG: - text_tokens = [output.token_ids for output in final_output.outputs] - from vllm.transformers_utils.tokenizer import get_tokenizer - tokenizer = get_tokenizer('lmsys/vicuna-7b-v1.3') - decoded_tokens = [tokenizer.decode(token_ids) for token_ids in text_tokens] - ret["DEBUG"] = { - 'tokens': text_tokens, - 'decoded_tokens': decoded_tokens, - } return JSONResponse(ret) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e399de249c9c3..8220ccd406f03 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -8,9 +8,9 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.utils import Counter +from vllm.utils import is_hpu import torch -import habana_frameworks.torch as htorch class LLM: """An LLM for generating texts from given prompts and sampling parameters. @@ -181,7 +181,8 @@ def _run_engine(self, use_tqdm: bool, profiling: bool = False) -> List[RequestOu if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() pbar = tqdm(total=num_requests, desc="Processed prompts") - if profiling: + + if profiling and is_hpu(): prof = torch.profiler.profile( schedule = torch.profiler.schedule(wait=6, warmup=0, active=2, repeat=1), activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU], @@ -196,21 +197,15 @@ def _run_engine(self, use_tqdm: bool, profiling: bool = False) -> List[RequestOu outputs: List[RequestOutput] = [] while self.llm_engine.has_unfinished_requests(): step_outputs = self.llm_engine.step() - print("vLLM completed a step") - if profiling: - count += 1 - print(f"Processing step {count}") - if count == 8: - break for output in step_outputs: if output.finished: outputs.append(output) if use_tqdm: pbar.update(1) - if profiling: + if profiling and is_hpu(): htorch.core.mark_step() prof.step() - if profiling: + if profiling and is_hpu(): htorch.hpu.synchronize() prof.stop() if use_tqdm: From 1f22aa177ab82b76ce4700951a17190278ac53a4 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Tue, 13 Feb 2024 16:53:13 +0200 Subject: [PATCH 33/43] Changed is_hpu logic --- vllm/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index 2b78f31946e24..29bb24a5f8b56 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,6 +1,7 @@ import enum import socket import uuid +import importlib from platform import uname import psutil @@ -31,7 +32,7 @@ def is_hip() -> bool: def is_hpu() -> bool: - return getattr(torch, 'hpu', None) is not None and torch.hpu.is_available() + return importlib.util.find_spec('habana_frameworks') is not None if is_hpu(): From eb2c22a46874c0e0d4011c17d6bce7ad54dfb0d1 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Tue, 13 Feb 2024 16:54:00 +0200 Subject: [PATCH 34/43] vllm.benchmark cleanup --- benchmarks/benchmark_latency.py | 3 ++- benchmarks/benchmark_serving.py | 3 ++- benchmarks/benchmark_throughput.py | 3 ++- benchmarks/run_benchmark_bloom560m.sh | 28 --------------------------- 4 files changed, 
6 insertions(+), 31 deletions(-) delete mode 100755 benchmarks/run_benchmark_bloom560m.sh diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 17b207544295b..f550aba060e38 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -6,7 +6,8 @@ import numpy as np import torch -if torch.version.cuda is None and torch.version.hip is None: +from vllm.utils import is_hpu +if is_hpu(): import habana_frameworks.torch as htorch from tqdm import tqdm diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index bbf59b034ac24..bb28d700fc321 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -26,7 +26,8 @@ import numpy as np from transformers import PreTrainedTokenizerBase import torch -if torch.version.cuda is None and torch.version.hip is None: +from vllm.utils import is_hpu +if is_hpu(): import habana_frameworks.torch as htorch from vllm.transformers_utils.tokenizer import get_tokenizer diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 97ee9b687fb6b..6b4a0ff031ee8 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,7 +6,8 @@ from typing import List, Optional, Tuple import torch -if torch.version.cuda is None and torch.version.hip is None: +from vllm.utils import is_hpu +if is_hpu(): import habana_frameworks.torch as htorch from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) diff --git a/benchmarks/run_benchmark_bloom560m.sh b/benchmarks/run_benchmark_bloom560m.sh deleted file mode 100755 index 13726bc3f46c0..0000000000000 --- a/benchmarks/run_benchmark_bloom560m.sh +++ /dev/null @@ -1,28 +0,0 @@ -cd /software/users/mdvoretckii/huda -source reset.sh -cd /software/users/mdvoretckii/habana_vllm -python -m pip install -e . -python -m pip install xformers --no-deps -cd benchmarks -#python benchmark_throughput.py --tokenizer bigscience/bloom-560m --model bigscience/bloom-560m --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100 -python benchmark_throughput.py --tokenizer lmsys/vicuna-7b-v1.3 --model lmsys/vicuna-7b-v1.3 --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100 -#curl -X POST -H "Accept: Application/json" -H "Content-Type: application/json" http://localhost:8000/generate -d '{"prompt":"Would you like a jelly baby?","use_beam_search":false,"n":1}' - - -# Missing ops: -# Bloom: alibi -# llama: RMS Norm, RoPE, fused silu, fail in sample -# --- -# GPT2: gelu_new -# Aquila: issues with external source -# Baichuan: no tokenizer -# Falcon: fail in sample -# Falcon RW: TypeError: memory_efficient_attention_forward() missing 1 required positional argument: 'cu_seq_lens' -# GPT BigCode: gated, santacoder fails in sample (not affected by CPU RoPE) -# GPT-J: gelu_new -# GPT-NeoX: gelu_fast -# InternLM: no tokenizer class -# Mistral: max_num_batched_tokens (2048) is smaller than max_model_len (32768). 
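The benchmarks above now gate the habana_frameworks import behind is_hpu(). A standalone sketch of that pattern, with the importlib-based check from vllm.utils reproduced inline (the mark_step helper is illustrative glue, not part of the patch):

import importlib.util

def is_hpu() -> bool:
    # True when the Habana SynapseAI PyTorch bridge is installed.
    return importlib.util.find_spec("habana_frameworks") is not None

if is_hpu():
    import habana_frameworks.torch as htorch   # noqa: F401
else:
    htorch = None  # CUDA/ROCm runs never touch the HPU-only helpers

def mark_step() -> None:
    # Flushes pending lazy-mode ops on HPU; skipped everywhere else.
    if htorch is not None:
        htorch.core.mark_step()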
-# MPT: TypeError: memory_efficient_attention_forward() missing 1 required positional argument: 'cu_seq_lens' -# OPT: fail in sample -# Qwen: no tokenizer class From e69fca6e11c9f9f618c4833e13f232bed12134f1 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Tue, 13 Feb 2024 16:56:20 +0200 Subject: [PATCH 35/43] Fixed importing condition --- vllm/entrypoints/api_server.py | 3 ++- vllm/entrypoints/openai/api_server.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index bdb35df8878ca..629f329a568c4 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -2,7 +2,8 @@ import json from typing import AsyncGenerator import torch -if torch.version.cuda is None and torch.version.hip is None: +from vllm.utils import is_hpu +if is_hpu(): import habana_frameworks.torch.core as htcore import habana_frameworks.torch.gpu_migration from fastapi import FastAPI, Request diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1d7272fb8b05e..d3062e9220dd8 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -9,7 +9,8 @@ from http import HTTPStatus from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union import torch -if torch.version.cuda is None and torch.version.hip is None: +from vllm.utils import is_hpu +if is_hpu(): import habana_frameworks.torch.core as htcore import habana_frameworks.torch.gpu_migration from aioprometheus import MetricsMiddleware From 38cc53bebd71e638819e1febc3fe474b4c13cdfe Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Tue, 13 Feb 2024 16:58:19 +0200 Subject: [PATCH 36/43] tests cleanup --- tests/async_engine/test_api_server.py | 2 +- tests/conftest.py | 35 +++++++--- tests/kernels/test_attention.py | 95 ++++++++++++++++----------- tests/samplers/test_beam_search.py | 2 +- tests/samplers/test_logprobs.py | 4 +- 5 files changed, 86 insertions(+), 52 deletions(-) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 2eb1b2606b80e..c61b2394cd88a 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -26,7 +26,7 @@ def api_server(): "api_server_async_engine.py").absolute() uvicorn_process = subprocess.Popen([ sys.executable, "-u", - str(script_path), "--model", "lmsys/vicuna-7b-v1.3" + str(script_path), "--model", "facebook/opt-125m", ]) yield uvicorn_process.terminate() diff --git a/tests/conftest.py b/tests/conftest.py index 7b73aaff6f6c9..fa24c667f93d3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,10 @@ import pytest import torch +from vllm.utils import is_hpu +if is_hpu(): + import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.gpu_migration from transformers import AutoModelForCausalLM from vllm import LLM, SamplingParams @@ -53,11 +57,18 @@ def __init__( ) -> None: assert dtype in _STR_DTYPE_TO_TORCH_DTYPE torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] - self.model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=torch_dtype, - trust_remote_code=True, - )#.cuda() + if is_hpu(): + self.model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ) + else: + self.model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ).cuda() if tokenizer_name is None: tokenizer_name = model_name self.tokenizer = 
get_tokenizer(tokenizer_name, trust_remote_code=True) @@ -69,9 +80,12 @@ def generate( ) -> List[Tuple[List[int], str]]: outputs: List[Tuple[List[int], str]] = [] for prompt in prompts: - input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids + if is_hpu(): + input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids + else: + input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.cuda output_ids = self.model.generate( - input_ids,#.cuda(), + input_ids, use_cache=True, **kwargs, ) @@ -125,9 +139,12 @@ def generate_greedy_logprobs( ) -> List[List[torch.Tensor]]: all_logprobs = [] for prompt in prompts: - input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids + if is_hpu(): + input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids + else: + input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.cuda() output = self.model.generate( - input_ids,#.cuda(), + input_ids, use_cache=True, do_sample=False, max_new_tokens=max_tokens, diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index f2242d7d95e49..f2054bba05a74 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -6,6 +6,8 @@ from vllm.utils import get_max_shared_memory_bytes, is_hpu if is_hpu(): + import habana_frameworks.torch.core as htcore + import habana_frameworks.torch.gpu_migration from vllm.hpu import ops from vllm.hpu import xops from vllm.hpu.attn_bias import BlockDiagonalCausalMask @@ -21,6 +23,9 @@ NUM_BLOCKS = 40000 # Arbitrary values for testing PARTITION_SIZE = 512 +VERSION = ["v1", "v2"] +if is_hpu(): + VERSION.pop() DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_GEN_SEQS = [7] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing @@ -105,7 +110,7 @@ def ref_single_query_cached_kv_attention( output[i].copy_(out, non_blocking=True) -@pytest.mark.parametrize("version", ["v1"])#, "v2"]) +@pytest.mark.parametrize("version", VERSION) @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -169,53 +174,67 @@ def test_paged_attention( # Call the paged attention kernel. 
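The kernel test above trims the parametrized version list on HPU, where only the V1 kernel is ported. A small sketch of the same pattern with pytest (the is_hpu stub stands in for vllm.utils.is_hpu; the test body is illustrative):

import importlib.util
import pytest

def is_hpu() -> bool:
    return importlib.util.find_spec("habana_frameworks") is not None

VERSIONS = ["v1", "v2"]
if is_hpu():
    VERSIONS.remove("v2")   # paged_attention_v2 has no HPU port in this series

@pytest.mark.parametrize("version", VERSIONS)
def test_version_is_supported(version):
    assert version == "v1" or not is_hpu()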
output = torch.empty_like(query) - if version == "v1": + if is_hpu(): output = ops.paged_attention_v1( query, key_cache, value_cache, num_kv_heads, scale, - block_tables, - context_lens, - block_size, - max_context_len, - alibi_slopes, - ) - elif version == "v2": - num_partitions = ((max_context_len + PARTITION_SIZE - 1) // - PARTITION_SIZE) - assert PARTITION_SIZE % block_size == 0 - num_seqs, num_heads, head_size = output.shape - tmp_output = torch.empty( - size=(num_seqs, num_heads, num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - context_lens, + input_metadata.block_tables, + input_metadata.context_lens, block_size, - max_context_len, + input_metadata.max_context_len, alibi_slopes, ) else: - raise AssertionError(f"Unknown version: {version}") + if version == "v1": + output = ops.paged_attention_v1( + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes, + ) + elif version == "v2": + num_partitions = ((max_context_len + PARTITION_SIZE - 1) // + PARTITION_SIZE) + assert PARTITION_SIZE % block_size == 0 + num_seqs, num_heads, head_size = output.shape + tmp_output = torch.empty( + size=(num_seqs, num_heads, num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes, + ) + else: + raise AssertionError(f"Unknown version: {version}") # Run the reference implementation. ref_output = torch.empty_like(query) diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 4cf777e2b9e6f..a491ffa763505 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -10,7 +10,7 @@ # 3. Use the model "huggyllama/llama-7b". 
MAX_TOKENS = [128] BEAM_WIDTHS = [4] -MODELS = ["lmsys/vicuna-7b-v1.3"] +MODELS = ["facebook/opt-125m"] @pytest.mark.parametrize("model", MODELS) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 24b1572d9a325..b1a5e1f538a7b 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -3,7 +3,7 @@ from vllm import SamplingParams -MODELS = ["lmsys/vicuna-7b-v1.3"] +MODELS = ["facebook/opt-125m"] @pytest.mark.parametrize("model", MODELS) @@ -24,8 +24,6 @@ def test_get_prompt_logprobs( del hf_model vllm_model = vllm_runner(model, dtype=dtype) - import pdb - pdb.set_trace() vllm_sampling_params = SamplingParams(max_tokens=max_tokens, logprobs=5, prompt_logprobs=5, From 54d499a006a23a9740f65153b74accc2b3ab27a3 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Tue, 13 Feb 2024 17:00:59 +0200 Subject: [PATCH 37/43] removed dummy printings --- vllm/model_executor/models/llama.py | 1 - vllm/worker/worker.py | 1 - 2 files changed, 2 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 7722cc140326d..b3b24ea6fea44 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -283,7 +283,6 @@ def forward( kv_caches: List[KVCache], input_metadata: InputMetadata, ) -> torch.Tensor: - print(f'Input shape: {input_ids.shape}') hidden_states = self.model(input_ids, positions, kv_caches, input_metadata) return hidden_states diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 49600689f4c6a..94cf44e5f6d6b 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -324,7 +324,6 @@ def round_up(n, multiple): elif (i + 1) * self.block_size <= context_lens[seq_id]: attn_masks[i][seq_id, :] = 1 input_metadata.attention_masks = attn_masks.to(device="cuda") - print("input token shape: ", tokens_tensor.shape) return tokens_tensor, positions_tensor, input_metadata @torch.inference_mode() From c0ea99caeb0b380867a7f838ed6c9ed33959f836 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Tue, 13 Feb 2024 16:02:48 +0100 Subject: [PATCH 38/43] Update test_api_server.py --- tests/async_engine/test_api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index c61b2394cd88a..d90ba37b27bb9 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -26,7 +26,7 @@ def api_server(): "api_server_async_engine.py").absolute() uvicorn_process = subprocess.Popen([ sys.executable, "-u", - str(script_path), "--model", "facebook/opt-125m", + str(script_path), "--model", "facebook/opt-125m" ]) yield uvicorn_process.terminate() From ea3ea4410158c41c54177310899560fddbcdc27a Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Wed, 14 Feb 2024 18:41:58 +0200 Subject: [PATCH 39/43] restored attention and logprobs tests functionality on Nvidia --- tests/kernels/test_attention.py | 41 ++++++++++++++++++++++----------- tests/samplers/test_logprobs.py | 2 +- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index f2054bba05a74..e7f2f5bb395ef 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -181,15 +181,16 @@ def test_paged_attention( value_cache, num_kv_heads, scale, - input_metadata.block_tables, - input_metadata.context_lens, + block_tables, + context_lens, block_size, - input_metadata.max_context_len, + max_context_len, 
alibi_slopes, ) else: if version == "v1": - output = ops.paged_attention_v1( + ops.paged_attention_v1( + output, query, key_cache, value_cache, @@ -331,19 +332,31 @@ def test_multi_query_kv_attention( key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens) - output = xops.memory_efficient_attention_forward( - query.unsqueeze(0), - key.unsqueeze(0), - value.unsqueeze(0), - attn_bias=attn_bias, - p=0.0, - scale=scale, - ) - output = output.squeeze(0) - cu_seq_lens = [0] for seq_len in seq_lens: cu_seq_lens.append(cu_seq_lens[-1] + seq_len) + + if is_hpu(): + output = xops.memory_efficient_attention_forward( + query.unsqueeze(0), + key.unsqueeze(0), + value.unsqueeze(0), + cu_seq_lens, + attn_bias=attn_bias, + p=0.0, + scale=scale, + ) + else: + output = xops.memory_efficient_attention_forward( + query.unsqueeze(0), + key.unsqueeze(0), + value.unsqueeze(0), + attn_bias=attn_bias, + p=0.0, + scale=scale, + ) + output = output.squeeze(0) + ref_output = ref_multi_query_kv_attention( cu_seq_lens, query, diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index b1a5e1f538a7b..1c67cc5bd7394 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -16,7 +16,7 @@ def test_get_prompt_logprobs( example_prompts, ): max_tokens = 5 - hf_model = hf_runner(model, dtype="float") + hf_model = hf_runner(model, dtype=dtype) hf_logprobs = hf_model.generate_greedy_logprobs( example_prompts, max_tokens=max_tokens, From 5543642d19229a898fdee4b36a2faf169207d1c7 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 16 Feb 2024 12:43:45 +0200 Subject: [PATCH 40/43] throughput benchmark cleanup --- benchmarks/benchmark_throughput.py | 12 +++--------- vllm/__init__.py | 1 + 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 6b4a0ff031ee8..9afb4721dd01c 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -38,16 +38,11 @@ def sample_requests( completions = [completion for _, completion in dataset] completion_token_ids = tokenizer(completions).input_ids tokenized_dataset = [] - count = 0 for i in range(len(dataset)): - count += 1 - i = i % 4 output_len = len(completion_token_ids[i]) if fixed_output_len is not None: output_len = fixed_output_len tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) - if count == num_requests: - break # Filter out too long sequences. filtered_dataset: List[Tuple[str, int, int]] = [] @@ -61,10 +56,9 @@ def sample_requests( continue filtered_dataset.append((prompt, prompt_len, output_len)) - # # Sample the requests. - # sampled_requests = random.sample(filtered_dataset, num_requests) - # return sampled_requests - return filtered_dataset + # Sample the requests. 
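The multi-query test above compares the fused op against a per-sequence reference. A compact sketch of such a reference for variable-length causal attention in plain PyTorch (the cu_seq_lens convention follows the test; shapes and names are otherwise illustrative):

import torch

def ref_varlen_causal_attention(query, key, value, cu_seq_lens, scale):
    # query/key/value: [total_tokens, num_heads, head_dim]; sequences concatenated.
    outputs = []
    for start, end in zip(cu_seq_lens[:-1], cu_seq_lens[1:]):
        q, k, v = query[start:end], key[start:end], value[start:end]
        scores = torch.einsum("qhd,khd->hqk", q, k) * scale
        causal = torch.triu(torch.ones(end - start, end - start, dtype=torch.bool), 1)
        scores = scores.masked_fill(causal, float("-inf"))
        probs = scores.softmax(dim=-1)
        outputs.append(torch.einsum("hqk,khd->qhd", probs, v))
    return torch.cat(outputs, dim=0)

q = torch.randn(10, 4, 8); k = torch.randn(10, 4, 8); v = torch.randn(10, 4, 8)
out = ref_varlen_causal_attention(q, k, v, [0, 3, 8, 10], scale=8 ** -0.5)
assert out.shape == q.shape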
+ sampled_requests = random.sample(filtered_dataset, num_requests) + return sampled_requests def run_vllm( diff --git a/vllm/__init__.py b/vllm/__init__.py index 138882d1a5a24..e5cd1c2f3334b 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,4 +1,5 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" + from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine From a2acb8699b2b6316b3e4085ae94087b3acf47ea6 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 16 Feb 2024 16:12:06 +0200 Subject: [PATCH 41/43] Changed Habana copyright header --- vllm/hpu/__init__.py | 10 +--------- vllm/hpu/cache_ops.py | 10 +--------- vllm/hpu/cuda_utils.py | 10 +--------- vllm/hpu/ops.py | 10 +--------- vllm/hpu/rotary_embed.py | 4 ++++ vllm/hpu/xops.py | 11 +---------- 6 files changed, 9 insertions(+), 46 deletions(-) diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py index 3edd0d0f2dc99..c6a95a54d3d95 100644 --- a/vllm/hpu/__init__.py +++ b/vllm/hpu/__init__.py @@ -1,11 +1,3 @@ ############################################################################### -# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index de1bc9909ee85..913fca2ce56f0 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -1,13 +1,5 @@ ############################################################################### -# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### from typing import Tuple diff --git a/vllm/hpu/cuda_utils.py b/vllm/hpu/cuda_utils.py index f9a019431e4c5..50e8c39076dc0 100644 --- a/vllm/hpu/cuda_utils.py +++ b/vllm/hpu/cuda_utils.py @@ -1,13 +1,5 @@ ############################################################################### -# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### def get_device_attribute(attribute, device_id): diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 0454814091562..9e75695b8846f 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -1,13 +1,5 @@ ############################################################################### -# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### import torch diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 72489c568920a..679acba6924b1 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -1,3 +1,7 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + import torch import torch.nn as nn diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py index 7309a1f0c1fbd..a28bd7e1dfcf2 100644 --- a/vllm/hpu/xops.py +++ b/vllm/hpu/xops.py @@ -1,16 +1,7 @@ ############################################################################### -# Copyright (C) 2023 Habana Labs, Ltd. an Intel Company -# All Rights Reserved. -# -# Unauthorized copying of this file or any element(s) within it, via any medium -# is strictly prohibited. -# This file contains Habana Labs, Ltd. proprietary and confidential information -# and is subject to the confidentiality and license agreements under which it -# was provided. -# +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### - import habana_frameworks.torch as htorch import torch import torch.nn.functional as F From 956bab7095c5f2c831b36cb3bde87f888fee7f31 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 16 Feb 2024 16:19:57 +0200 Subject: [PATCH 42/43] Restored alibi in bloom --- vllm/model_executor/models/bloom.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 3a557288c3dd1..6d1aeeed78e93 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -109,8 +109,8 @@ def __init__( scaling = self.head_dim**-0.5 self.attn = PagedAttention(self.num_heads, self.head_dim, - scaling)#, - #alibi_slopes=alibi_slopes) + scaling, + alibi_slopes=alibi_slopes) def forward( self, From 702d8a733ea8ef65970ff3bb81bcbbe5aeae4aa4 Mon Sep 17 00:00:00 2001 From: Sebastian Urwan Date: Fri, 16 Feb 2024 16:27:17 +0200 Subject: [PATCH 43/43] Added BSD license header --- vllm/hpu/__init__.py | 3 +++ vllm/hpu/cache_ops.py | 3 +++ vllm/hpu/cuda_utils.py | 3 +++ vllm/hpu/ops.py | 3 +++ vllm/hpu/rotary_embed.py | 3 +++ vllm/hpu/xops.py | 3 +++ 6 files changed, 18 insertions(+) diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py index c6a95a54d3d95..b8e4d3aac98a7 100644 --- a/vllm/hpu/__init__.py +++ b/vllm/hpu/__init__.py @@ -1,3 +1,6 @@ ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. ############################################################################### diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 913fca2ce56f0..fb08e4167a10a 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -1,5 +1,8 @@ ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. ############################################################################### from typing import Tuple diff --git a/vllm/hpu/cuda_utils.py b/vllm/hpu/cuda_utils.py index 50e8c39076dc0..bec242cf985c2 100644 --- a/vllm/hpu/cuda_utils.py +++ b/vllm/hpu/cuda_utils.py @@ -1,5 +1,8 @@ ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. ############################################################################### def get_device_attribute(attribute, device_id): diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 9e75695b8846f..79f8f186a2b21 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -1,5 +1,8 @@ ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
############################################################################### import torch diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 679acba6924b1..3def58b11feb6 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -1,5 +1,8 @@ ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. ############################################################################### import torch diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py index a28bd7e1dfcf2..6460cb6ac4f33 100644 --- a/vllm/hpu/xops.py +++ b/vllm/hpu/xops.py @@ -1,5 +1,8 @@ ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. ############################################################################### import habana_frameworks.torch as htorch
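Patch 42 above re-enables ALiBi slopes for BLOOM's PagedAttention. For reference, a short sketch of the standard ALiBi slope computation such slopes are typically derived from (this mirrors the published ALiBi recipe, not a specific vLLM helper):

import math
import torch

def get_alibi_slopes(num_heads: int) -> torch.Tensor:
    # Geometric series of per-head slopes, with an extra interleaved series
    # when num_heads is not a power of two.
    closest_pow2 = 2 ** math.floor(math.log2(num_heads))
    base = 2 ** (-(2 ** -(math.log2(closest_pow2) - 3)))
    slopes = [base ** (i + 1) for i in range(closest_pow2)]
    if closest_pow2 != num_heads:
        extra_base = 2 ** (-(2 ** -(math.log2(2 * closest_pow2) - 3)))
        slopes.extend(extra_base ** (i + 1)
                      for i in range(0, 2 * (num_heads - closest_pow2), 2))
    return torch.tensor(slopes)

print(get_alibi_slopes(8))   # tensor([0.5000, 0.2500, ..., 0.0039])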