From a63cba1aeba8194721b42bd1ea1dbde6d99be2be Mon Sep 17 00:00:00 2001 From: Gabriele Oliaro Date: Thu, 6 Mar 2025 09:40:03 -0500 Subject: [PATCH] FlexLLM (part 4) (#107) * add flashinfer dep * backup * backup2 * compiles * lint * fwd/bwd handlers * backup * backup * cleanup * fixes * backup * fix * fix * fix * restore multi-stream * fix * fix peft test * fix peft test * update * multi batch fix * fixes * update * update * rocm fix * shellcheck --- .github/workflows/gpu-ci.yml | 2 +- .gitmodules | 5 +- CMakeLists.txt | 5 +- deps/flashinfer | 1 + include/flexflow/attention_config.h | 209 ++ include/flexflow/batch_config.h | 26 + include/flexflow/config.h | 13 +- include/flexflow/flexflow_c.h | 104 +- include/flexflow/machine_view.h | 1 + include/flexflow/model.h | 63 +- .../ops/inc_multihead_self_attention.h | 60 +- .../ops/inc_multihead_self_attention_params.h | 2 +- .../inc_multihead_self_attention_kernels.h | 72 +- .../ops/spec_inc_multihead_self_attention.h | 7 +- ...spec_inc_multihead_self_attention_params.h | 1 + .../ops/tree_inc_multihead_self_attention.h | 7 +- ...tree_inc_multihead_self_attention_params.h | 2 +- include/flexflow/page_manager.h | 93 + include/flexflow/request_manager.h | 11 +- inference/incr_decoding/incr_decoding.cc | 22 +- inference/models/falcon.cc | 29 +- inference/models/falcon.h | 2 + inference/models/llama.cc | 62 +- inference/models/llama.h | 2 + inference/models/mpt.cc | 5 +- inference/models/mpt.h | 2 + inference/models/opt.cc | 5 +- inference/models/opt.h | 6 +- inference/models/starcoder.cc | 39 +- inference/models/starcoder.h | 2 + inference/peft/peft.cc | 4 + inference/python/chat.py | 12 +- inference/python/ff_peft.py | 17 +- inference/python/incr_decoding.py | 9 +- inference/python/spec_infer.py | 15 +- inference/spec_infer/spec_infer.cc | 24 +- python/flexflow/core/flexflow_cffi.py | 377 +--- python/flexflow/serve/models/base.py | 3 - python/flexflow/serve/models/falcon.py | 85 +- python/flexflow/serve/models/llama.py | 81 +- python/flexflow/serve/models/mpt.py | 40 +- python/flexflow/serve/models/opt.py | 42 +- python/flexflow/serve/models/starcoder.py | 38 +- python/flexflow/serve/serve.py | 38 +- python/flexflow/type.py | 18 + src/c/flexflow_c.cc | 240 +-- src/ops/attention_impl.cu | 818 +++++++ src/ops/fused.cu | 4 + src/ops/inc_multihead_self_attention.cc | 80 +- src/ops/inc_multihead_self_attention.cpp | 52 +- src/ops/inc_multihead_self_attention.cu | 1875 +++++++++-------- src/ops/kernels/linear_kernels.cu | 6 +- src/ops/kernels/lora_linear_kernels.cu | 14 +- src/ops/spec_inc_multihead_self_attention.cc | 66 +- src/ops/spec_inc_multihead_self_attention.cpp | 10 +- src/ops/spec_inc_multihead_self_attention.cu | 10 +- src/ops/tree_inc_multihead_self_attention.cc | 66 +- src/ops/tree_inc_multihead_self_attention.cpp | 12 +- src/ops/tree_inc_multihead_self_attention.cu | 12 +- .../kernels/parallel_identity_kernels.cu | 2 +- src/runtime/batch_config.cc | 12 + src/runtime/graph.cc | 16 +- src/runtime/inference_manager.cc | 38 + src/runtime/model.cc | 79 +- src/runtime/model.cu | 51 + src/runtime/operator.cc | 3 +- src/runtime/page_manager.cc | 321 +++ src/runtime/peft_weight_allocator.cc | 2 +- src/runtime/request_manager.cc | 238 ++- src/runtime/request_manager.cu | 183 ++ tests/fine_grained_alignment_test.sh | 28 +- tests/inference/cpp_inference_tests.sh | 395 ++-- tests/inference/generate_inf_test_configs.py | 2 +- tests/inference_tests.sh | 4 +- tests/peft_test.sh | 2 +- 75 files changed, 4015 insertions(+), 2289 deletions(-) create mode 160000 
deps/flashinfer create mode 100644 include/flexflow/attention_config.h create mode 100644 include/flexflow/page_manager.h create mode 100644 src/ops/attention_impl.cu create mode 100644 src/runtime/page_manager.cc diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index d57ff8334..c01702046 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -129,7 +129,7 @@ jobs: CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }} run: | source ./build/set_python_envs.sh - ./tests/fine_grained_alignment_test.sh + # ./tests/fine_grained_alignment_test.sh ./tests/inference_tests.sh - name: Run PEFT tests diff --git a/.gitmodules b/.gitmodules index a6ef18b1a..1f49e93f0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -16,4 +16,7 @@ [submodule "deps/tokenizers-cpp"] path = deps/tokenizers-cpp url = https://github.com/mlc-ai/tokenizers-cpp.git - fetchRecurseSubmodules = true \ No newline at end of file + fetchRecurseSubmodules = true +[submodule "deps/flashinfer"] + path = deps/flashinfer + url = https://github.com/flashinfer-ai/flashinfer.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 268c6d5eb..6d2386a2f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,6 +197,9 @@ include(variant) # optional include(optional) +# flashinfer +list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/flashinfer/include) + if (FF_GPU_BACKEND STREQUAL "cuda") list(APPEND FF_CC_FLAGS -DFF_USE_CUDA) @@ -220,7 +223,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND FF_CC_FLAGS -DFF_DEBUG) list(APPEND FF_NVCC_FLAGS - -DFF_DEBUG) + -DFF_DEBUG -lineinfo) endif() message(STATUS "FlexFlow MAX_DIM: ${FF_MAX_DIM}") diff --git a/deps/flashinfer b/deps/flashinfer new file mode 160000 index 000000000..be6bf5bb2 --- /dev/null +++ b/deps/flashinfer @@ -0,0 +1 @@ +Subproject commit be6bf5bb26f1f1b3edf094d903544600c574ee09 diff --git a/include/flexflow/attention_config.h b/include/flexflow/attention_config.h new file mode 100644 index 000000000..98992ff9a --- /dev/null +++ b/include/flexflow/attention_config.h @@ -0,0 +1,209 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _FLEXFLOW_ATTENTION_CONFIG_H_ +#define _FLEXFLOW_ATTENTION_CONFIG_H_ +#include "flexflow/batch_config.h" + +namespace FlexFlow { + +constexpr uint32_t kPagesize = 64; + +inline int ceilDiv(int const a, int const b) { + assert(b != 0 && "Attempting to divide by 0"); + assert(a >= 0 && b > 0 && "Expected non-negative numbers"); + return (a + b - 1) / b; +} + +inline int round_up_pages(int const num_elements) { + return ceilDiv(num_elements, kPagesize); +} + +#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...) 
\ + switch (head_dim) { \ + case 64: { \ + constexpr size_t HEAD_DIM = 64; \ + __VA_ARGS__ \ + break; \ + } \ + case 128: { \ + constexpr size_t HEAD_DIM = 128; \ + __VA_ARGS__ \ + break; \ + } \ + case 256: { \ + constexpr size_t HEAD_DIM = 256; \ + __VA_ARGS__ \ + break; \ + } \ + default: { \ + std::ostringstream err_msg; \ + err_msg << "Unsupported head_dim: " << head_dim; \ + throw std::invalid_argument(err_msg.str()); \ + } \ + } + +class AttentionMetaData { +public: + AttentionMetaData() { + num_q_heads_ = 0; + num_kv_heads_ = 0; + head_dim_ = 0; + q_indptr = nullptr; + kv_indptr = nullptr; + kv_indices = nullptr; + kv_last_page_len = nullptr; + qk_indptr = nullptr; + custom_mask = nullptr; + workspace = nullptr; + workspace_size = 0; + float_workspace = nullptr; + float_workspace_size = 0; + int_workspace = nullptr; + int_workspace_size = 0; + mem_size_ = 0; + enabled_ = false; + } + AttentionMetaData(AttentionMetaData const &rhs) { + num_q_heads_ = rhs.num_q_heads_; + num_kv_heads_ = rhs.num_kv_heads_; + head_dim_ = rhs.head_dim_; + q_indptr = rhs.q_indptr; + kv_indptr = rhs.kv_indptr; + kv_indices = rhs.kv_indices; + kv_last_page_len = rhs.kv_last_page_len; + qk_indptr = rhs.qk_indptr; + custom_mask = rhs.custom_mask; + workspace = rhs.workspace; + workspace_size = rhs.workspace_size; + float_workspace = rhs.float_workspace; + float_workspace_size = rhs.float_workspace_size; + int_workspace = rhs.int_workspace; + int_workspace_size = rhs.int_workspace_size; + mem_size_ = rhs.mem_size_; + enabled_ = rhs.enabled_; + decode_handler_collections = rhs.decode_handler_collections; + prompt_handler_collections = rhs.prompt_handler_collections; + } + + size_t mem_size() { + if (mem_size_ > 0) { + return mem_size_; + } + size_t batch_size = BatchConfig::max_requests_per_batch(); + size_t max_num_pages = round_up_pages(BatchConfig::max_sequence_length()); + size_t indices_size = std::max( + (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024); + size_t custom_mask_size = 0; + + float_workspace_size = 128 * 1024 * 1024; // 128 MB + int_workspace_size = 8 * 1024 * 1024; // 8 MB + workspace_size = + float_workspace_size + int_workspace_size; // float + int workspace + + mem_size_ = alignTo(sizeof(int32_t) * indices_size + + sizeof(uint8_t) * custom_mask_size + workspace_size, + 16); + return mem_size_; + } + + void assign_address(void *ptr, int size) { + if (ptr == nullptr) { + q_indptr = nullptr; + kv_indptr = nullptr; + kv_indices = nullptr; + kv_last_page_len = nullptr; + qk_indptr = nullptr; + custom_mask = nullptr; + workspace = nullptr; + float_workspace = nullptr; + int_workspace = nullptr; + return; + } + assert(size >= mem_size() && + "Insufficient memory size for attention metadata"); + size_t batch_size = BatchConfig::max_requests_per_batch(); + size_t max_num_pages = round_up_pages(BatchConfig::max_sequence_length()); + size_t indices_size = std::max( + (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024); + size_t custom_mask_size = 0; + + q_indptr = static_cast(ptr); + kv_indptr = q_indptr + batch_size + 1; + kv_indices = kv_indptr + batch_size + 1; + kv_last_page_len = kv_indices + max_num_pages * batch_size; + qk_indptr = kv_last_page_len + batch_size + 1; + custom_mask = static_cast(ptr) + sizeof(int32_t) * indices_size; + workspace = static_cast(static_cast(ptr) + + sizeof(int32_t) * indices_size + + sizeof(uint8_t) * custom_mask_size); + float_workspace = workspace; + int_workspace = static_cast(static_cast(workspace) + + 
float_workspace_size); + } + + void set_num_q_heads(uint32_t const num_q_heads) { + num_q_heads_ = num_q_heads; + } + void set_num_kv_heads(uint32_t const num_kv_heads) { + num_kv_heads_ = num_kv_heads; + } + void set_head_dim(uint32_t const head_dim) { + head_dim_ = head_dim; + } + uint32_t num_q_heads() const { + return num_q_heads_; + } + uint32_t num_kv_heads() const { + return num_kv_heads_; + } + uint32_t head_dim() const { + return head_dim_; + } + + void set_enabled(bool const enabled) { + enabled_ = enabled; + } + bool enabled() const { + return enabled_; + } + + uint32_t num_q_heads_; + uint32_t num_kv_heads_; + uint32_t head_dim_; + + int32_t *q_indptr; + int32_t *kv_indptr; + int32_t *kv_indices; + int32_t *kv_last_page_len; + int32_t *qk_indptr; + uint8_t *custom_mask; + void *workspace; + size_t workspace_size; + void *float_workspace; + size_t float_workspace_size; + void *int_workspace; + size_t int_workspace_size; + + size_t mem_size_; + + // batchsize -> handler + bool enabled_; + std::unordered_map decode_handler_collections; + std::unordered_map prompt_handler_collections; +}; +} // namespace FlexFlow + +#endif // _FLEXFLOW_ATTENTION_CONFIG_H_ \ No newline at end of file diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 27def0a36..6fe3ae20e 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -29,6 +29,10 @@ namespace FlexFlow { +inline int alignTo(int x, int y) { + return ((x + y - 1) / y) * y; +} + class InferenceResult; class BeamInferenceResult; @@ -58,19 +62,32 @@ class BatchConfig { static const RequestGuid INVALID_GUID = 0; using TokenId = int; BatchConfig(); + // includes both FWD and BWD finetuning requests int num_active_requests() const; + // returns number of inference and finetuning FWD tokens int num_active_tokens() const; + + // returns number of inference-only tokens + int num_inference_tokens() const; + int num_inference_requests() const; + + // return the index where the finetuning request would be stored (i.e. 
last + // slot of the batch) int finetuning_request_index() const; + // returns the number of finetuning FWD requests, or 0 if there is none int num_finetuning_fwd_requests() const; + int num_finetuning_fwd_tokens() const; int num_finetuning_bwd_requests() const; int num_finetuning_bwd_tokens() const; + bool peft_bwd_applies_to_this_layer(int layer) const; static int max_requests_per_batch(); static int max_tokens_per_batch(); static int max_verify_tokens_per_batch(); static int max_spec_tree_token_num(); static int max_sequence_length(); + friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc); void print() const; void save_to_file(std::string const &filename) const; @@ -111,6 +128,15 @@ class BatchConfig { int num_tokens_in_batch; int max_length; + // paged attention + static constexpr size_t request_guid_size = sizeof(RequestGuid); + static constexpr size_t alignment = 16; + static constexpr size_t padding_size = + (alignment - (sizeof(int) * 3 + request_guid_size) % alignment) % + alignment; + static constexpr size_t padding_length = padding_size / sizeof(int); + int padding[padding_length] = {}; // Padding for memory pointer alignment + // request id in batch config: int batch_config_request_id = -1; bool prompt_phase = false; diff --git a/include/flexflow/config.h b/include/flexflow/config.h index eed47dc9a..d9ba03ae2 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -16,6 +16,7 @@ #ifndef _FLEXFLOW_CONFIG_H_ #define _FLEXFLOW_CONFIG_H_ #include "ffconst.h" +#include "flexflow/attention_config.h" #include "flexflow/batch_config.h" #include "legion.h" #include @@ -87,16 +88,19 @@ struct CombinedBatchConfigMetaStruct { struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) - cudnnHandle_t dnn; - cublasHandle_t blas; + cudnnHandle_t dnn, peft_dnn; + cublasHandle_t blas, peft_blas; #else - miopenHandle_t dnn; - hipblasHandle_t blas; + miopenHandle_t dnn, peft_dnn; + hipblasHandle_t blas, peft_blas; #endif void *workSpace; size_t workSpaceSize; CombinedBatchConfigMetaStruct *batch_config_metadata; + // flashinfer + AttentionMetaData *incr_attention_metadata; + // request info + token info + topolopgy mask info size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct); void *offload_reserve_space; @@ -106,6 +110,7 @@ struct FFHandler { bool allowTensorOpMathConversion; #ifdef FF_USE_NCCL ncclComm_t ncclComm; + ncclComm_t ncclCommPeft; #endif }; diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 68cab9ce0..ae5b5014f 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -60,6 +60,7 @@ FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t); // FF_NEW_OPAQUE_TYPE(flexflow_lora_adam_optimizer_config_t); FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t); FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t); +FF_NEW_OPAQUE_TYPE(flexflow_page_manager_t); // ----------------------------------------------------------------------- // FFConfig @@ -444,78 +445,6 @@ flexflow_tensor_t flexflow_model_add_multihead_attention( char const *name); flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( - flexflow_model_t handle_, - const flexflow_tensor_t input_, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool add_zero_attn, - enum DataType data_type, - flexflow_initializer_t kernel_initializer_, - bool apply_rotary_embedding, - float rope_theta, - char const *rope_type, - float rope_factor, - float low_freq_factor, - float high_freq_factor, - int 
original_max_position_embeddings, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name); - -flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( - flexflow_model_t handle_, - const flexflow_tensor_t input_, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool add_zero_attn, - enum DataType data_type, - flexflow_initializer_t kernel_initializer_, - bool apply_rotary_embedding, - float rope_theta, - char const *rope_type, - float rope_factor, - float low_freq_factor, - float high_freq_factor, - int original_max_position_embeddings, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name); - -flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( - flexflow_model_t handle_, - const flexflow_tensor_t input_, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool add_zero_attn, - enum DataType data_type, - flexflow_initializer_t kernel_initializer_, - bool apply_rotary_embedding, - float rope_theta, - char const *rope_type, - float rope_factor, - float low_freq_factor, - float high_freq_factor, - int original_max_position_embeddings, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name); - -flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, @@ -540,7 +469,7 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( bool position_bias, char const *name); -flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( +flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, @@ -565,7 +494,7 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( bool position_bias, char const *name); -flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( +flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, @@ -656,6 +585,13 @@ flexflow_perf_metrics_t void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); +void flexflow_model_set_num_kv_cache_pages(flexflow_model_t handle_, + int num_kv_cache_pages); + +int flexflow_compute_num_kv_cache_pages_needed(int max_seq_len, + int batch_size, + bool is_spec); + void flexflow_model_generate(flexflow_model_t handle_, int num_requests, enum RequestType *request_types, @@ -1021,12 +957,21 @@ flexflow_request_manager_t flexflow_request_manager_get_request_manager(void); void flexflow_request_manager_set_max_requests_per_batch( flexflow_request_manager_t handle_, int max_num_requests); +int flexflow_request_manager_get_max_requests_per_batch( + flexflow_request_manager_t handle_); + void flexflow_request_manager_set_max_tokens_per_batch( flexflow_request_manager_t handle_, int max_num_tokens); +int flexflow_request_manager_get_max_tokens_per_batch( + flexflow_request_manager_t handle_); + void flexflow_request_manager_set_max_spec_tree_token_num( flexflow_request_manager_t handle_, int max_num_tokens); +int flexflow_request_manager_get_max_spec_tree_token_num( + flexflow_request_manager_t handle_); + void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length); @@ -1065,6 +1010,17 @@ void 
flexflow_request_manager_start_background_server( void flexflow_request_manager_terminate_background_server( flexflow_request_manager_t handle_); +// ----------------------------------------------------------------------- +// PageManager +// ----------------------------------------------------------------------- + +flexflow_page_manager_t + flexflow_page_manager_get_page_manager(int num_total_pages); + +int flexflow_page_manager_get_tot_num_pages(flexflow_page_manager_t handle_); + +int flexflow_page_manager_get_tokens_per_page(flexflow_page_manager_t handle_); + // ----------------------------------------------------------------------- // InferenceManager // ----------------------------------------------------------------------- diff --git a/include/flexflow/machine_view.h b/include/flexflow/machine_view.h index 807b0c9c0..4224f8474 100644 --- a/include/flexflow/machine_view.h +++ b/include/flexflow/machine_view.h @@ -96,6 +96,7 @@ struct ParallelConfig { int device_ids[MAX_NUM_WORKERS]; #ifdef FF_USE_NCCL ncclComm_t nccl_comms[MAX_NUM_WORKERS]; + ncclComm_t nccl_comms_peft[MAX_NUM_WORKERS]; #endif }; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index a0daf3e51..874211172 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -736,54 +736,6 @@ class FFModel { Initializer *kernel_initializer = NULL, char const *name = NULL); Tensor inc_multihead_self_attention( - const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor spec_inc_multihead_self_attention( - const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor inc_multihead_self_attention_verify( - const Tensor input, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool add_zero_attn = false, - DataType data_type = DT_NONE, - Initializer *kernel_initializer = NULL, - RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(), - bool scaling_query = false, - float scaling_factor = 1.0f, - bool qk_prod_scaling = true, - bool position_bias = false, - char const *name = NULL); - Tensor inc_multiquery_self_attention( const Tensor input, int embed_dim, int num_q_heads, @@ -800,7 +752,7 @@ class FFModel { bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); - Tensor spec_inc_multiquery_self_attention( + Tensor spec_inc_multihead_self_attention( const Tensor input, int embed_dim, int num_q_heads, @@ -817,7 +769,7 @@ class FFModel { bool qk_prod_scaling = true, bool position_bias = false, char const *name = NULL); - Tensor inc_multiquery_self_attention_verify( + Tensor inc_multihead_self_attention_verify( const Tensor input, int embed_dim, int num_q_heads, @@ -1095,6 +1047,11 @@ class FFModel { CompMode comp_mode = COMP_MODE_TRAINING); void compile_inference(); void 
set_transformer_layer_id(int id); + + // paged attention + void set_num_kv_cache_pages(int num_pages); + int get_num_kv_cache_pages() const; + void set_position_offset(int offset); void graph_optimize(size_t budget, bool only_data_parallel, @@ -1114,6 +1071,7 @@ class FFModel { bool use_propagation) const; #ifdef FF_USE_NCCL ncclComm_t *find_nccl_comms(MachineView const &view) const; + ncclComm_t *find_nccl_comms_peft(MachineView const &view) const; void finish_nccl_comms(); #endif #ifdef FF_USE_PROPAGATE @@ -1158,6 +1116,10 @@ class FFModel { size_t op_global_guid, layer_global_guid, peft_model_global_guid; size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid; size_t current_transformer_layer_id; + + // paged attention + int num_kv_cache_pages; + // positional embedding start offset int position_offset; FFConfig config; @@ -1305,6 +1267,7 @@ class FFModel { // inference_debugging mode. #ifdef FF_USE_NCCL std::unordered_map view_hash_to_nccl_comms; + std::unordered_map view_hash_to_nccl_comms_peft; #endif private: bool debug; diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 7206d1a15..8f24ad07f 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -45,6 +45,7 @@ class IncMultiHeadSelfAttention : public Op { DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, + int _num_kv_cache_pages, char const *name); IncMultiHeadSelfAttention(FFModel &model, ParallelTensor const _input, @@ -63,6 +64,7 @@ class IncMultiHeadSelfAttention : public Op { DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, + int _num_kv_cache_pages, char const *name); IncMultiHeadSelfAttention(FFModel &model, IncMultiHeadSelfAttention const &other, @@ -126,11 +128,11 @@ class IncMultiHeadSelfAttention : public Op { Params get_params() const; public: - int num_q_heads, num_kv_heads, tensor_parallelism_degree; + int num_q_heads, num_kv_heads, tensor_parallelism_degree, num_kv_cache_pages; float dropout, scaling_factor; bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; RotaryEmbeddingMeta rotary_embedding_meta; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; DataType quantization_type; bool offload; @@ -141,15 +143,11 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { IncMultiHeadSelfAttentionMeta(FFHandler handler, IncMultiHeadSelfAttention const *attn, MemoryAllocator &gpu_mem_allocator, - int num_samples, int _num_q_heads, int _num_kv_heads); IncMultiHeadSelfAttentionMeta(FFHandler handler, InferenceMode infer_mode, Op const *attn, - int _qSize, - int _kSize, - int _vSize, int _qProjSize, int _kProjSize, int _vProjSize, @@ -160,11 +158,11 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { bool _position_bias, float _scaling_factor, MemoryAllocator &gpu_mem_allocator, - int num_samples, int _global_num_q_heads, int _global_num_kv_heads, int _num_q_heads, int _num_kv_heads, + int _num_kv_cache_pages, DataType _quantization_type, bool _offload); ~IncMultiHeadSelfAttentionMeta(void); @@ -172,40 +170,56 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { public: Realm::RegionInstance reserveInst; size_t reserveSpaceSize; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; - int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads, - hidden_size; + int 
qProjSize, kProjSize, vProjSize, oProjSize; + int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads; RotaryEmbeddingMeta *rotary_embedding_meta; bool *scaling_query; bool *qk_prod_scaling; bool *position_bias; float scaling_factor; - void *devQKVProjArray, *keyCache, *valueCache; + DataType quantization_type; + bool offload; + int num_kv_cache_pages; + + // GPU memory sizes (or num elements) + size_t gqa_ptr_array_size = 0; + size_t key_cache_size = 0, value_cache_size = 0; // numel + size_t peft_key_cache_size = 0, peft_value_cache_size = 0; // numel + size_t qkv_max_proj_size, qkv_max_proj_size_bwd = 0; // numel + size_t query_tmp_size = 0, output_tmp_size = 0; // numel + size_t complex_size = 0, complex_size_bwd = 0; // numel + size_t qk_prod_size = 0; // numel + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0, + peft_token_infos_size = 0; + + void *devQKVProjArray, *devQKVProjArrayBWD; + void *kvCache, *keyCache, *valueCache; + void *keyCachePeft, *valueCachePeft; void *qk_prods, *qk_prods_softmax; - void *attn_heads; + // flashinfer + void *queryTmp, *outputTmp; + BatchConfig::PerTokenInfo *token_infos; - BatchConfig::PerTokenInfo *peft_token_infos; - BatchConfig::PerTokenInfo *peft_token_infos_device; BatchConfig::PerRequestInfo *request_infos; - DataType quantization_type; - bool offload; + bool *request_completed; + #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t qk_tensor; - cuFloatComplex *complex_input; + cuFloatComplex *complex_input, *complex_input_bwd; #elif defined(FF_USE_HIP_ROCM) miopenTensorDescriptor_t qk_tensor; - // typedef hipFloatComplex attFloatComplex; - hipFloatComplex *complex_input; + hipFloatComplex *complex_input, *complex_input_bwd; #endif + // GQA void **d_A_array, **d_B_array, **d_C_array; - void **d_A_array2, **d_B_array2, **d_C_array2; - size_t gqa_ptr_array_size; + // PEFT specific fields + void **d_A_array2, **d_B_array2, **d_C_array2; void *softmax_activation_buffer; void *query_activation_buffer; - size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0, - peft_token_infos_size = 0; + BatchConfig::PerTokenInfo *peft_token_infos = nullptr; + BatchConfig::PerTokenInfo *peft_token_infos_device; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/inc_multihead_self_attention_params.h b/include/flexflow/ops/inc_multihead_self_attention_params.h index 9b0a26e5d..f9fdbe5d0 100644 --- a/include/flexflow/ops/inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/inc_multihead_self_attention_params.h @@ -11,7 +11,7 @@ namespace FlexFlow { struct IncMultiHeadSelfAttentionParams { LayerID layer_guid; int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, - tensor_parallelism_degree; + tensor_parallelism_degree, num_kv_cache_pages; float dropout, scaling_factor; bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; RotaryEmbeddingMeta rotary_embedding_meta; diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h index 925501846..dbac7df10 100644 --- a/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_kernels.h @@ -14,16 +14,76 @@ namespace FlexFlow { namespace Kernels { namespace IncMultiHeadAttention { +// flashinfer +// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim] +__device__ __forceinline__ size_t + get_k_entry_offset_verify(int const token_idx, + int 
const page_idx, + int const num_heads, + int const head_dim) { + size_t index = ((page_idx)*kPagesize * 2 + (token_idx % kPagesize)) * + head_dim * num_heads; + return index; +} + +// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim] +__device__ __forceinline__ size_t + get_v_entry_offset_verify(int const token_idx, + int const page_idx, + int const num_heads, + int const head_dim) { + size_t index = + ((page_idx)*kPagesize * 2 + kPagesize + (token_idx % kPagesize)) * + head_dim * num_heads; + return index; +} + +// // kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim] +__device__ __forceinline__ size_t get_k_entry_offset(int const req_idx, + int const token_idx, + int const max_num_pages, + int const num_heads, + int const head_dim) { + return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 + + token_idx % kPagesize) * /* page slot index */ + num_heads * + head_dim; +} + +// kv layout: [num_pages, 2, page_size, num_kv_heads, head_dim] +__device__ __forceinline__ size_t get_v_entry_offset(int const req_idx, + int const token_idx, + int const max_num_pages, + int const num_heads, + int const head_dim) { + return ((req_idx * max_num_pages + token_idx / kPagesize) * kPagesize * 2 + + kPagesize + token_idx % kPagesize) * /* page slot index */ + num_heads * + head_dim; +} +// [For the tokens in batch] +// Update the kv cache, and compact the q array. +// Source: qkv projeciton array of tokens in the batch. +// Destination: q&kv ptr took by the attention kernel. +// Note that the q&k here are the value after applying with position encoding. +void update_qkv_in_batch(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + ffStream_t stream); +template +void update_kv_cache_kernel_flashinfer(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + ffStream_t stream); +template +void produce_output(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + ffStream_t stream); + template void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, ffStream_t stream); -template -void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - DT *output_ptr, - ffStream_t stream); template void apply_scaling_and_rotary(IncMultiHeadSelfAttentionMeta const *m, @@ -66,7 +126,7 @@ void run_batched_matmul(IncMultiHeadSelfAttentionMeta const *meta, int batchCount, cudaDataType computeType, cublasGemmAlgo_t algo, - cudaStream_t stream, + ffStream_t stream, int batch_ratio_a = 1, int batch_ratio_b = 1, int batch_ratio_c = 1, diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention.h b/include/flexflow/ops/spec_inc_multihead_self_attention.h index 155132a7f..3a5b7ec5f 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention.h @@ -39,6 +39,7 @@ class SpecIncMultiHeadSelfAttention : public Op { float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, + int _num_kv_cache_pages, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, const ParallelTensor _input, @@ -54,6 +55,7 @@ class SpecIncMultiHeadSelfAttention : public Op { float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, + int _num_kv_cache_pages, char const *name); SpecIncMultiHeadSelfAttention(FFModel &model, SpecIncMultiHeadSelfAttention const &other, @@ -107,11 +109,11 @@ class SpecIncMultiHeadSelfAttention : public Op { Params get_params() const; public: - int 
num_q_heads, num_kv_heads, tensor_parallelism_degree; + int num_q_heads, num_kv_heads, tensor_parallelism_degree, num_kv_cache_pages; float dropout, scaling_factor; bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; RotaryEmbeddingMeta rotary_embedding_meta; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; }; @@ -120,7 +122,6 @@ class SpecIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { SpecIncMultiHeadSelfAttentionMeta(FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, MemoryAllocator &gpu_mem_allocator, - int num_samples, int _num_q_heads, int _num_kv_heads); ~SpecIncMultiHeadSelfAttentionMeta(void); diff --git a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h index a0ae3fc4f..5c5df3a53 100644 --- a/include/flexflow/ops/spec_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/spec_inc_multihead_self_attention_params.h @@ -12,6 +12,7 @@ struct SpecIncMultiHeadSelfAttentionParams { int embed_dim, num_q_heads, num_kv_heads, kdim, vdim; float dropout, scaling_factor; bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; + int num_kv_cache_pages; RotaryEmbeddingMeta rotary_embedding_meta; char name[MAX_OPNAME]; bool is_valid(ParallelTensorShape const &) const; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 9755e62d4..71645eb28 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -42,6 +42,7 @@ class TreeIncMultiHeadSelfAttention : public Op { DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, + int _num_kv_cache_pages, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, const ParallelTensor _input, @@ -60,6 +61,7 @@ class TreeIncMultiHeadSelfAttention : public Op { DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, + int _num_kv_cache_pages, char const *name); TreeIncMultiHeadSelfAttention(FFModel &model, TreeIncMultiHeadSelfAttention const &other, @@ -109,11 +111,11 @@ class TreeIncMultiHeadSelfAttention : public Op { Params get_params() const; public: - int num_q_heads, num_kv_heads, tensor_parallelism_degree; + int num_q_heads, num_kv_heads, tensor_parallelism_degree, num_kv_cache_pages; float dropout, scaling_factor; bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; RotaryEmbeddingMeta rotary_embedding_meta; - int qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize; + int qProjSize, kProjSize, vProjSize, oProjSize; int qoSeqLength, kvSeqLength; DataType quantization_type; bool offload; @@ -124,7 +126,6 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { TreeIncMultiHeadSelfAttentionMeta(FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, MemoryAllocator &gpu_mem_allocator, - int num_samples, int _num_q_heads, int _num_kv_heads); ~TreeIncMultiHeadSelfAttentionMeta(void); diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h index b49db2c10..8309b7915 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention_params.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention_params.h @@ -10,7 +10,7 @@ namespace FlexFlow { struct 
TreeIncMultiHeadSelfAttentionParams { LayerID layer_guid; int embed_dim, num_q_heads, kdim, vdim, num_kv_heads, - tensor_parallelism_degree; + tensor_parallelism_degree, num_kv_cache_pages; float dropout, scaling_factor; bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; RotaryEmbeddingMeta rotary_embedding_meta; diff --git a/include/flexflow/page_manager.h b/include/flexflow/page_manager.h new file mode 100644 index 000000000..1c19bd107 --- /dev/null +++ b/include/flexflow/page_manager.h @@ -0,0 +1,93 @@ +#pragma once + +#include "flexflow/attention_config.h" +#include "flexflow/batch_config.h" +#include "flexflow/config.h" +#include "flexflow/inference.h" +#include "flexflow/model.h" +#include "flexflow/utils/file_loader.h" +#include +#include +#include +#include + +namespace FlexFlow { + +using RequestGuid = BatchConfig::RequestGuid; +using TokenId = BatchConfig::TokenId; + +/* + * @class PageManager + * @brief A wrapper class that manages the kv cache allocation status + * notice that all the layers of model will share the same page manager because + * the position of kv cache will be the same + */ +class PageManager { +public: + // Get the singleton instance of the PageManager as it will be shared in + // multiple places + static PageManager *get_page_manager(); + static PageManager *get_page_manager(int num_total_pages); + PageManager(int tot_num_pages_); + + int get_tot_num_pages() const; + int get_tokens_per_page() const; + + // returns the number of pages used by the request (excluding those allocated + // but not used yet) + int get_num_pages_used_by_req(RequestGuid const &request_guid) const; + // returns the indices of the pages in use by the request (excluding those + // allocated but not used yet) + std::vector get_req_page_indices(RequestGuid const &request_guid) const; + int get_num_tokens_in_last_used_page(RequestGuid const &request_guid) const; + + // check if there is enough space for request with given total number of + // prompt/evicted tokens even if the tokens will be run in multiple steps + // (chunked prefills) + bool enough_space_to_add_request(int num_prompt_tokens, + int num_prompt_tokens_in_first_batch, + int max_tokens_per_batch) const; + // check if there is enough space to append new tokens to the existing + // requests + bool enough_space_to_append_tokens( + std::vector> tokens_per_request) const; + void add_request(RequestGuid const &guid, int num_tokens); + void remove_request(RequestGuid const &request_guid); + RequestGuid evict_request_fifo(); + // add tokens to an existing request + void append_tokens(RequestGuid const &guid, int num_tokens); + + struct PerRequestPageInfo { + RequestGuid guid; + // pages (ordered logically by token depth) assigned to each request + std::vector page_indices; + // number of pages (from those assigned to the request) that are already + // filled with tokens. Of these, only the last one is allowed to be + // partially filled. The others should be full. + int num_used_pages; + // slots in use in each last page of each request (all previous pages must + // be full) + int num_tokens_in_last_used_page; + }; + + friend std::ostream &operator<<(std::ostream &os, PageManager const &pm); + +private: + // requests ordered by arrival. 
We use this order for FIFO eviction + std::deque active_requests; + // request info keyed by guid + std::unordered_map requests_info; + // pool of available pages + std::set free_pages; + + int tot_num_pages; +}; + +// returns number of kv cache pages needed to guarantee no evictions if all +// requests are up to max_seq_len if is_spec==true, it also initializes the page +// manager +int compute_num_kv_cache_pages_needed(int max_seq_len, + int batch_size, + bool is_spec); + +}; // namespace FlexFlow diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index 98f8dc93a..d0b391720 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -18,6 +18,7 @@ #include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/model.h" +#include "flexflow/page_manager.h" #include "flexflow/utils/file_loader.h" #include #include @@ -88,6 +89,7 @@ struct Request { RUNNING = 102, // running inference COMPLETED = 103, // finished and verified FINISHING = 104, // finishing request, but not yet verified + EVICTED = 105, // request evicted from kv cache, put back in queue }; enum FinetuningStatus { FORWARD_PHASE = 201, @@ -183,6 +185,7 @@ class RequestManager { int get_max_verify_tokens_per_batch(); int get_max_sequence_length(); void set_max_sequence_length(int max_seq_length); + void push_spec_infer_tree_width(int tree_width); void set_enable_peft_finetuning(bool enable_peft_finetuning_); void set_inference_finished(bool finished = true); @@ -235,6 +238,9 @@ class RequestManager { // Methods for preparing next batches bool is_eos_token(int token_id); bool inf_req_completed(BatchConfig const &old_bc, int i); + bool inf_req_evicted(BatchConfig const &old_bc, int i); + bool enough_space_to_add_request(BatchConfig const &new_bc, + int num_concurrent_inf_adapters); void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); void add_peft_config_to_request_info(BatchConfig &bc, int req_idx, @@ -243,6 +249,8 @@ class RequestManager { void process_inf_req_progress(BatchConfig const &old_fwd_bc, InferenceResult const &result); void handle_completed_inf_req(BatchConfig const &old_bc, int i); + void evict_requests_if_needed(BatchConfig const &old_bc, + int inference_batch_size); void add_continuing_inf_req_to_new_batch(BatchConfig &new_bc, BatchConfig const &old_bc, int &num_active_req, @@ -387,6 +395,7 @@ class RequestManager { int max_fwd_finetuning_tokens_per_batch; int max_spec_tree_token_num; int max_sequence_length; + Status request_manager_status; // peft @@ -410,7 +419,7 @@ class RequestManager { std::vector eos_token_ids; bool old_llama_tokenizer = false; std::string output_filepath; - std::queue pending_infr_request_queue; + std::deque pending_infr_request_queue; std::queue pending_peft_request_queue; std::unordered_map all_requests; std::unordered_map request_generation_results; diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index f148d440e..0f64cb9a7 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -47,7 +47,8 @@ void parse_input_args(char **argv, float &topp, int &max_requests_per_batch, int &max_tokens_per_batch, - int &max_sequence_length) { + int &max_sequence_length, + int &max_length) { for (int i = 1; i < argc; i++) { // llm model type if (!strcmp(argv[i], "-llm-model")) { @@ -101,10 +102,16 @@ void parse_input_args(char **argv, max_tokens_per_batch = std::stoi(argv[++i]); continue; } + // max 
allowed sequence length if (!strcmp(argv[i], "--max-sequence-length")) { max_sequence_length = std::stoi(argv[++i]); continue; } + // max length before stopping if we haven't reached the EOS + if (!strcmp(argv[i], "--max-length")) { + max_length = std::stoi(argv[++i]); + continue; + } } if (paths.cache_folder_path.empty()) { char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); @@ -133,9 +140,10 @@ void FlexFlow::top_level_task(Task const *task, bool do_sample = false; float temperature = 0.0f; float topp = 0.0f; - int max_requests_per_batch = 8; - int max_tokens_per_batch = 128; + int max_requests_per_batch = 4; + int max_tokens_per_batch = 64; int max_sequence_length = 256; + int max_length = 128; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -151,7 +159,8 @@ void FlexFlow::top_level_task(Task const *task, topp, max_requests_per_batch, max_tokens_per_batch, - max_sequence_length); + max_sequence_length, + max_length); assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * ffconfig.pipeline_parallelism_degree == @@ -227,6 +236,9 @@ void FlexFlow::top_level_task(Task const *task, rm->register_output_filepath(file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); + // set amount of kv cache needed + model.set_num_kv_cache_pages(compute_num_kv_cache_pages_needed( + max_sequence_length, max_requests_per_batch, false)); if (model_type == ModelType::LLAMA) { LLAMA::create_llama_model(model, config_filepath, @@ -282,7 +294,7 @@ void FlexFlow::top_level_task(Task const *task, printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); Request inference_req; inference_req.prompt = text; - inference_req.max_length = 128; + inference_req.max_length = max_length; requests.push_back(inference_req); total_num_requests++; } diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index b4f961b00..34c726afd 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -34,7 +34,10 @@ void FALCON::create_falcon_model(FFModel &ff, "divisible by the tensor parallelism degree"); } - std::unordered_map weights_layers; + assert(falcon_config.hidden_size % falcon_config.n_head == 0 && + "Hidden size not divisible by number of attention heads"); + int head_dim = falcon_config.hidden_size / falcon_config.n_head; + int tot_num_heads = falcon_config.n_head + 2 * falcon_config.n_head_kv; Tensor input; { @@ -100,9 +103,7 @@ void FALCON::create_falcon_model(FFModel &ff, qkv_proj = ff.dense( att_norm, - falcon_config.hidden_size * - 3, // q, k, v. need to change if want to remove replication. 
- // (q_heads + 2 * kv_heads) * proj_size + head_dim * tot_num_heads, AC_MODE_NONE, false, // seems like it does not use bias DT_NONE, // what is this @@ -117,13 +118,13 @@ void FALCON::create_falcon_model(FFModel &ff, switch (mode) { case BEAM_SEARCH_MODE: { - o_proj = ff.spec_inc_multiquery_self_attention( + o_proj = ff.spec_inc_multihead_self_attention( qkv_proj, falcon_config.hidden_size, falcon_config.n_head, falcon_config.n_head_kv, - falcon_config.hidden_size / falcon_config.n_head, - falcon_config.hidden_size / falcon_config.n_head, + head_dim, + head_dim, 0.0f, /*dropout*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ @@ -140,13 +141,13 @@ void FALCON::create_falcon_model(FFModel &ff, } case TREE_VERIFY_MODE: { - o_proj = ff.inc_multiquery_self_attention_verify( + o_proj = ff.inc_multihead_self_attention_verify( qkv_proj, falcon_config.hidden_size, falcon_config.n_head, falcon_config.n_head_kv, - falcon_config.hidden_size / falcon_config.n_head, - falcon_config.hidden_size / falcon_config.n_head, + head_dim, + head_dim, 0.0f, /*dropout*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ @@ -163,13 +164,13 @@ void FALCON::create_falcon_model(FFModel &ff, } case INC_DECODING_MODE: { - o_proj = ff.inc_multiquery_self_attention( + o_proj = ff.inc_multihead_self_attention( qkv_proj, falcon_config.hidden_size, falcon_config.n_head, falcon_config.n_head_kv, - falcon_config.hidden_size / falcon_config.n_head, - falcon_config.hidden_size / falcon_config.n_head, + head_dim, + head_dim, 0.0f, /*dropout*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ @@ -283,7 +284,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.n_head, falcon_config.n_head_kv, falcon_config.hidden_size, - falcon_config.hidden_size / falcon_config.n_head, + head_dim, ff.config.tensor_parallelism_degree, use_full_precision); diff --git a/inference/models/falcon.h b/inference/models/falcon.h index 565d7e541..996cfb838 100644 --- a/inference/models/falcon.h +++ b/inference/models/falcon.h @@ -16,7 +16,9 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" +#include "flexflow/page_manager.h" #include "flexflow/request_manager.h" #include #include diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 7b4a14b47..afee1e307 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -37,17 +37,18 @@ void LLAMA::create_llama_model(FFModel &ff, "divisible by the tensor parallelism degree"); } - std::unordered_map weights_layers; + assert(llama_config.hidden_size % llama_config.num_attention_heads == 0 && + "Hidden size not divisible by number of attention heads"); + int head_dim = llama_config.hidden_size / llama_config.num_attention_heads; + int tot_num_heads = + llama_config.num_attention_heads + 2 * llama_config.num_key_value_heads; - Tensor input; - { - int const token_dims[] = { - (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) - ? BatchConfig::max_verify_tokens_per_batch() - : BatchConfig::max_tokens_per_batch(), - 1}; - input = ff.create_tensor<2>(token_dims, DT_INT32); - } + int const token_dims[] = { + (mode == TREE_VERIFY_MODE || mode == BEAM_SEARCH_MODE) + ? 
BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(), + 1}; + Tensor input = ff.create_tensor<2>(token_dims, DT_INT32); Initializer *embed_init = new UniformInitializer(std::rand(), 0, 0); @@ -91,11 +92,10 @@ void LLAMA::create_llama_model(FFModel &ff, token = token_att_norm[0]; att_norm = token_att_norm[1]; } + Tensor qkv_proj = ff.dense( att_norm, - llama_config.hidden_size * - 3, // q, k, v. need to change if want to remove replication. - // (q_heads + 2 * kv_heads) * proj_size + head_dim * tot_num_heads, AC_MODE_NONE, false, // seems like llama does not use bias DT_NONE, // what is this @@ -110,13 +110,13 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multiquery_self_attention( + mha = ff.spec_inc_multihead_self_attention( qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.num_key_value_heads, - llama_config.hidden_size / llama_config.num_attention_heads, - llama_config.hidden_size / llama_config.num_attention_heads, + head_dim, + head_dim, 0.0f, /*dropout*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ @@ -132,13 +132,13 @@ void LLAMA::create_llama_model(FFModel &ff, break; } case TREE_VERIFY_MODE: { - mha = ff.inc_multiquery_self_attention_verify( + mha = ff.inc_multihead_self_attention_verify( qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.num_key_value_heads, - llama_config.hidden_size / llama_config.num_attention_heads, - llama_config.hidden_size / llama_config.num_attention_heads, + head_dim, + head_dim, 0.0f, /*dropout*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ @@ -154,13 +154,13 @@ void LLAMA::create_llama_model(FFModel &ff, break; } case INC_DECODING_MODE: { - mha = ff.inc_multiquery_self_attention( + mha = ff.inc_multihead_self_attention( qkv_proj, llama_config.hidden_size, llama_config.num_attention_heads, llama_config.num_key_value_heads, - llama_config.hidden_size / llama_config.num_attention_heads, - llama_config.hidden_size / llama_config.num_attention_heads, + head_dim, + head_dim, 0.0f, /*dropout*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ @@ -302,15 +302,15 @@ void LLAMA::create_llama_model(FFModel &ff, ff.add_lora_layers(target_modules); } - FileDataLoader *fileloader = new FileDataLoader( - "", - weight_file_path, - llama_config.num_attention_heads, - llama_config.num_key_value_heads, - llama_config.hidden_size, - llama_config.hidden_size / llama_config.num_attention_heads, - ff.config.tensor_parallelism_degree, - use_full_precision); + FileDataLoader *fileloader = + new FileDataLoader("", + weight_file_path, + llama_config.num_attention_heads, + llama_config.num_key_value_heads, + llama_config.hidden_size, + head_dim, + ff.config.tensor_parallelism_degree, + use_full_precision); InferenceManager *im = InferenceManager::get_inference_manager(); im->register_model_weights_loader(&ff, fileloader); diff --git a/inference/models/llama.h b/inference/models/llama.h index 853a51a99..e74f8d52b 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -16,7 +16,9 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" +#include "flexflow/page_manager.h" #include "flexflow/request_manager.h" #include #include diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 6807266ef..6ad74cb2b 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -35,8 +35,6 @@ void 
MPT::create_mpt_model(FFModel &ff, "divisible by the tensor parallelism degree"); } - std::unordered_map weights_layers; - //------------------------------ build the model -------------------------- Tensor input; { @@ -115,6 +113,7 @@ void MPT::create_mpt_model(FFModel &ff, qkv_proj, mpt_config.hidden_size, mpt_config.n_heads, + mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, 0.0f, @@ -137,6 +136,7 @@ void MPT::create_mpt_model(FFModel &ff, qkv_proj, mpt_config.hidden_size, mpt_config.n_heads, + mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, 0.0f, @@ -159,6 +159,7 @@ void MPT::create_mpt_model(FFModel &ff, qkv_proj, mpt_config.hidden_size, mpt_config.n_heads, + mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, mpt_config.hidden_size / mpt_config.n_heads, 0.0f, diff --git a/inference/models/mpt.h b/inference/models/mpt.h index 3001420ad..a3b4c663f 100644 --- a/inference/models/mpt.h +++ b/inference/models/mpt.h @@ -16,7 +16,9 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" +#include "flexflow/page_manager.h" #include "flexflow/request_manager.h" #include #include diff --git a/inference/models/opt.cc b/inference/models/opt.cc index cb3d5290c..3870be225 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -35,8 +35,6 @@ void OPT::create_opt_model(FFModel &ff, "divisible by the tensor parallelism degree"); } - std::unordered_map weights_layers; - //------------------------------ build the model -------------------------- Tensor input; Tensor position_input; @@ -124,6 +122,7 @@ void OPT::create_opt_model(FFModel &ff, qkv_proj, opt_config.hidden_size, opt_config.num_attention_heads, + opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, /*dropout*/ @@ -146,6 +145,7 @@ void OPT::create_opt_model(FFModel &ff, qkv_proj, opt_config.hidden_size, opt_config.num_attention_heads, + opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, /*dropout*/ @@ -168,6 +168,7 @@ void OPT::create_opt_model(FFModel &ff, qkv_proj, opt_config.hidden_size, opt_config.num_attention_heads, + opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, opt_config.hidden_size / opt_config.num_attention_heads, 0.0f, /*dropout*/ diff --git a/inference/models/opt.h b/inference/models/opt.h index 8b85f81aa..73a76b917 100644 --- a/inference/models/opt.h +++ b/inference/models/opt.h @@ -16,7 +16,9 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" +#include "flexflow/page_manager.h" #include "flexflow/request_manager.h" #include #include @@ -35,11 +37,11 @@ class OPT { config_file >> model_config; do_layer_norm_before = model_config["do_layer_norm_before"]; dropout = model_config["dropout"]; - enable_bias = model_config["enable_bias"]; + enable_bias = model_config.value("enable_bias", true); ffn_dim = model_config["ffn_dim"]; hidden_size = model_config["hidden_size"]; layer_norm_elementwise_affine = - model_config["layer_norm_elementwise_affine"]; + model_config.value("layer_norm_elementwise_affine", true); max_position_embeddings = model_config["max_position_embeddings"]; 
num_attention_heads = model_config["num_attention_heads"]; num_hidden_layers = model_config["num_hidden_layers"]; diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 3dd61be98..9006867ae 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -40,7 +40,14 @@ void STARCODER::create_starcoder_model( "divisible by the tensor parallelism degree"); } - std::unordered_map weights_layers; + assert(startcoder_config.hidden_size % + startcoder_config.num_attention_heads == + 0 && + "Hidden size not divisible by number of attention heads"); + int head_dim = + startcoder_config.hidden_size / startcoder_config.num_attention_heads; + int tot_num_heads = startcoder_config.num_attention_heads + 2 * 1; + std::vector axes = {0}; Tensor input; @@ -104,9 +111,7 @@ void STARCODER::create_starcoder_model( Tensor qkv_proj = ff.dense( ln_1, - startcoder_config.hidden_size * - 3, // q, k, v. need to change if want to remove replication. - // (q_heads + 2 * kv_heads) * proj_size + head_dim * tot_num_heads, AC_MODE_NONE, false, // seems like it does not use bias DT_NONE, // what is this @@ -122,15 +127,13 @@ void STARCODER::create_starcoder_model( Tensor o_proj; switch (mode) { case INC_DECODING_MODE: { - o_proj = ff.inc_multiquery_self_attention( + o_proj = ff.inc_multihead_self_attention( qkv_proj, startcoder_config.hidden_size, startcoder_config.num_attention_heads, 1, - startcoder_config.hidden_size / - startcoder_config.num_attention_heads, - startcoder_config.hidden_size / - startcoder_config.num_attention_heads, + head_dim, + head_dim, startcoder_config.dropout_p, /*dropout*/ false, /*add_zero_attn*/ DT_NONE, /*data_type*/ @@ -261,15 +264,15 @@ void STARCODER::create_starcoder_model( } InferenceManager *im = InferenceManager::get_inference_manager(); - FileDataLoader *fileloader = new FileDataLoader( - "", - weight_file_path, - startcoder_config.num_attention_heads, - 1, - startcoder_config.hidden_size, - startcoder_config.hidden_size / startcoder_config.num_attention_heads, - ff.config.tensor_parallelism_degree, - use_full_precision); + FileDataLoader *fileloader = + new FileDataLoader("", + weight_file_path, + startcoder_config.num_attention_heads, + 1, + startcoder_config.hidden_size, + head_dim, + ff.config.tensor_parallelism_degree, + use_full_precision); im->register_model_weights_loader(&ff, fileloader); } diff --git a/inference/models/starcoder.h b/inference/models/starcoder.h index 7ff6f3377..89897652f 100644 --- a/inference/models/starcoder.h +++ b/inference/models/starcoder.h @@ -16,7 +16,9 @@ // #include "file_loader.h" #include "flexflow/batch_config.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/inference.h" +#include "flexflow/page_manager.h" #include "flexflow/request_manager.h" #include #include diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc index 7dc717ddb..6197d1432 100644 --- a/inference/peft/peft.cc +++ b/inference/peft/peft.cc @@ -379,6 +379,10 @@ void FlexFlow::top_level_task(Task const *task, } else { assert(false && "unknow model type"); } + + model.set_num_kv_cache_pages(compute_num_kv_cache_pages_needed( + max_sequence_length, max_requests_per_batch, false)); + rm->set_num_transformer_layers(model.current_transformer_layer_id + 1); if (num_layers_per_finetuning_step > 0) { rm->set_num_layers_per_finetuning_step(num_layers_per_finetuning_step); diff --git a/inference/python/chat.py b/inference/python/chat.py index e70440f33..3601e1989 100644 --- a/inference/python/chat.py +++ b/inference/python/chat.py 
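Note on the KV-cache sizing calls above: the set_num_kv_cache_pages() / compute_num_kv_cache_pages_needed() pair introduced in peft.cc (and reused in spec_infer.cc and the Python model builders later in this patch) size the paged KV cache from the request limits. The actual computation lives in the C++ runtime (PageManager / request_manager) and also takes an is_spec flag whose effect is not visible in these hunks; the snippet below is only a rough sketch of the underlying arithmetic, assuming a hypothetical fixed page size (TOKENS_PER_PAGE) and no page sharing across requests. The names pages_needed_sketch and TOKENS_PER_PAGE are illustrative, not FlexFlow API.

import math

TOKENS_PER_PAGE = 64  # assumed page size, for illustration only; the real value comes from PageManager

def pages_needed_sketch(max_seq_len: int, batch_size: int) -> int:
    # Each request can occupy at most ceil(max_seq_len / TOKENS_PER_PAGE) pages,
    # and pages are not shared across requests, so the batch needs at most:
    return batch_size * math.ceil(max_seq_len / TOKENS_PER_PAGE)

# Example with the spec_infer.cc defaults shown below (max_sequence_length = 1024,
# max_spec_tree_token_num = 23) and a hypothetical batch of 8 requests:
# pages_needed_sketch(1024 + 23, 8) == 8 * 17 == 136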
@@ -47,6 +47,10 @@ def get_configs(): "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, "full_precision": False, + "max_requests_per_batch": 4, + "max_seq_length": 2048, + "max_tokens_per_batch": 256, + "max_new_tokens": 1024 } # Merge dictionaries ff_init_configs.update(llm_configs) @@ -77,9 +81,9 @@ def main(): ) llm.compile( generation_config, - max_requests_per_batch=1, - max_seq_length=2048, - max_tokens_per_batch=256, + max_requests_per_batch = configs_dict.get("max_requests_per_batch", 4), + max_seq_length = configs_dict.get("max_seq_length", 256), + max_tokens_per_batch = configs_dict.get("max_tokens_per_batch", 64), ) llm.start_server() @@ -92,7 +96,7 @@ def main(): {"role": "system", "content": nemotron_system}, {"role": "user", "content": "Is Rust better than Python?"}, ] - llm.generate(messages, max_new_tokens=1024) + llm.generate(messages, max_new_tokens=configs_dict.get("max_new_tokens", 1024)) llm.stop_server() diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py index 324e10b80..2adf5367b 100644 --- a/inference/python/ff_peft.py +++ b/inference/python/ff_peft.py @@ -64,9 +64,6 @@ def get_configs(): "base_model": "JackFram/llama-160m", "inference_peft_model_id": "goliaro/llama-160m-lora", "finetuning_peft_model_id": "goliaro/llama-160m-lora", - # "base_model": "meta-llama/Meta-Llama-3-8B", - # "inference_peft_model_id": "goliaro/llama-3-8b-lora", - # "finetuning_peft_model_id": "goliaro/llama-3-8b-lora-dolly", # optional parameters "cache_path": os.environ.get("FF_CACHE_PATH", ""), "refresh_cache": False, @@ -77,6 +74,10 @@ def get_configs(): "../prompt/peft_dataset.json", ), "output_file": "", + "max_requests_per_batch": 1, + "max_seq_length": 256, + "max_tokens_per_batch": 128, + "max_concurrent_adapters": 1, } # Merge dictionaries ff_init_configs.update(model_configs) @@ -109,11 +110,11 @@ def main(): enable_peft_finetuning = len(configs.finetuning_dataset) > 0 llm.compile( generation_config, - max_requests_per_batch=1 if not enable_peft_finetuning else 2, - max_seq_length=256, - max_tokens_per_batch=128, - max_concurrent_adapters=1 if not enable_peft_finetuning else 2, - enable_peft_finetuning=enable_peft_finetuning, + max_requests_per_batch = configs_dict.get("max_requests_per_batch", 1) + enable_peft_finetuning, + max_seq_length = configs_dict.get("max_seq_length", 256), + max_tokens_per_batch = configs_dict.get("max_tokens_per_batch", 128), + max_concurrent_adapters = configs_dict.get("max_concurrent_adapters", 1) + enable_peft_finetuning, + enable_peft_finetuning = enable_peft_finetuning, ) llm.start_server() diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 968aa65b2..1c5dba7f2 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -69,6 +69,9 @@ def get_configs(): "full_precision": False, "prompt": "", "output_file": "", + "max_requests_per_batch": 4, + "max_seq_length": 256, + "max_tokens_per_batch": 64, "max_length": 128, } # Merge dictionaries @@ -101,9 +104,9 @@ def main(): ) llm.compile( generation_config, - max_requests_per_batch=4, - max_seq_length=256, - max_tokens_per_batch=64, + max_requests_per_batch = configs_dict.get("max_requests_per_batch", 4), + max_seq_length = configs_dict.get("max_seq_length", 256), + max_tokens_per_batch = configs_dict.get("max_tokens_per_batch", 64), ) llm.start_server() diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index a7652be59..4351b4cc2 100644 ---
a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -79,6 +79,9 @@ def get_configs(): ], "prompt": "", "output_file": "", + "max_requests_per_batch": 4, + "max_seq_length": 256, + "max_tokens_per_batch": 64, "max_length": 128, } # Merge dictionaries @@ -130,17 +133,17 @@ def main(): for ssm in ssms: ssm.compile( generation_config, - max_requests_per_batch=4, - max_seq_length=256, - max_tokens_per_batch=64, + max_requests_per_batch = configs_dict.get("max_requests_per_batch", 4), + max_seq_length = configs_dict.get("max_seq_length", 256), + max_tokens_per_batch = configs_dict.get("max_tokens_per_batch", 64), ) # Compile the LLM for inference and load the weights into memory llm.compile( generation_config, - max_requests_per_batch=4, - max_seq_length=256, - max_tokens_per_batch=64, + max_requests_per_batch = configs_dict.get("max_requests_per_batch", 4), + max_seq_length = configs_dict.get("max_seq_length", 256), + max_tokens_per_batch = configs_dict.get("max_tokens_per_batch", 64), ssms=ssms, ) diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 7ec3cf61f..906c237bc 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -64,7 +64,8 @@ void parse_input_args(char **argv, int &max_requests_per_batch, int &max_tokens_per_batch, int &max_sequence_length, - int &expansion_degree) { + int &expansion_degree, + int &max_length) { for (int i = 1; i < argc; i++) { // llm model name if (!strcmp(argv[i], "-llm-model")) { @@ -115,6 +116,7 @@ void parse_input_args(char **argv, max_tokens_per_batch = std::stoi(argv[++i]); continue; } + // max allowed sequence length if (!strcmp(argv[i], "--max-sequence-length")) { max_sequence_length = std::stoi(argv[++i]); continue; @@ -123,6 +125,11 @@ void parse_input_args(char **argv, expansion_degree = std::stoi(argv[++i]); continue; } + // max length before stopping if we haven't reached the EOS + if (!strcmp(argv[i], "--max-length")) { + max_length = std::stoi(argv[++i]); + continue; + } } if (paths.cache_folder_path.empty()) { char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); @@ -290,6 +297,7 @@ void FlexFlow::top_level_task(Task const *task, int max_sequence_length = 1024; int max_spec_tree_token_num = 23; int expansion_degree = 3; + int max_length = 128; InputArgs const &command_args = HighLevelRuntime::get_input_args(); char **argv = command_args.argv; @@ -303,7 +311,8 @@ void FlexFlow::top_level_task(Task const *task, max_requests_per_batch, max_tokens_per_batch, max_sequence_length, - expansion_degree); + expansion_degree, + max_length); get_model_meta(file_paths, model_metadata, use_full_precision); @@ -334,6 +343,11 @@ void FlexFlow::top_level_task(Task const *task, // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); + // set amount of kv cache needed + tree_model.set_num_kv_cache_pages(compute_num_kv_cache_pages_needed( + max_sequence_length + max_spec_tree_token_num, + max_requests_per_batch, + true)); if (model_metadata.llm_model_type == ModelType::LLAMA) { LLAMA::create_llama_model(tree_model, model_metadata.llm_model_config_path, @@ -378,6 +392,10 @@ void FlexFlow::top_level_task(Task const *task, for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { FFModel &beam_model = ssm_models[ssm_id]; + beam_model.set_num_kv_cache_pages(compute_num_kv_cache_pages_needed( + max_sequence_length + max_spec_tree_token_num, + max_requests_per_batch, + true)); if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA) { 
LLAMA::create_llama_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], @@ -432,7 +450,7 @@ void FlexFlow::top_level_task(Task const *task, // Add inference request Request inference_req; inference_req.prompt = text; - inference_req.max_length = 128; + inference_req.max_length = max_length; requests.push_back(inference_req); total_num_requests++; } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 48c9bf211..469453c75 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -35,6 +35,7 @@ ParameterSyncType, enum_to_int, int_to_enum, + data_type_size, ) from flexflow.config import * from .flexflowlib import ffi, flexflow_library @@ -811,11 +812,11 @@ def pipeline_parallelism_degree(self, value): @property def python_data_loader_type(self): return ffc().flexflow_config_get_python_data_loader_type(self.handle) - + @property def enable_peft(self): return ffc().flexflow_config_get_enable_peft(self.handle) - + @property def enable_peft_finetuning(self): return ffc().flexflow_config_get_enable_peft_finetuning(self.handle) @@ -823,9 +824,11 @@ def enable_peft_finetuning(self): @enable_peft_finetuning.setter def enable_peft_finetuning(self, value): if type(value) is not bool: - raise ValueError("enable_peft_finetuning must be specified as a boolean value") + raise ValueError( + "enable_peft_finetuning must be specified as a boolean value" + ) ffc().flexflow_config_set_enable_peft_finetuning(self.handle, value) - + @property def cpu_offload(self): return ffc().flexflow_config_get_offload(self.handle) @@ -1584,6 +1587,24 @@ def __init__(self): ) +# ----------------------------------------------------------------------- +# PageManager +# ----------------------------------------------------------------------- + + +class PageManager(object): + __slots__ = ["handle"] + + def __init__(self, num_total_pages: int): + self.handle = ffc().flexflow_page_manager_get_page_manager(num_total_pages) + + def get_tot_num_pages(self): + return ffc().flexflow_page_manager_get_tot_num_pages(self.handle) + + def get_tokens_per_page(self): + return ffc().flexflow_page_manager_get_tokens_per_page(self.handle) + + # ----------------------------------------------------------------------- # RequestManager # ----------------------------------------------------------------------- @@ -1621,21 +1642,34 @@ def register_ssm_model(self, model): self.handle, model.handle ) + # Max requests per batch def set_max_requests_per_batch(self, max_requests): return ffc().flexflow_request_manager_set_max_requests_per_batch( self.handle, max_requests ) + def get_max_requests_per_batch(self): + return ffc().flexflow_request_manager_get_max_requests_per_batch(self.handle) + + # Max tokens per batch def set_max_tokens_per_batch(self, max_tokens): return ffc().flexflow_request_manager_set_max_tokens_per_batch( self.handle, max_tokens ) + def get_max_tokens_per_batch(self): + return ffc().flexflow_request_manager_get_max_tokens_per_batch(self.handle) + + # Max spec tree token num def set_max_spec_tree_token_num(self, max_tokens): return ffc().flexflow_request_manager_set_max_spec_tree_token_num( self.handle, max_tokens ) + def get_max_spec_tree_token_num(self): + return ffc().flexflow_request_manager_get_max_spec_tree_token_num(self.handle) + + # Max sequence length def set_max_sequence_length(self, max_length): return ffc().flexflow_request_manager_set_max_sequence_length( self.handle, max_length @@ -1644,15 +1678,18 @@ def 
set_max_sequence_length(self, max_length): def get_max_sequence_length(self): return ffc().flexflow_request_manager_get_max_sequence_length(self.handle) + # Num transformer layers def set_num_transformers_layers(self, num_layers): return ffc().flexflow_request_manager_set_num_transformers_layers( self.handle, num_layers ) + + # Num layers per finetuning steps def set_num_layers_per_finetuning_step(self, num_layers): return ffc().flexflow_request_manager_set_num_layers_per_finetuning_step( self.handle, num_layers ) - + def set_max_concurrent_adapters(self, max_adapters): return ffc().flexflow_request_manager_set_max_concurrent_adapters( self.handle, max_adapters @@ -3550,303 +3587,6 @@ def multihead_attention( return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) def inc_multihead_self_attention( - self, - input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - rotary_embedding_meta=RotaryEmbeddingMeta(), - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - In inference mode, the attention is computed using incremental decoding. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. - :type rotary_embedding_meta: RotaryEmbeddingMeta - - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool - - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention( - self.handle, - input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - add_zero_attn, - c_data_type, - kernel_init_handle, - rotary_embedding_meta.apply_rotary_embedding, - rotary_embedding_meta.rope_theta, - get_c_name(rotary_embedding_meta.rope_type), - rotary_embedding_meta.factor, - rotary_embedding_meta.low_freq_factor, - rotary_embedding_meta.high_freq_factor, - rotary_embedding_meta.original_max_position_embeddings, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, - ) - self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) - - def spec_inc_multihead_self_attention( - self, - input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - rotary_embedding_meta=RotaryEmbeddingMeta(), - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (beam search) mode. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. - :type rotary_embedding_meta: RotaryEmbeddingMeta - - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool - - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention( - self.handle, - input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - add_zero_attn, - c_data_type, - kernel_init_handle, - rotary_embedding_meta.apply_rotary_embedding, - rotary_embedding_meta.rope_theta, - get_c_name(rotary_embedding_meta.rope_type), - rotary_embedding_meta.factor, - rotary_embedding_meta.low_freq_factor, - rotary_embedding_meta.high_freq_factor, - rotary_embedding_meta.original_max_position_embeddings, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, - ) - self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) - - def inc_multihead_self_attention_verify( - self, - input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - rotary_embedding_meta=RotaryEmbeddingMeta(), - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (tree verify) mode. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param rotary_embedding_meta: Metadata regarding the RoPE embedding, if used. - :type rotary_embedding_meta: RotaryEmbeddingMeta - - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool - - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool - - :param name: the name of the layer. Default is None. - :type name: string - - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify( - self.handle, - input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - add_zero_attn, - c_data_type, - kernel_init_handle, - rotary_embedding_meta.apply_rotary_embedding, - rotary_embedding_meta.rope_theta, - get_c_name(rotary_embedding_meta.rope_type), - rotary_embedding_meta.factor, - rotary_embedding_meta.low_freq_factor, - rotary_embedding_meta.high_freq_factor, - rotary_embedding_meta.original_max_position_embeddings, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, - ) - self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) - - def inc_multiquery_self_attention( self, input, embed_dim, @@ -3922,7 +3662,7 @@ def inc_multiquery_self_attention( c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention( + handle = ffc().flexflow_model_add_inc_multihead_self_attention( self.handle, input.handle, embed_dim, @@ -3950,7 +3690,7 @@ def inc_multiquery_self_attention( self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) - def spec_inc_multiquery_self_attention( + def spec_inc_multihead_self_attention( self, input, embed_dim, @@ -4026,7 +3766,7 @@ def spec_inc_multiquery_self_attention( c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention( + handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention( self.handle, input.handle, embed_dim, @@ -4054,7 +3794,7 @@ def spec_inc_multiquery_self_attention( self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) - def inc_multiquery_self_attention_verify( + def inc_multihead_self_attention_verify( self, input, embed_dim, @@ -4130,7 +3870,7 @@ def inc_multiquery_self_attention_verify( c_name = get_c_name(name) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify( + handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify( self.handle, input.handle, embed_dim, @@ -4318,10 +4058,14 @@ def argmax(self, input, beam_search, name=None): def add_lora_layers(self, target_modules: List[str]): c_target_modules = [get_c_name(module) for module in target_modules] - return ffc().flexflow_model_add_lora_layers(self.handle, len(target_modules), c_target_modules) - + return ffc().flexflow_model_add_lora_layers( + self.handle, len(target_modules), c_target_modules + ) + def register_peft_adapter(self, peft_config): - return ffc().flexflow_model_register_peft_adapter(self.handle, peft_config.handle) + return ffc().flexflow_model_register_peft_adapter( + self.handle, peft_config.handle + ) def reset_metrics(self): """Reset performance metrics. 
@@ -4552,6 +4296,9 @@ def get_perf_metrics(self): def set_transformer_layer_id(self, id): ffc().flexflow_model_set_transformer_layer_id(self.handle, id) + def set_num_kv_cache_pages(self, num_kv_cache_pages): + ffc().flexflow_model_set_num_kv_cache_pages(self.handle, num_kv_cache_pages) + def create_data_loader(self, batch_tensor, full_array): """Create a SingleDataloader instance. @@ -4787,3 +4534,13 @@ def generate(self, requests_list: List[Request]): def set_position_offset(self, offset): ffc().flexflow_model_set_position_offset(self.handle, offset) + + +def compute_num_kv_cache_pages_needed( + max_seq_len: int, + batch_size: int, + is_spec: bool, +): + return ffc().flexflow_compute_num_kv_cache_pages_needed( + max_seq_len, batch_size, is_spec + ) diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index 17bb89425..fb6e8cec0 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -21,9 +21,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, - # max_tokens_per_batch=64, weights_filepath="", tokenizer_filepath="", ): diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index ee1b090af..8f58c2715 100644 --- a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -19,11 +19,6 @@ class FalconConfig: def __init__(self, hf_config): - # self.max_seq_len = 256 - # self.max_num_tokens = 64 - self.max_beam_width = 1 - self.max_beam_depth = 8 - self.max_spec_tree_token_num = 20 self.bias = hf_config.bias self.hidden_size = hf_config.hidden_size self.layer_norm_epsilon = hf_config.layer_norm_epsilon @@ -43,15 +38,25 @@ def __init__(self, hf_config): self.vocab_size = hf_config.vocab_size self.rotary_embedding_meta = RotaryEmbeddingMeta( apply_rotary_embedding=True, - rope_theta=hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0, + rope_theta=( + hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0 + ), ) if "rope_scaling" in hf_config.__dict__: if hf_config.rope_scaling is not None: - self.rotary_embedding_meta.rope_type = hf_config.rope_scaling["rope_type"] + self.rotary_embedding_meta.rope_type = hf_config.rope_scaling[ + "rope_type" + ] self.rotary_embedding_meta.factor = hf_config.rope_scaling["factor"] - self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling["low_freq_factor"] - self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling["high_freq_factor"] - self.rotary_embedding_meta.original_max_position_embeddings = hf_config.rope_scaling["original_max_position_embeddings"] + self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling[ + "low_freq_factor" + ] + self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling[ + "high_freq_factor" + ] + self.rotary_embedding_meta.original_max_position_embeddings = ( + hf_config.rope_scaling["original_max_position_embeddings"] + ) # Standardized FlexFlow num heads fields below self.num_attention_heads = self.n_head self.num_key_value_heads = self.n_head_kv @@ -65,7 +70,6 @@ def __init__( ffconfig, hf_config, data_type, - max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", ): @@ -77,9 +81,6 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 - max_verify_tokens_per_batch = ( - max_tokens_per_batch + self.falcon_config.max_spec_tree_token_num - ) # Sanity checks if 
self.falcon_config.hidden_size % self.falcon_config.n_head != 0: @@ -94,16 +95,26 @@ def __init__( f"Number of q attention heads ({self.falcon_config.n_head}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model( - max_tokens_per_batch - if self.mode == InferenceMode.INC_DECODING_MODE - else max_verify_tokens_per_batch - ) + self.build_model() - def build_model(self, max_tokens_per_batch): + def build_model(self): ffmodel = FFModel(self.ffconfig) + is_spec = self.mode != InferenceMode.INC_DECODING_MODE + self.rm = RequestManager() + self.max_requests_per_batch = self.rm.get_max_requests_per_batch() + self.max_sequence_length = self.rm.get_max_sequence_length() + self.max_tokens_per_batch = self.rm.get_max_tokens_per_batch() + if is_spec: + self.max_tokens_per_batch += self.rm.get_max_spec_tree_token_num() + self.max_sequence_length += self.rm.get_max_spec_tree_token_num() + + ffmodel.set_num_kv_cache_pages( + compute_num_kv_cache_pages_needed( + self.max_sequence_length, self.max_requests_per_batch, is_spec + ) + ) - tokens_dims = [max_tokens_per_batch, 1] + tokens_dims = [self.max_tokens_per_batch, 1] input_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) embed_init = UniformInitializer(random.randint(0, self.maxint), 0, 0) @@ -143,20 +154,21 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.layer_norm_epsilon, name=f"layers.{i}.input_layernorm", ) - - assert(self.falcon_config.hidden_size % self.falcon_config.n_head == 0) + + assert self.falcon_config.hidden_size % self.falcon_config.n_head == 0 head_dim = self.falcon_config.hidden_size // self.falcon_config.n_head qkv_proj = ffmodel.dense( att_norm, - head_dim * (self.falcon_config.n_head + 2*self.falcon_config.n_head_kv), + head_dim + * (self.falcon_config.n_head + 2 * self.falcon_config.n_head_kv), ActiMode.AC_MODE_NONE, False, name=f"layers.{i}.self_attention.qkv_proj", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: - o_proj = ffmodel.spec_inc_multiquery_self_attention( + o_proj = ffmodel.spec_inc_multihead_self_attention( qkv_proj, self.falcon_config.hidden_size, self.falcon_config.n_head, @@ -171,7 +183,7 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: - o_proj = ffmodel.inc_multiquery_self_attention_verify( + o_proj = ffmodel.inc_multihead_self_attention_verify( qkv_proj, self.falcon_config.hidden_size, self.falcon_config.n_head, @@ -186,7 +198,7 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - o_proj = ffmodel.inc_multiquery_self_attention( + o_proj = ffmodel.inc_multihead_self_attention( qkv_proj, self.falcon_config.hidden_size, self.falcon_config.n_head, @@ -208,7 +220,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers.{i}.self_attention.o_proj" + name=f"layers.{i}.self_attention.o_proj", ) dense_h_to_4h = ffmodel.dense( @@ -260,7 +272,7 @@ def build_model(self, max_tokens_per_batch): # output = ffmodel.arg_top_k(lm_head, 1, False) softmax = ffmodel.softmax(lm_head, -1) output = ffmodel.argmax(softmax, False) - + if self.ffconfig.enable_peft: # TODO: add attention projections ffmodel.add_lora_layers(["dense_h_to_4h", "dense_4h_to_h"]) @@ -269,7 +281,8 @@ def build_model(self, max_tokens_per_batch): # TODO: finish this def convert_hf_weight_name(name): - return 
(name.replace("transformer.h.", "layers.") + return ( + name.replace("transformer.h.", "layers.") .replace("transformer.", "") .replace("self_attention.dense", "self_attention.o_proj") ) @@ -285,9 +298,15 @@ def convert_hf_model(model, dst_folder): name = FlexFlowFalcon.convert_hf_weight_name(name) # Split Q,K,V attention weights if "self_attention.query_key_value" in name: - name_q = name.replace("self_attention.query_key_value", "self_attention.q_proj") - name_k = name.replace("self_attention.query_key_value", "self_attention.k_proj") - name_v = name.replace("self_attention.query_key_value", "self_attention.v_proj") + name_q = name.replace( + "self_attention.query_key_value", "self_attention.q_proj" + ) + name_k = name.replace( + "self_attention.query_key_value", "self_attention.k_proj" + ) + name_v = name.replace( + "self_attention.query_key_value", "self_attention.v_proj" + ) q, k, v = torch.split( params, [ diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 2e46d575f..e51147ab3 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -19,9 +19,6 @@ class LLAMAConfig: def __init__(self, hf_config): - self.max_beam_width = 1 - self.max_beam_depth = 8 - self.max_spec_tree_token_num = 20 self.num_hidden_layers = hf_config.num_hidden_layers self.vocab_size = hf_config.vocab_size self.hidden_size = hf_config.hidden_size @@ -29,15 +26,25 @@ def __init__(self, hf_config): self.intermediate_size = hf_config.intermediate_size self.rotary_embedding_meta = RotaryEmbeddingMeta( apply_rotary_embedding=True, - rope_theta=hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0, + rope_theta=( + hf_config.rope_theta if "rope_theta" in hf_config.__dict__ else 10000.0 + ), ) if "rope_scaling" in hf_config.__dict__: if hf_config.rope_scaling is not None: - self.rotary_embedding_meta.rope_type = hf_config.rope_scaling["rope_type"] + self.rotary_embedding_meta.rope_type = hf_config.rope_scaling[ + "rope_type" + ] self.rotary_embedding_meta.factor = hf_config.rope_scaling["factor"] - self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling["low_freq_factor"] - self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling["high_freq_factor"] - self.rotary_embedding_meta.original_max_position_embeddings = hf_config.rope_scaling["original_max_position_embeddings"] + self.rotary_embedding_meta.low_freq_factor = hf_config.rope_scaling[ + "low_freq_factor" + ] + self.rotary_embedding_meta.high_freq_factor = hf_config.rope_scaling[ + "high_freq_factor" + ] + self.rotary_embedding_meta.original_max_position_embeddings = ( + hf_config.rope_scaling["original_max_position_embeddings"] + ) # Standardized FlexFlow num heads fields below self.num_attention_heads = hf_config.num_attention_heads self.num_key_value_heads = ( @@ -55,9 +62,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, - max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", ): @@ -68,10 +72,7 @@ def __init__( self.llama_config = LLAMAConfig(hf_config) self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath - self.maxint = 2 ** 31 - 1 - max_verify_tokens_per_batch = ( - max_tokens_per_batch + self.llama_config.max_spec_tree_token_num - ) + self.maxint = 2**31 - 1 # Sanity checks if self.llama_config.hidden_size % self.llama_config.num_attention_heads != 0: @@ -91,16 +92,27 @@ def __init__( f"Number of attention heads 
({self.llama_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model( - max_tokens_per_batch - if self.mode == InferenceMode.INC_DECODING_MODE - else max_verify_tokens_per_batch - ) + self.build_model() - def build_model(self, max_tokens_per_batch): + def build_model(self): ffmodel = FFModel(self.ffconfig) - tokens_dims = [max_tokens_per_batch, 1] + is_spec = self.mode != InferenceMode.INC_DECODING_MODE + self.rm = RequestManager() + self.max_requests_per_batch = self.rm.get_max_requests_per_batch() + self.max_sequence_length = self.rm.get_max_sequence_length() + self.max_tokens_per_batch = self.rm.get_max_tokens_per_batch() + if is_spec: + self.max_tokens_per_batch += self.rm.get_max_spec_tree_token_num() + self.max_sequence_length += self.rm.get_max_spec_tree_token_num() + + ffmodel.set_num_kv_cache_pages( + compute_num_kv_cache_pages_needed( + self.max_sequence_length, self.max_requests_per_batch, is_spec + ) + ) + + tokens_dims = [self.max_tokens_per_batch, 1] input_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) embed_init = UniformInitializer(random.randint(0, self.maxint), 0, 0) @@ -134,19 +146,28 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.input_layernorm", ) - assert( self.llama_config.hidden_size % self.llama_config.num_attention_heads == 0 ) - head_dim = self.llama_config.hidden_size // self.llama_config.num_attention_heads + assert ( + self.llama_config.hidden_size % self.llama_config.num_attention_heads + == 0 + ) + head_dim = ( + self.llama_config.hidden_size // self.llama_config.num_attention_heads + ) qkv_proj = ffmodel.dense( attn_norm, - head_dim * (self.llama_config.num_attention_heads + 2 * self.llama_config.num_key_value_heads), + head_dim + * ( + self.llama_config.num_attention_heads + + 2 * self.llama_config.num_key_value_heads + ), ActiMode.AC_MODE_NONE, False, name=f"layers.{i}.self_attn.qkv_proj", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: - mha = ffmodel.spec_inc_multiquery_self_attention( + mha = ffmodel.spec_inc_multihead_self_attention( qkv_proj, self.llama_config.hidden_size, self.llama_config.num_attention_heads, @@ -161,7 +182,7 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: - mha = ffmodel.inc_multiquery_self_attention_verify( + mha = ffmodel.inc_multihead_self_attention_verify( qkv_proj, self.llama_config.hidden_size, self.llama_config.num_attention_heads, @@ -178,7 +199,7 @@ def build_model(self, max_tokens_per_batch): name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: - mha = ffmodel.inc_multiquery_self_attention( + mha = ffmodel.inc_multihead_self_attention( qkv_proj, self.llama_config.hidden_size, self.llama_config.num_attention_heads, @@ -202,7 +223,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers.{i}.self_attn.o_proj" + name=f"layers.{i}.self_attn.o_proj", ) token, ff_norm = ffmodel.residual_rms_norm( @@ -265,7 +286,7 @@ def build_model(self, max_tokens_per_batch): # output = ffmodel.arg_top_k(dense, 1, False) softmax = ffmodel.softmax(dense, -1) output = ffmodel.argmax(softmax, False) - + if self.ffconfig.enable_peft: # TODO: add attention projections ffmodel.add_lora_layers(["gate_proj", "up_proj", "down_proj"]) diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 
d927a1fbb..3b18cce6a 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -19,9 +19,6 @@ class MPTConfig: def __init__(self, hf_config): - self.max_beam_width = 1 - self.max_beam_depth = 8 - self.max_spec_tree_token_num = 20 self.hidden_size = hf_config.d_model self.n_heads = hf_config.n_heads self.n_layers = hf_config.n_layers @@ -40,8 +37,6 @@ def __init__( ffconfig, hf_config, data_type, - # max_batch_size=1, - # max_seq_length=256, max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", @@ -54,9 +49,6 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 - max_verify_tokens_per_batch = ( - max_tokens_per_batch + self.mpt_config.max_spec_tree_token_num - ) # Sanity checks if self.mpt_config.hidden_size % self.mpt_config.n_heads != 0: @@ -72,16 +64,27 @@ def __init__( raise ValueError( f"Number of attention heads ({self.mpt_config.n_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model( - max_tokens_per_batch - if self.mode == InferenceMode.INC_DECODING_MODE - else max_verify_tokens_per_batch - ) + self.build_model() - def build_model(self, max_tokens_per_batch): + def build_model(self): ffmodel = FFModel(self.ffconfig) - tokens_dims = [max_tokens_per_batch, 1] + is_spec = self.mode != InferenceMode.INC_DECODING_MODE + self.rm = RequestManager() + self.max_requests_per_batch = self.rm.get_max_requests_per_batch() + self.max_sequence_length = self.rm.get_max_sequence_length() + self.max_tokens_per_batch = self.rm.get_max_tokens_per_batch() + if is_spec: + self.max_tokens_per_batch += self.rm.get_max_spec_tree_token_num() + self.max_sequence_length += self.rm.get_max_spec_tree_token_num() + + ffmodel.set_num_kv_cache_pages( + compute_num_kv_cache_pages_needed( + self.max_sequence_length, self.max_requests_per_batch, is_spec + ) + ) + + tokens_dims = [self.max_tokens_per_batch, 1] input = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) embed_init = UniformInitializer(random.randint(0, self.maxint), 0, 0) @@ -138,6 +141,7 @@ def build_model(self, max_tokens_per_batch): qkv_proj, self.mpt_config.hidden_size, self.mpt_config.n_heads, + self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, 0.0, # dropout @@ -157,6 +161,7 @@ def build_model(self, max_tokens_per_batch): qkv_proj, self.mpt_config.hidden_size, self.mpt_config.n_heads, + self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, 0.0, # dropout @@ -176,6 +181,7 @@ def build_model(self, max_tokens_per_batch): qkv_proj, self.mpt_config.hidden_size, self.mpt_config.n_heads, + self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, self.mpt_config.hidden_size // self.mpt_config.n_heads, 0.0, # dropout @@ -198,7 +204,7 @@ def build_model(self, max_tokens_per_batch): self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers.{i}.attn.o_proj" + name=f"layers.{i}.attn.o_proj", ) hidden_states, layernorm_output = ffmodel.residual_layer_norm( @@ -261,7 +267,7 @@ def build_model(self, max_tokens_per_batch): if self.ffconfig.enable_peft: # TODO: add attention projections ffmodel.add_lora_layers(["up_proj", "down_proj"]) - + self.ffmodel = ffmodel # TODO: finish this diff --git a/python/flexflow/serve/models/opt.py 
b/python/flexflow/serve/models/opt.py index e8d6fec9a..14c230d7c 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -19,11 +19,6 @@ class OPTConfig: def __init__(self, hf_config): - # self.max_seq_len = 256 - # self.max_num_tokens = 64 - self.max_beam_width = 1 - self.max_beam_depth = 8 - self.max_spec_tree_token_num = 20 self.do_layer_norm_before = hf_config.do_layer_norm_before self.dropout = hf_config.dropout self.enable_bias = hf_config.enable_bias @@ -48,7 +43,6 @@ def __init__( ffconfig, hf_config, data_type, - max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", ): @@ -60,9 +54,6 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 - max_verify_tokens_per_batch = ( - max_tokens_per_batch + self.opt_config.max_spec_tree_token_num - ) # Sanity checks if self.opt_config.hidden_size % self.opt_config.num_attention_heads != 0: @@ -82,16 +73,26 @@ def __init__( f"Number of attention heads ({self.opt_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model( - max_tokens_per_batch - if self.mode == InferenceMode.INC_DECODING_MODE - else max_verify_tokens_per_batch - ) + self.build_model() - def build_model(self, max_tokens_per_batch): + def build_model(self): ffmodel = FFModel(self.ffconfig) + is_spec = self.mode != InferenceMode.INC_DECODING_MODE + self.rm = RequestManager() + self.max_requests_per_batch = self.rm.get_max_requests_per_batch() + self.max_sequence_length = self.rm.get_max_sequence_length() + self.max_tokens_per_batch = self.rm.get_max_tokens_per_batch() + if is_spec: + self.max_tokens_per_batch += self.rm.get_max_spec_tree_token_num() + self.max_sequence_length += self.rm.get_max_spec_tree_token_num() + + ffmodel.set_num_kv_cache_pages( + compute_num_kv_cache_pages_needed( + self.max_sequence_length, self.max_requests_per_batch, is_spec + ) + ) - tokens_dims = [max_tokens_per_batch, 1] + tokens_dims = [self.max_tokens_per_batch, 1] input_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) position_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) @@ -142,7 +143,7 @@ def build_model(self, max_tokens_per_batch): residual = hidden_states qkv_proj = ffmodel.dense( - hidden_states, + hidden_states, 3 * self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, @@ -154,6 +155,7 @@ def build_model(self, max_tokens_per_batch): qkv_proj, self.opt_config.hidden_size, self.opt_config.num_attention_heads, + self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout @@ -172,6 +174,7 @@ def build_model(self, max_tokens_per_batch): qkv_proj, self.opt_config.hidden_size, self.opt_config.num_attention_heads, + self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout @@ -190,6 +193,7 @@ def build_model(self, max_tokens_per_batch): qkv_proj, self.opt_config.hidden_size, self.opt_config.num_attention_heads, + self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, self.opt_config.hidden_size // self.opt_config.num_attention_heads, 0.0, # dropout @@ -211,7 +215,7 @@ def build_model(self, max_tokens_per_batch): self.opt_config.hidden_size, 
ActiMode.AC_MODE_NONE, False, - name=f"layers.{i}.self_attn.o_proj" + name=f"layers.{i}.self_attn.o_proj", ) # This is either a before or after attention LayerNorm. In both cases, we need to compute the LN here. residual, ff_norm = ffmodel.add_bias_residual_layer_norm( @@ -290,7 +294,7 @@ def build_model(self, max_tokens_per_batch): if self.ffconfig.enable_peft: # TODO: add attention projections ffmodel.add_lora_layers(["fc1", "fc2"]) - + self.ffmodel = ffmodel def convert_hf_weight_name(name): diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 107614e9d..393d288d4 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -19,9 +19,6 @@ class STARCODERConfig: def __init__(self, hf_config): - self.max_beam_width = 1 - self.max_beam_depth = 8 - self.max_spec_tree_token_num = 20 self.dropout_p = hf_config.attn_pdrop self.hidden_size = hf_config.n_embd self.layer_norm_epsilon = hf_config.layer_norm_epsilon @@ -44,7 +41,6 @@ def __init__( ffconfig, hf_config, data_type, - max_tokens_per_batch, weights_filepath="", tokenizer_filepath="", ): @@ -56,9 +52,6 @@ def __init__( self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath self.maxint = 2**31 - 1 - max_verify_tokens_per_batch = ( - max_tokens_per_batch + self.starcoder_config.max_spec_tree_token_num - ) # Sanity checks if ( @@ -82,16 +75,27 @@ def __init__( f"Number of attention heads ({self.starcoder_config.num_attention_heads}) is smaller, or not divisible by tensor parallelism degree ({self.ffconfig.tensor_parallelism_degree})" ) - self.build_model( - max_tokens_per_batch - if self.mode == InferenceMode.INC_DECODING_MODE - else max_verify_tokens_per_batch - ) + self.build_model() - def build_model(self, max_tokens_per_batch): + def build_model(self): ffmodel = FFModel(self.ffconfig) - tokens_dims = [max_tokens_per_batch, 1] + is_spec = self.mode != InferenceMode.INC_DECODING_MODE + self.rm = RequestManager() + self.max_requests_per_batch = self.rm.get_max_requests_per_batch() + self.max_sequence_length = self.rm.get_max_sequence_length() + self.max_tokens_per_batch = self.rm.get_max_tokens_per_batch() + if is_spec: + self.max_tokens_per_batch += self.rm.get_max_spec_tree_token_num() + self.max_sequence_length += self.rm.get_max_spec_tree_token_num() + + ffmodel.set_num_kv_cache_pages( + compute_num_kv_cache_pages_needed( + self.max_sequence_length, self.max_requests_per_batch, is_spec + ) + ) + + tokens_dims = [self.max_tokens_per_batch, 1] input_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) position_tensor = ffmodel.create_tensor(tokens_dims, DataType.DT_INT32) @@ -145,7 +149,7 @@ def build_model(self, max_tokens_per_batch): ) assert self.mode == InferenceMode.INC_DECODING_MODE - o_proj = ffmodel.inc_multiquery_self_attention( + o_proj = ffmodel.inc_multihead_self_attention( qkv_proj, self.starcoder_config.hidden_size, self.starcoder_config.num_attention_heads, @@ -167,7 +171,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers.{i}.self_attn.o_proj" + name=f"layers.{i}.self_attn.o_proj", ) residual, l2_norm = ffmodel.residual_layer_norm( @@ -231,7 +235,7 @@ def build_model(self, max_tokens_per_batch): if self.ffconfig.enable_peft: # TODO: add attention projections ffmodel.add_lora_layers(["c_fc", "c_proj"]) - + self.ffmodel = ffmodel def convert_hf_model(model, dst_folder): diff --git 
a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index 6db415aea..2e52c7649 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -267,7 +267,7 @@ def __is_empty_dir(self, folder: str) -> bool: if not os.path.isdir(folder) or not os.path.exists(folder): return True return len(os.listdir(folder)) == 1 and "rev_sha.txt" in os.listdir(folder) - + def __need_cache_refresh( self, model_name: str, resource_type: CachedResourceType ) -> bool: @@ -282,8 +282,15 @@ def __need_cache_refresh( bool: True if the weights or tokenizer need a refresh, False otherwise """ resource_path = self.__get_resource_path(model_name, resource_type) - ff_revision, latest_revision = self.__get_revision_hashes(self.model_name, resource_path) - if self.refresh_cache or not os.path.exists(resource_path) or self.__is_empty_dir(resource_path) or ff_revision != latest_revision: + ff_revision, latest_revision = self.__get_revision_hashes( + self.model_name, resource_path + ) + if ( + self.refresh_cache + or not os.path.exists(resource_path) + or self.__is_empty_dir(resource_path) + or ff_revision != latest_revision + ): print( f"Refreshing {resource_type} in cache for model {model_name} at path {resource_path} ..." ) @@ -302,8 +309,12 @@ def download_hf_weights_if_needed(self) -> None: """ def download_and_convert_llm_weights(model_name): - num_cores = os.cpu_count() -1 if os.cpu_count() > 1 else 1 - snapshot_download(repo_id=model_name, allow_patterns="*.safetensors", max_workers=min(30, num_cores)) + num_cores = os.cpu_count() - 1 if os.cpu_count() > 1 else 1 + snapshot_download( + repo_id=model_name, + allow_patterns="*.safetensors", + max_workers=min(30, num_cores), + ) hf_model = AutoModelForCausalLM.from_pretrained( model_name, trust_remote_code=True, @@ -486,6 +497,7 @@ def compile( self.rm = RequestManager() self.rm.set_max_requests_per_batch(max_requests_per_batch) self.rm.set_max_tokens_per_batch(max_tokens_per_batch) + self.rm.set_max_spec_tree_token_num(20) self.rm.set_max_sequence_length(max_seq_length) self.rm.set_max_concurrent_adapters(max_concurrent_adapters) self.rm.set_enable_peft_finetuning(enable_peft_finetuning) @@ -493,9 +505,10 @@ def compile( if num_bwd_layers_per_ft_step != -1: self.rm.set_num_layers_per_finetuning_step(num_bwd_layers_per_ft_step) else: - self.rm.set_num_layers_per_finetuning_step( - self.hf_config.num_hidden_layers - ) + self.rm.set_num_layers_per_finetuning_step(self.hf_config.num_hidden_layers) + + # Create file data loader, load weights into tensors + model_configs = self.config_class(self.hf_config) # Instantiate the relevant model self.model = self.model_class( @@ -516,15 +529,6 @@ def compile( # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() - # Create file data loader, load weights into tensors - model_configs = self.config_class(self.hf_config) - - self.rm.set_max_spec_tree_token_num( - model_configs.max_spec_tree_token_num - if "max_spec_tree_token_num" in model_configs.__dict__ - else 20 - ) - weights_path = self.__get_resource_path( self.model_name, CachedResourceType.WEIGHTS ) diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 0f4726837..c2eebe899 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -194,3 +194,21 @@ def str_to_enum(enum, value): return item assert 0, "unknown enum value " + value + " " + str(enum) + +def data_type_size(value: DataType): + if value == DataType.DT_BOOLEAN: + return 1 + elif value == DataType.DT_INT32: + return 4 + 
elif value == DataType.DT_INT64: + return 8 + elif value == DataType.DT_HALF: + return 2 + elif value == DataType.DT_FLOAT: + return 4 + elif value == DataType.DT_DOUBLE: + return 8 + else: + raise ValueError(f"{value} is not a valid DataType") + + \ No newline at end of file diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index ae21fd0c5..5abde0579 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -74,6 +74,7 @@ class FFCObjectWrapper { // LoraAdamOptimizerConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); + FF_NEW_OPAQUE_WRAPPER(flexflow_page_manager_t, PageManager *); }; Logger ffc_log("flexflow_c"); @@ -1215,7 +1216,8 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, - int num_heads, + int num_q_heads, + int num_kv_heads, int kdim, int vdim, float dropout, @@ -1247,7 +1249,8 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention( original_max_position_embeddings); Tensor tensor = handle->inc_multihead_self_attention(input, embed_dim, - num_heads, + num_q_heads, + num_kv_heads, kdim, vdim, dropout, @@ -1267,7 +1270,8 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, - int num_heads, + int num_q_heads, + int num_kv_heads, int kdim, int vdim, float dropout, @@ -1300,7 +1304,8 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention( Tensor tensor = handle->spec_inc_multihead_self_attention(input, embed_dim, - num_heads, + num_q_heads, + num_kv_heads, kdim, vdim, dropout, @@ -1320,7 +1325,8 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( flexflow_model_t handle_, const flexflow_tensor_t input_, int embed_dim, - int num_heads, + int num_q_heads, + int num_kv_heads, int kdim, int vdim, float dropout, @@ -1353,7 +1359,8 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( Tensor tensor = handle->inc_multihead_self_attention_verify(input, embed_dim, - num_heads, + num_q_heads, + num_kv_heads, kdim, vdim, dropout, @@ -1369,170 +1376,6 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify( return FFCObjectWrapper::wrap(tensor); } -flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention( - flexflow_model_t handle_, - const flexflow_tensor_t input_, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool add_zero_attn, - enum DataType data_type, - flexflow_initializer_t kernel_initializer_, - bool apply_rotary_embedding, - float rope_theta, - char const *rope_type, - float rope_factor, - float low_freq_factor, - float high_freq_factor, - int original_max_position_embeddings, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { - FFModel *handle = FFCObjectWrapper::unwrap(handle_); - Tensor input = FFCObjectWrapper::unwrap(input_); - Initializer *kernel_initializer = - FFCObjectWrapper::unwrap(kernel_initializer_); - RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, - rope_theta, - rope_type, - rope_factor, - low_freq_factor, - high_freq_factor, - original_max_position_embeddings); - Tensor tensor = handle->inc_multiquery_self_attention(input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - add_zero_attn, - data_type, - 
kernel_initializer, - rotary_embedding_meta, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - name); - return FFCObjectWrapper::wrap(tensor); -} - -flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention( - flexflow_model_t handle_, - const flexflow_tensor_t input_, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool add_zero_attn, - enum DataType data_type, - flexflow_initializer_t kernel_initializer_, - bool apply_rotary_embedding, - float rope_theta, - char const *rope_type, - float rope_factor, - float low_freq_factor, - float high_freq_factor, - int original_max_position_embeddings, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { - FFModel *handle = FFCObjectWrapper::unwrap(handle_); - Tensor input = FFCObjectWrapper::unwrap(input_); - Initializer *kernel_initializer = - FFCObjectWrapper::unwrap(kernel_initializer_); - RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, - rope_theta, - rope_type, - rope_factor, - low_freq_factor, - high_freq_factor, - original_max_position_embeddings); - Tensor tensor = - handle->spec_inc_multiquery_self_attention(input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - add_zero_attn, - data_type, - kernel_initializer, - rotary_embedding_meta, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - name); - return FFCObjectWrapper::wrap(tensor); -} - -flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify( - flexflow_model_t handle_, - const flexflow_tensor_t input_, - int embed_dim, - int num_q_heads, - int num_kv_heads, - int kdim, - int vdim, - float dropout, - bool add_zero_attn, - enum DataType data_type, - flexflow_initializer_t kernel_initializer_, - bool apply_rotary_embedding, - float rope_theta, - char const *rope_type, - float rope_factor, - float low_freq_factor, - float high_freq_factor, - int original_max_position_embeddings, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { - FFModel *handle = FFCObjectWrapper::unwrap(handle_); - Tensor input = FFCObjectWrapper::unwrap(input_); - Initializer *kernel_initializer = - FFCObjectWrapper::unwrap(kernel_initializer_); - RotaryEmbeddingMeta rotary_embedding_meta(apply_rotary_embedding, - rope_theta, - rope_type, - rope_factor, - low_freq_factor, - high_freq_factor, - original_max_position_embeddings); - Tensor tensor = - handle->inc_multiquery_self_attention_verify(input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - add_zero_attn, - data_type, - kernel_initializer, - rotary_embedding_meta, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - name); - return FFCObjectWrapper::wrap(tensor); -} - flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_, const flexflow_tensor_t input_, float eps, @@ -1700,6 +1543,18 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { handle->set_transformer_layer_id(id); } +void flexflow_model_set_num_kv_cache_pages(flexflow_model_t handle_, + int num_kv_cache_pages) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_num_kv_cache_pages(num_kv_cache_pages); +} + +int flexflow_compute_num_kv_cache_pages_needed(int max_seq_len, + int batch_size, + bool is_spec) { + return compute_num_kv_cache_pages_needed(max_seq_len, batch_size, is_spec); +} + void 
flexflow_model_generate(flexflow_model_t handle_, int num_requests, enum RequestType *request_types, @@ -2758,6 +2613,12 @@ void flexflow_request_manager_set_max_requests_per_batch( max_num_requests); } +int flexflow_request_manager_get_max_requests_per_batch( + flexflow_request_manager_t handle_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + return handle->get_max_requests_per_batch(); +} + void flexflow_request_manager_set_max_tokens_per_batch( flexflow_request_manager_t handle_, int max_num_tokens) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); @@ -2765,6 +2626,12 @@ void flexflow_request_manager_set_max_tokens_per_batch( DEBUG_PRINT("[RequestManager] set max_tokens_per_batch %d", max_num_tokens); } +int flexflow_request_manager_get_max_tokens_per_batch( + flexflow_request_manager_t handle_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + return handle->get_max_tokens_per_batch(); +} + void flexflow_request_manager_set_max_spec_tree_token_num( flexflow_request_manager_t handle_, int max_num_tokens) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); @@ -2773,6 +2640,12 @@ void flexflow_request_manager_set_max_spec_tree_token_num( max_num_tokens); } +int flexflow_request_manager_get_max_spec_tree_token_num( + flexflow_request_manager_t handle_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + return handle->get_max_spec_tree_token_num(); +} + void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length) { RequestManager *handle = FFCObjectWrapper::unwrap(handle_); @@ -2875,6 +2748,33 @@ void flexflow_request_manager_terminate_background_server( handle->terminate_background_server(); } +// ----------------------------------------------------------------------- +// PageManager +// ----------------------------------------------------------------------- + +flexflow_page_manager_t + flexflow_page_manager_get_page_manager(int num_total_pages) { + assert(num_total_pages); + PageManager *pm = PageManager::get_page_manager(num_total_pages); + DEBUG_PRINT("[PageManager] get %p", pm); + return FFCObjectWrapper::wrap(pm); +} + +int flexflow_page_manager_get_tot_num_pages(flexflow_page_manager_t handle_) { + PageManager *handle = FFCObjectWrapper::unwrap(handle_); + int num_pages = handle->get_tot_num_pages(); + DEBUG_PRINT("[PageManager] %p get_tot_num_pages %d", handle, num_pages); + return num_pages; +} + +int flexflow_page_manager_get_tokens_per_page(flexflow_page_manager_t handle_) { + PageManager *handle = FFCObjectWrapper::unwrap(handle_); + int tokens_per_page = handle->get_tokens_per_page(); + DEBUG_PRINT( + "[PageManager] %p get_tokens_per_page %d", handle, tokens_per_page); + return tokens_per_page; +} + // ----------------------------------------------------------------------- // InferenceManager // ----------------------------------------------------------------------- diff --git a/src/ops/attention_impl.cu b/src/ops/attention_impl.cu new file mode 100644 index 000000000..f3cc8df92 --- /dev/null +++ b/src/ops/attention_impl.cu @@ -0,0 +1,818 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
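Aside on the new C entry points above (flexflow_model_set_num_kv_cache_pages, flexflow_compute_num_kv_cache_pages_needed, and the PageManager getters): they expose the paged KV cache to the Python bindings. The actual sizing logic lives in the C++ compute_num_kv_cache_pages_needed, which this patch does not show; the sketch below is only a rough guess at the kind of arithmetic involved, with tokens_per_page and the speculative-token headroom as assumed placeholders (20 matches the set_max_spec_tree_token_num(20) default set in serve.py above).

import math

def estimate_kv_cache_pages(max_seq_len: int,
                            batch_size: int,
                            is_spec: bool,
                            tokens_per_page: int = 64,      # placeholder; see get_tokens_per_page()
                            max_spec_tree_tokens: int = 20) -> int:
    # Assumed model: each request may hold up to max_seq_len cached tokens,
    # plus headroom for speculative-tree tokens on the speculative path.
    tokens_per_request = max_seq_len + (max_spec_tree_tokens if is_spec else 0)
    pages_per_request = math.ceil(tokens_per_request / tokens_per_page)
    return pages_per_request * batch_size
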
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "cuComplex.h" +#endif +#include "flashinfer/attention_impl.cuh" + +// This is for instantiating the template attention kernels +namespace flashinfer { + +// warp_layout_literal[] = { +// "WarpLayout::k4x1x2", +// "WarpLayout::k4x1x1", +// "WarpLayout::k1x4x1", +// } +// head_dim[] = {64, 128, 256}; + +/********** batch append instantiations for half precision **********/ + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t 
*q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +/********** batch prefill instantiations for half precision **********/ + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool 
*block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t 
*request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchPrefillWithPagedKVCacheDispatched( + half *q, + int32_t *request_indices, + int32_t *q_tile_indices, + int32_t *kv_tile_indices, + int32_t *q_indptr, + int32_t *q_offset, + paged_kv_t paged_kv, + uint8_t *custom_mask, + int32_t *qk_indptr, + int32_t *o_indptr, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + int32_t *merge_indptr, + bool *block_valid_mask, + int32_t *kv_chunk_size_ptr, + uint32_t total_num_rows, + uint32_t num_qo_heads, + uint32_t padded_batch_size, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +/********** batch decode instantiations for half precision **********/ +template cudaError_t + BatchDecodeWithPagedKVCacheDispatched<64, + PageStorage::kIndices, + LogitsPostHook::kNone, + PosEncodingMode::kNone, + half, + half, + half, + int32_t>( + half *q, + int32_t *q_offset, + paged_kv_t paged_kv, + kv_partition_info_t kv_partition_info, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + bool *block_valid_mask, + uint32_t padded_batch_size, + uint32_t num_qo_heads, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchDecodeWithPagedKVCacheDispatched<128, + PageStorage::kIndices, + LogitsPostHook::kNone, + PosEncodingMode::kNone, + half, + half, + half, + int32_t>( + half *q, + int32_t *q_offset, + paged_kv_t paged_kv, + kv_partition_info_t kv_partition_info, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + bool *block_valid_mask, + uint32_t padded_batch_size, + uint32_t num_qo_heads, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +template cudaError_t + BatchDecodeWithPagedKVCacheDispatched<256, + PageStorage::kIndices, + LogitsPostHook::kNone, + PosEncodingMode::kNone, + half, + half, + half, + int32_t>( + half *q, + int32_t *q_offset, + paged_kv_t paged_kv, + kv_partition_info_t kv_partition_info, + half *o, + half *tmp_v, + float *tmp_s, + float *lse, + bool *block_valid_mask, + uint32_t padded_batch_size, + uint32_t num_qo_heads, + int32_t window_left, + float logits_soft_cap, + float sm_scale, + float rope_scale, + float rope_theta, + cudaStream_t stream); + +} // 
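Aside on src/ops/attention_impl.cu above: it pre-instantiates the FlashInfer prefill and decode templates only for head dimensions 64, 128, and 256 (and the three warp layouts listed in its comment), so a model whose head size falls outside that set has no kernel to dispatch to. A small Python sanity check, using the usual hidden_size / num_attention_heads convention as an assumption:

SUPPORTED_HEAD_DIMS = (64, 128, 256)                 # matches the instantiations above
SUPPORTED_WARP_LAYOUTS = ("k4x1x2", "k4x1x1", "k1x4x1")

def check_head_dim(hidden_size: int, num_attention_heads: int) -> int:
    head_dim = hidden_size // num_attention_heads
    if head_dim not in SUPPORTED_HEAD_DIMS:
        raise ValueError(f"head_dim {head_dim} has no pre-instantiated FlashInfer kernel")
    return head_dim

# e.g. a LLaMA-7B-style config: 4096 hidden units across 32 heads -> head_dim 128
assert check_head_dim(4096, 32) == 128
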
namespace flashinfer diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 1b8afad81..e140697a0 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -745,6 +745,10 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task, if (metas->meta[op] != NULL) { assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + assert(metas->meta[start]->handle.peft_blas == + metas->meta[op]->handle.peft_blas); + assert(metas->meta[start]->handle.peft_dnn == + metas->meta[op]->handle.peft_dnn); } } diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 0a11a0668..794b4c537 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -55,40 +55,6 @@ bool IncMultiHeadSelfAttentionParams::is_valid( } Tensor FFModel::inc_multihead_self_attention( - const Tensor input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - RotaryEmbeddingMeta rotary_embedding_meta, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { - return inc_multiquery_self_attention(input, - embed_dim, - num_heads, - num_heads, - kdim, - vdim, - dropout, - add_zero_attn, - data_type, - kernel_initializer, - rotary_embedding_meta, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - name); -} - -Tensor FFModel::inc_multiquery_self_attention( const Tensor input, int embed_dim, int num_q_heads, @@ -169,6 +135,7 @@ Tensor FFModel::inc_multiquery_self_attention( li->add_int_property("offload", offload); li->add_int_property("tensor_parallelism_degree", config.tensor_parallelism_degree); + li->add_int_property("num_kv_cache_pages", get_num_kv_cache_pages()); layers.push_back(li); return li->outputs[0]; @@ -220,6 +187,8 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( bool offload = (bool)value; layer->get_int_property("tensor_parallelism_degree", value); int tensor_parallelism_degree = (int)value; + layer->get_int_property("num_kv_cache_pages", value); + int num_kv_cache_pages = (int)value; return new IncMultiHeadSelfAttention(model, layer->layer_guid, @@ -239,6 +208,7 @@ Op *IncMultiHeadSelfAttention::create_operator_from_layer( quantization_type, offload, tensor_parallelism_degree, + num_kv_cache_pages, layer->name); } @@ -261,6 +231,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, + int _num_kv_cache_pages, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -273,10 +244,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), add_zero_attn(_add_zero_attn), - rotary_embedding_meta(_rotary_embedding_meta), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), + rotary_embedding_meta(_rotary_embedding_meta), qProjSize(_kdim), + kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), @@ -299,6 +268,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( outputs[0] = 
model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); /* assert(check_output_input_weight_parallel_dims()); */ + + num_kv_cache_pages = _num_kv_cache_pages; } IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( @@ -319,6 +290,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, + int _num_kv_cache_pages, char const *name) // Initializer* _bias_initializer) : Op(model, @@ -331,10 +303,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), add_zero_attn(_add_zero_attn), - rotary_embedding_meta(_rotary_embedding_meta), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), + rotary_embedding_meta(_rotary_embedding_meta), qProjSize(_kdim), + kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), @@ -353,6 +323,8 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); + num_kv_cache_pages = _num_kv_cache_pages; + // Check correctness /* assert(check_output_input_weight_parallel_dims()); */ } @@ -379,6 +351,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( other.quantization_type, other.offload, other.tensor_parallelism_degree, + other.num_kv_cache_pages, other.name) {} IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( @@ -404,6 +377,7 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( params.quantization_type, params.offload, params.tensor_parallelism_degree, + params.num_kv_cache_pages, params.name) {} void IncMultiHeadSelfAttention::init_inference( @@ -505,7 +479,6 @@ OpMeta *IncMultiHeadSelfAttention::init_task( ctx, runtime); - int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); int num_q_heads = attn->num_q_heads / attn->tensor_parallelism_degree; @@ -522,15 +495,15 @@ OpMeta *IncMultiHeadSelfAttention::init_task( handle.offload_reserve_space, handle.offload_reserve_space_size); } IncMultiHeadSelfAttentionMeta *m = new IncMultiHeadSelfAttentionMeta( - handle, attn, gpu_mem_allocator, num_samples, num_q_heads, num_kv_heads); + handle, attn, gpu_mem_allocator, num_q_heads, num_kv_heads); if (handle.offload_reserve_space == nullptr) { // assert that we didn't over allocate memory assert(gpu_mem_allocator.reserved_allocated_size == gpu_mem_allocator.reserved_total_size); } - m->profiling = attn->profiling; - m->inference_debugging = attn->inference_debugging; - m->enable_peft_finetuning = attn->enable_peft_finetuning; + // m->profiling = attn->profiling; + // m->inference_debugging = attn->inference_debugging; + // m->enable_peft_finetuning = attn->enable_peft_finetuning; std::strcpy(m->op_name, attn->name); m->layer_guid = attn->layer_guid; @@ -751,7 +724,8 @@ bool IncMultiHeadSelfAttention::measure_operator_cost( bool operator==(IncMultiHeadSelfAttentionParams const &lhs, IncMultiHeadSelfAttentionParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.embed_dim == 
rhs.embed_dim && - lhs.num_q_heads == rhs.num_q_heads && lhs.kdim == rhs.kdim && + lhs.num_q_heads == rhs.num_q_heads && + lhs.num_kv_heads == rhs.num_kv_heads && lhs.kdim == rhs.kdim && lhs.vdim == rhs.vdim && lhs.dropout == rhs.dropout && lhs.add_zero_attn == rhs.add_zero_attn && lhs.rotary_embedding_meta.apply_rotary_embedding == @@ -770,7 +744,9 @@ bool operator==(IncMultiHeadSelfAttentionParams const &lhs, lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && - lhs.position_bias == rhs.position_bias; + lhs.position_bias == rhs.position_bias && + lhs.tensor_parallelism_degree == rhs.tensor_parallelism_degree && + lhs.num_kv_cache_pages == rhs.num_kv_cache_pages; } IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { @@ -786,8 +762,9 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.scaling_query = this->scaling_query; params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; - params.position_bias = this->position_bias, - params.tensor_parallelism_degree = this->tensor_parallelism_degree, + params.position_bias = this->position_bias; + params.tensor_parallelism_degree = this->tensor_parallelism_degree; + params.num_kv_cache_pages = this->num_kv_cache_pages; params.quantization_type = this->quantization_type; params.offload = this->offload; params.num_kv_heads = this->num_kv_heads; @@ -827,6 +804,7 @@ size_t hash::operator()( hash_combine(key, params.quantization_type); hash_combine(key, params.offload); hash_combine(key, params.tensor_parallelism_degree); + hash_combine(key, params.num_kv_cache_pages); return key; } }; // namespace std diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index 41268ee4d..cd569300c 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -1178,9 +1178,7 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, hipStream_t stream) { int num_tokens = bc->num_active_tokens(); int tot_num_heads = m->num_q_heads + 2 * m->num_kv_heads; - assert(m->hidden_size % m->num_q_heads == 0); - int head_dim = m->hidden_size / m->num_q_heads; - assert(head_dim == m->qProjSize); + int head_dim = m->qProjSize; if (num_tokens > 0) { int parallelism = head_dim * tot_num_heads * num_tokens; // devQKVProj has shape [qProjSize, tot_num_heads, num_new_tokens] @@ -1776,7 +1774,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->peft_token_infos_size, hipMemcpyHostToDevice, stream)); - assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); /*q&k*/ int half_proj = m->qProjSize / 2; @@ -1835,10 +1833,10 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] DT const *B = static_cast
(m->devQKVProjArray); // matrix C: gradients w.r.t. input - // matrix C's layout: [m->qSize, num_tokens] - DT *C = input_grad_ptr + - bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; - // int m_ = m->qSize; + // matrix C's layout: [m->qProjSize * m->num_q_heads, num_tokens] + DT *C = input_grad_ptr + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads; + // int m_ = m->qProjSize * m->num_q_heads; int n_ = num_tokens; int k_ = m->qProjSize * (m->num_q_heads + 2 * m->num_kv_heads); @@ -1851,7 +1849,8 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, if (m->inference_debugging) { std::string filename = get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; - save_tensor(C, num_tokens * m->qSize, filename.c_str()); + save_tensor( + C, num_tokens * m->qProjSize * m->num_q_heads, filename.c_str()); } } } @@ -1954,15 +1953,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, MemoryAllocator &gpu_mem_allocator, - int num_samples, int _num_q_heads, int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, INC_DECODING_MODE, attn, - attn->qSize, - attn->kSize, - attn->vSize, attn->qProjSize, attn->kProjSize, attn->vProjSize, @@ -1973,11 +1968,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->position_bias, attn->scaling_factor, gpu_mem_allocator, - num_samples, attn->num_q_heads, attn->num_kv_heads, _num_q_heads, _num_kv_heads, + attn->num_kv_cache_pages, attn->quantization_type, attn->offload) {} @@ -1985,9 +1980,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, InferenceMode infer_mode, Op const *attn, - int _qSize, - int _kSize, - int _vSize, int _qProjSize, int _kProjSize, int _vProjSize, @@ -1998,11 +1990,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( bool _position_bias, float _scaling_factor, MemoryAllocator &gpu_mem_allocator, - int num_samples, int _global_num_q_heads, int _global_num_kv_heads, int _num_q_heads, int _num_kv_heads, + int _num_kv_cache_pages, DataType _quantization_type, bool _offload) : OpMeta(handler, attn) { @@ -2010,12 +2002,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); checkCUDNN(miopenCreateTensorDescriptor(&qk_tensor)); - qSize = _qSize; - kSize = _kSize; - vSize = _vSize; // assume dimensions match for now - assert(qSize == kSize); - assert(kSize == vSize); qProjSize = _qProjSize; kProjSize = _kProjSize; assert(qProjSize == kProjSize); // required for attention QK.T matmul @@ -2029,7 +2016,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( global_num_kv_heads = _global_num_kv_heads; num_q_heads = _num_q_heads; num_kv_heads = _num_kv_heads; - hidden_size = num_q_heads * qProjSize; rotary_embedding_meta = (RotaryEmbeddingMeta *)calloc(1, sizeof(RotaryEmbeddingMeta)); @@ -2042,6 +2028,14 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( position_bias = (bool *)calloc(1, sizeof(bool)); *position_bias = _position_bias; + num_kv_cache_pages = _num_kv_cache_pages; + assert(num_kv_cache_pages > 0 || enable_peft_finetuning); + + // spec decoding and peft finetuning are mutually exclusive + if (enable_peft_finetuning) { + assert(infer_mode == INC_DECODING_MODE); + } + assert(num_q_heads % num_kv_heads == 0 && "num_q_heads must be divisible by num_kv_heads"); if (num_q_heads > num_kv_heads) { @@ -2197,8 +2191,9 @@ 
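Aside on the layout change above: with qSize/kSize/vSize and hidden_size removed from the attention meta struct, the input hidden dimension is always derived as qProjSize * num_q_heads, and the PEFT backward input gradient is addressed as "[m->qProjSize * m->num_q_heads, num_tokens]" with offset first_token_offset_in_batch * qProjSize * num_q_heads. A small numpy sketch of that indexing; the head counts are assumed examples.

import numpy as np

num_q_heads, q_proj_size = 32, 128            # assumed sizes
hidden_size = num_q_heads * q_proj_size       # replaces the old m->qSize
num_tokens_in_batch = 16

# Column-major [hidden_size, num_tokens] buffer, matching the cuBLAS view above
input_grad = np.zeros((hidden_size, num_tokens_in_batch), dtype=np.float16, order="F")

first_token_offset_in_batch, num_tokens = 4, 8
# `input_grad_ptr + first_token_offset_in_batch * hidden_size` picks this slice:
req_grad = input_grad[:, first_token_offset_in_batch:first_token_offset_in_batch + num_tokens]
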
IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_reserved_untyped( qk_prod_size * size_of_dt); - attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * - size_of_dt); + // attn_heads = + // gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * + // size_of_dt); complex_input = gpu_mem_allocator.allocate_reserved(complex_size); } else { @@ -2206,8 +2201,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( qk_prod_size * size_of_dt); - attn_heads = gpu_mem_allocator.allocate_instance_untyped(attn_heads_size * - size_of_dt); + // attn_heads = + // gpu_mem_allocator.allocate_instance_untyped(attn_heads_size * + // size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); } diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index 1b02b0052..7402b7dee 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -21,6 +21,11 @@ #include "flexflow/utils/cuda_helper.h" #include +// flashinfer & paged attention +#include "flashinfer/decode_attention_decl.cuh" +#include "flashinfer/prefill_attention_decl.cuh" +#include "flexflow/page_manager.h" + namespace FlexFlow { // declare Legion names @@ -32,6 +37,18 @@ using Legion::Memory; namespace Kernels { namespace IncMultiHeadAttention { +// flashinfer & paged attention +using flashinfer::BatchDecodeHandler; +using flashinfer::BatchDecodeWithPagedKVCacheWrapperDispatched; +using flashinfer::BatchPrefillHandler; +using flashinfer::BatchPrefillWithPagedKVCacheWrapperDispatched; +using flashinfer::LogitsPostHook; +using flashinfer::MaskMode; +using flashinfer::paged_kv_t; +using flashinfer::PageStorage; +using flashinfer::PosEncodingMode; +using flashinfer::QKVLayout; + std::string get_fwd_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, int shard_id) { std::string op_name_without_uid = @@ -59,8 +76,8 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, int num_kv_heads) { CUDA_KERNEL_LOOP(i, num_tokens * head_dim * num_kv_heads) { // devQKVProjArray: [head_dim, tot_num_heads, num_tokens] - // kCache_ptr: [head_dim, num_kv_heads, max_seq_len, max_batch_size] - // vCache_ptr: [head_dim, num_kv_heads, max_seq_len, max_batch_size] + // kCache_ptr: [head_dim, num_kv_heads, max_seq_len, 1] + // vCache_ptr: [head_dim, num_kv_heads, max_seq_len, 1] // i is iterating over one set of key/val projections from the input int token_idx = i / (head_dim * num_kv_heads); @@ -72,11 +89,9 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, head_dim * num_q_heads + head_dim * head_idx + offset; int val_src_idx = key_src_idx + head_dim * num_kv_heads; - int const req_id = tokenInfos[token_idx].request_index; int const tok_id = tokenInfos[token_idx].abs_depth_in_request; - int dst_idx = req_id * (head_dim * num_kv_heads * max_seq_len) + - tok_id * head_dim * num_kv_heads + head_idx * head_dim + - offset; + int dst_idx = + tok_id * head_dim * num_kv_heads + head_idx * head_dim + offset; kCache_ptr[dst_idx] = devQKVProjArray[key_src_idx]; vCache_ptr[dst_idx] = devQKVProjArray[val_src_idx]; @@ -303,13 +318,17 @@ __global__ void store_softmax_activation(DT const *qk_prods_softmax, } template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, - BatchConfig const *bc, - DT *attn_heads, - int shard_id, - cudaStream_t stream) { - 
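Aside on the store_kv_cache change above: the destination offset no longer folds in the request index, because the K/V cache it writes now has shape [head_dim, num_kv_heads, max_seq_len, 1], i.e. a single request, while regular inference requests presumably go through the paged KV cache instead (an assumption based on the page_manager.h include and the FlashInfer wrappers pulled in above). The new index in Python form:

def kv_dst_idx(tok_id: int, head_idx: int, offset: int,
               head_dim: int, num_kv_heads: int) -> int:
    # New layout: [head_dim, num_kv_heads, max_seq_len, 1]; no req_id term anymore.
    return tok_id * head_dim * num_kv_heads + head_idx * head_dim + offset

# The removed term was req_id * (head_dim * num_kv_heads * max_seq_len).
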
checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); +void compute_attention_kernel_peft(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + DT *attn_heads, + int shard_id, + cudaStream_t peft_stream) { + if (bc->num_finetuning_fwd_tokens() <= 0) { + return; + } + + checkCUDA(cublasSetStream(m->handle.peft_blas, peft_stream)); + checkCUDNN(cudnnSetStream(m->handle.peft_dnn, peft_stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); @@ -317,599 +336,309 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); int tot_num_heads = m->num_q_heads + 2 * m->num_kv_heads; - int num_processed_prompt_tokens = 0; - for (int req_idx = 0; req_idx < bc->max_requests_per_batch(); req_idx++) { - if (bc->request_completed[req_idx] || is_decoding_request(bc, req_idx) || - is_finetuning_bwd_request(bc, req_idx)) { - continue; - } - int num_new_tokens = bc->requestsInfo[req_idx].num_tokens_in_batch; - int total_tokens = bc->requestsInfo[req_idx].first_token_depth_in_request + - bc->requestsInfo[req_idx].num_tokens_in_batch; - if (num_new_tokens <= 0) { - continue; + + assert(bc->num_finetuning_fwd_tokens() > 0); + int req_idx = bc->finetuning_request_index(); + assert(!bc->request_completed[req_idx]); + assert(bc->requestsInfo[req_idx].finetuning_request && + !bc->requestsInfo[req_idx].finetuning_backward_phase); + + int num_new_tokens = bc->requestsInfo[req_idx].num_tokens_in_batch; + int total_tokens = bc->requestsInfo[req_idx].first_token_depth_in_request + + bc->requestsInfo[req_idx].num_tokens_in_batch; + assert(num_new_tokens > 0 && total_tokens > 0); + + // Copy query to m->query_activation_buffer for BWD + // int max_peft_tokens = bc->requestsInfo[i].max_length; + int max_peft_tokens = BatchConfig::max_sequence_length(); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed != m->allocated_peft_buffer_size1) { + std::cout << "activation_size_needed: " << activation_size_needed + << std::endl; + std::cout << "m->allocated_peft_buffer_size1: " + << m->allocated_peft_buffer_size1 << std::endl; + std::cout << "max_peft_tokens: " << max_peft_tokens << std::endl; + std::cout << "m->num_q_heads: " << m->num_q_heads << std::endl; + std::cout << "m->qProjSize: " << m->qProjSize << std::endl; + std::cout << "BatchConfig::max_sequence_length()" + << BatchConfig::max_sequence_length() << std::endl; + std::cout << "sizeof(DT)" << sizeof(DT) << std::endl; + } + assert(activation_size_needed == m->allocated_peft_buffer_size1); + int parallelism = m->qProjSize * m->num_q_heads * num_new_tokens; + int tokens_previous_steps = total_tokens - num_new_tokens; + int tokens_previous_requests = + bc->requestsInfo[req_idx].first_token_offset_in_batch; + store_query_cache<<>>( + static_cast
(m->devQKVProjArray), + static_cast
(m->query_activation_buffer), + num_new_tokens, + tokens_previous_requests, + tokens_previous_steps, + m->qProjSize, + m->num_q_heads, + m->num_kv_heads); + + // Step 1: compute query-key product QK.T/sqrt(d_k) + { + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + // Scale by sqrt(d_k) as per the original attention paper + alpha = static_cast
(1.0f / sqrt(m->kProjSize)); } - // Copy query to m->query_activation_buffer if we need to compute - // PEFT backward - if (bc->requestsInfo[req_idx].finetuning_request && - !bc->requestsInfo[req_idx].finetuning_backward_phase) { - // int max_peft_tokens = bc->requestsInfo[i].max_length; - int max_peft_tokens = BatchConfig::max_sequence_length(); - size_t activation_size_needed = - sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; - if (activation_size_needed != m->allocated_peft_buffer_size1) { - std::cout << "activation_size_needed: " << activation_size_needed - << std::endl; - std::cout << "m->allocated_peft_buffer_size1: " - << m->allocated_peft_buffer_size1 << std::endl; - std::cout << "max_peft_tokens: " << max_peft_tokens << std::endl; - std::cout << "m->num_q_heads: " << m->num_q_heads << std::endl; - std::cout << "m->qProjSize: " << m->qProjSize << std::endl; - std::cout << "BatchConfig::max_sequence_length()" - << BatchConfig::max_sequence_length() << std::endl; - std::cout << "sizeof(DT)" << sizeof(DT) << std::endl; - } - assert(activation_size_needed == m->allocated_peft_buffer_size1); - int parallelism = m->qProjSize * m->num_q_heads * num_new_tokens; - int tokens_previous_steps = total_tokens - num_new_tokens; - store_query_cache<<>>( - static_cast
(m->devQKVProjArray), - static_cast
(m->query_activation_buffer), - num_new_tokens, - num_processed_prompt_tokens, - tokens_previous_steps, - m->qProjSize, - m->num_q_heads, - m->num_kv_heads); + // after transpositions + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + // before transpositions + int lda = m->qProjSize * tot_num_heads; + int ldb = m->kProjSize * m->num_kv_heads; + int ldc = num_new_tokens; + // N.B. strides are applied before transpose operations + int strideA = m->qProjSize; + int strideB = m->kProjSize; + int strideC = num_new_tokens * total_tokens; + + // matrix A: devQKVProjArray + // matrix A's layout: [qProjSize, tot_num_heads, num_new_tokens] + // To get query projection, skip over Q entries from previous requests + DT const *A = static_cast
(m->devQKVProjArray) + + tokens_previous_requests * m->qProjSize * + (m->num_q_heads + 2 * m->num_kv_heads); + // matrix B: key cache (peft) + // matrix B's layout: [kProjSize, num_kv_heads, total_tokens] + // To get B, skip over K entries from previous requests (all heads + + // padding) + DT const *B = static_cast
(m->keyCachePeft); + // matrix C: qk_prods (current req only) + // matrix C's layout: [num_new_tokens, total_tokens, num_q_heads] + DT *C = static_cast
(m->qk_prods); + run_batched_matmul
(m, + m->handle.peft_blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP, + peft_stream, + 1, + m->num_q_heads / m->num_kv_heads, + 1); + if (m->inference_debugging) { + std::string fpath = get_fwd_dbg_folder(m, shard_id) + ".qk_prods"; + save_tensor(static_cast
(m->qk_prods), + num_new_tokens * total_tokens * m->num_q_heads, + fpath.c_str()); } - // Step 1: compute query-key product QK.T/sqrt(d_k) - { - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - // Scale by sqrt(d_k) as per the original attention paper - alpha = static_cast
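Aside on Step 1 of compute_attention_kernel_peft above: it runs QK^T/sqrt(d_k) as a strided batched GEMM on the dedicated PEFT cuBLAS handle, and the batch-ratio arguments (1, num_q_heads / num_kv_heads, 1) make each KV head serve its whole group of query heads. A numpy reference for the intended math, with small assumed sizes:

import numpy as np

num_q_heads, num_kv_heads, d = 8, 2, 64        # assumed sizes; d = qProjSize
num_new_tokens, total_tokens = 4, 10
group = num_q_heads // num_kv_heads            # query heads sharing one KV head

Q = np.random.randn(num_q_heads, num_new_tokens, d).astype(np.float32)
K = np.random.randn(num_kv_heads, total_tokens, d).astype(np.float32)

scale = 1.0 / np.sqrt(d)                       # only applied when qk_prod_scaling is set
qk_prods = np.stack([Q[h] @ K[h // group].T * scale for h in range(num_q_heads)])
# qk_prods: [num_q_heads, num_new_tokens, total_tokens], one slice per query head
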
(1.0f / sqrt(m->kProjSize)); - } - // after transpositions - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - // before transpositions - int lda = m->qProjSize * tot_num_heads; - int ldb = m->kProjSize * m->num_kv_heads; - int ldc = num_new_tokens; - // N.B. strides are applied before transpose operations - int strideA = m->qProjSize; - int strideB = m->kProjSize; - int strideC = num_new_tokens * total_tokens; - - // matrix A: devQKVProjArray - // matrix A's layout: [qProjSize, tot_num_heads, num_new_tokens] - // To get query projection, skip over Q entries from previous requests - DT const *A = static_cast
(m->devQKVProjArray) + - bc->requestsInfo[req_idx].first_token_offset_in_batch * - m->qProjSize * (m->num_q_heads + 2 * m->num_kv_heads); - // matrix B: key cache - // matrix B's layout: [kProjSize, num_kv_heads, total_tokens] - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + - req_idx * (m->kProjSize * m->num_kv_heads * - BatchConfig::max_sequence_length()); - // matrix C: qk_prods (current req only) - // matrix C's layout: [num_new_tokens, total_tokens, num_q_heads] - DT *C = static_cast
(m->qk_prods); - run_batched_matmul
(m, - m->handle.blas, - CUBLAS_OP_T, - CUBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP, - stream, - 1, - m->num_q_heads / m->num_kv_heads, - 1); - if (m->inference_debugging) { - std::string fpath = get_fwd_dbg_folder(m, shard_id) + ".qk_prods"; - save_tensor(static_cast
(m->qk_prods), - num_new_tokens * total_tokens * m->num_q_heads, - fpath.c_str()); - } + } + // Step 2: Add alibi position bias to qk production + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests + { + // matrix C: qk_prods (current req only) + // matrix C's layout: [num_new_tokens, total_tokens, num_q_heads] + DT *C = static_cast
(m->qk_prods); + if (*m->position_bias) { + size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; + apply_position_bias_qkprd<<>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + m->global_num_q_heads, + shard_id); } - // Step 2: Add alibi position bias to qk production - // matrix C: qk_prods - // matrix C's layout: [num_new_tokens, total_tokens, num_heads] - // To get C, skip over QK.T products from previous requests - { + } + + // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods + // with -inf to force causal attention. + { + assert(num_new_tokens <= total_tokens); + size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; + if (entries_above_diagonal > 0) { // matrix C: qk_prods (current req only) // matrix C's layout: [num_new_tokens, total_tokens, num_q_heads] DT *C = static_cast
(m->qk_prods); - if (*m->position_bias) { - size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - apply_position_bias_qkprd<<num_q_heads * entries_above_diagonal; + fill_entries_above_diagonal<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - m->global_num_q_heads, - shard_id); - } - } - - // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods - // with -inf to force causal attention. - { - assert(num_new_tokens <= total_tokens); - size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; - if (entries_above_diagonal > 0) { - // matrix C: qk_prods (current req only) - // matrix C's layout: [num_new_tokens, total_tokens, num_q_heads] - DT *C = static_cast
(m->qk_prods); - size_t parallelism = m->num_q_heads * entries_above_diagonal; - fill_entries_above_diagonal<<>>(C, - num_new_tokens, - total_tokens, - m->num_q_heads, - entries_above_diagonal, - static_cast
(-INFINITY)); - } - if (m->inference_debugging) { - std::string fpath = - get_fwd_dbg_folder(m, shard_id) + ".qk_prods.masked"; - save_tensor(static_cast
(m->qk_prods), - num_new_tokens * total_tokens * m->num_q_heads, - fpath.c_str()); - } + peft_stream>>>(C, + num_new_tokens, + total_tokens, + m->num_q_heads, + entries_above_diagonal, + static_cast
(-INFINITY)); } - - // Step 4: Compute Softmax(QK.T/sqrt(d_k)) - { - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, - CUDNN_TENSOR_NCHW, - cudnn_data_type, - n_param, - c_param, - h_param, - w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - // matrix C: qk_prods (current req only) - // matrix C's layout: [num_new_tokens, total_tokens, num_q_heads] - DT *C = static_cast
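Aside on Step 3 above: only the last num_new_tokens rows of the score matrix belong to new tokens, so exactly num_new_tokens * (num_new_tokens - 1) / 2 entries end up filled with -inf. A small numpy check of that count, with assumed sizes:

import numpy as np

num_new_tokens, total_tokens = 4, 10            # assumed sizes
prev = total_tokens - num_new_tokens            # tokens already in the KV cache

mask = np.zeros((num_new_tokens, total_tokens))
for i in range(num_new_tokens):                 # new token i may attend up to position prev + i
    mask[i, prev + i + 1:] = -np.inf

assert (mask == -np.inf).sum() == num_new_tokens * (num_new_tokens - 1) // 2
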
(m->qk_prods); - // matrix C_softmax: qk_prods_softmax (current req only) - // matrix C_softmax's layout: [num_new_tokens, total_tokens, num_q_heads] - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax)); - // Copy C_softmax to m->softmax_activation_buffer if we need to compute - // PEFT backward - if (bc->requestsInfo[req_idx].finetuning_request) { - int max_peft_tokens = BatchConfig::max_sequence_length(); - int max_dataset_entry_size = bc->requestsInfo[req_idx].max_length; - size_t activation_size_needed = - sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; - assert(activation_size_needed == m->allocated_peft_buffer_size2); - int parallelism = m->num_q_heads * total_tokens * num_new_tokens; - store_softmax_activation<<>>( - static_cast
(m->qk_prods_softmax), - static_cast
(m->softmax_activation_buffer), - num_new_tokens, - total_tokens, - max_dataset_entry_size, - m->num_q_heads); - } - if (m->inference_debugging) { - std::string fpath = - get_fwd_dbg_folder(m, shard_id) + ".qk_prods_softmax"; - save_tensor(static_cast
(m->qk_prods_softmax), - num_new_tokens * total_tokens * m->num_q_heads, - fpath.c_str()); - } - } - - // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ - // softmax(QK.T/sqrt(d_k)).T - { - DT alpha = 1.0f, beta = 0.0f; - // after transpositions - int m_ = m->vProjSize; - int n = num_new_tokens; - int k = total_tokens; - // before transpositions - int lda = m_ * m->num_kv_heads; - int ldb = n; - int ldc = m_ * m->num_q_heads; - // N.B. strides are applied before transpose operations - int strideA = m->vProjSize; - int strideB = num_new_tokens * total_tokens; - int strideC = m->vProjSize; - // matrix A: value cache - // matrix A's layout: [vProjSize, num_kv_heads, total_tokens] - // To get A, skip over V.T entries from previous requests (all heads + - // padding) - DT *A = static_cast
(m->valueCache) + - req_idx * (m->vProjSize * m->num_kv_heads * - BatchConfig::max_sequence_length()); - // matrix B: qk_prods_softmax (current req only) - // matrix B's layout: [num_new_tokens, total_tokens, num_q_heads] - // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous - // requests (all heads) - DT *B = static_cast
(m->qk_prods_softmax); - // matrix C: attn heads - // matrix C's layout: [vProjSize, num_q_heads, num_new_tokens] - // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous - // requests - // store the result attn heads, also skip the genration tokens - DT *C = static_cast
(attn_heads) + - (bc->requestsInfo[req_idx].first_token_offset_in_batch) * - m->num_q_heads * m->vProjSize; - run_batched_matmul
(m, - m->handle.blas, - CUBLAS_OP_N, - CUBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - cublas_data_type, - lda, - strideA, - B, - cublas_data_type, - ldb, - strideB, - &beta, - C, - cublas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP, - stream, - m->num_q_heads / m->num_kv_heads, - 1, - 1); - if (m->inference_debugging) { - std::string fpath = - get_fwd_dbg_folder(m, shard_id) + ".qk_prods_softmax"; - save_tensor(static_cast
(attn_heads), - num_new_tokens * m->num_q_heads * m->vProjSize, - fpath.c_str()); - } - } - num_processed_prompt_tokens += num_new_tokens; - } - if (num_processed_prompt_tokens != - (bc->num_active_tokens() - bc->num_generation_tokens)) { - bc->print(); - printf("num_processed_prompt_tokens: %i\n", num_processed_prompt_tokens); - printf("num_tokens: %i\n", bc->num_active_tokens()); - printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); - } - assert(num_processed_prompt_tokens == - (bc->num_active_tokens() - bc->num_generation_tokens)); -} - -// gridDim = num_heads -// blockDim = num_tokens/num_request * head_size -// QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads| -// one thread process one head_size -template -__global__ void compute_attention_kernel_generation_kernel( - DT const *query, - DT const *key_cache, - DT const *value_cache, - DT *output_ptr, - float const scale, - int max_seq_length, - int per_head_size, - int num_q_heads, - int num_kv_heads, - BatchConfig::PerRequestInfo *request_infos) { - - int total_num_heads = num_q_heads + 2 * num_kv_heads; - - // q, k - using Q_vec = typename VEC_K::Type; - using K_vec = typename VEC_K::Type; - using V_vec = typename VEC_V
::Type; - using Out_sum = typename Vec_fp32_::Type; - - constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - - // eg. if head_size = 128, thread_per_key = 4, with float32 precision - // then K_VEC_SIZE = 1, QK_VEC_SIZE = 4 - // K_ELTS_PER_THREAD = 128 / 4 = 32 - // K_VECS_PER_THREAD = 32 / 1 = 32 - constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); - // constexpr int QK_VEC_SIZE = 16 / sizeof(DT); - // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT); - constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; - constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); - - // thread id - int const tidx = threadIdx.x; - // head id - int const head_idx = blockIdx.x; - int const kv_head_idx = head_idx / (num_q_heads / num_kv_heads); - // request idx - int const request_idx = blockIdx.y; - - int const batch_config_request_id = - request_infos[request_idx].batch_config_request_id; - - int const first_step = 0; - - int const tlength = - request_infos[batch_config_request_id].first_token_depth_in_request + - request_infos[batch_config_request_id].num_tokens_in_batch; - - // shared memory objects - extern __shared__ char smem_[]; - - float *qk_smem = reinterpret_cast(smem_); - float *out_smem = reinterpret_cast(smem_); - - float qk_max = -FLT_MAX; - - // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum - __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - - const DT *q_ptr = query + request_idx * per_head_size * total_num_heads + - head_idx * per_head_size; - __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; - // DT const *q_ptr = - // query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size; - - // q tensor in this thread - // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total - // K_VECS_PER_THREAD elements - // QK_vec_k: 32->1, 64->2, 128->4... head_size - // K_vec_k: 4->1, 2->2, 1->4 threads_per_key - - // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE - int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; - int ki_o = tidx % THREADS_PER_KEY; - // the first key's offset for this thread - // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... - int ko = tidx / THREADS_PER_KEY; - // load q tensor - Q_vec q_vec[K_VECS_PER_THREAD]; -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - q_vecs[ki_o][ii] = *reinterpret_cast( - q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); - } - __syncthreads(); - // first iter = 128 / 4 = 32 - // K_VECS_PER_THREAD = 32 - // K_PER_ITER how many keys in this loop - // The number of timesteps loaded per iteration. - constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; - // // The number of keys per warp. - constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; - - DT const *k_cache_batch = key_cache + - batch_config_request_id * - (per_head_size * num_kv_heads) * - max_seq_length + - ki; - - int ti_end = - div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; - // get k, perform qk proj - - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - K_vec k[K_VECS_PER_THREAD]; - int const ti_circ = ti % max_seq_length; -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; - if (ti < tlength) { - k[ii] = *reinterpret_cast( - k_cache_batch + ti_circ * (per_head_size * num_kv_heads) + - kv_head_idx * per_head_size + jj); - } - // Compute dot product. - // This includes a reduction across the threads in the same thread group. 
- } - float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); - // // todo add positional embedding to the qk production - // // Store the product to shared memory. There's one qk value per - // timestep. - // // Update the max. - if (ti < tlength && tidx % THREADS_PER_KEY == 0) { - // todo add alobi here - bool const mask = ti_circ >= tlength; - if (mask) { - assert(false); - } - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - qk_smem[ti - first_step] = mask ? 0.f : qk; + if (m->inference_debugging) { + std::string fpath = get_fwd_dbg_folder(m, shard_id) + ".qk_prods.masked"; + save_tensor(static_cast
(m->qk_prods), + num_new_tokens * total_tokens * m->num_q_heads, + fpath.c_str()); } } - __syncthreads(); - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Decompose the thread index into warp and lane. - int const warp = tidx / WARP_SIZE; - int const lane = tidx % WARP_SIZE; - - // The warp leader writes the max to shared memory. - if (lane == 0) { - red_smem[warp] = qk_max; - } - - // Make sure the products are in shared memory. - __syncthreads(); - - // The warps finalize the reduction. - qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - // Broadcast to all the threads in the warp. - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - - float exp_sum = 0.f; - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - float logit = __expf(qk_smem[ti - first_step] - qk_max); - exp_sum += logit; - qk_smem[ti - first_step] = logit; - } - - // Compute the sum. - exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); - - // softmax - float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); - for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { - qk_smem[ti - first_step] *= inv_sum; - } - - __syncthreads(); - // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { - // printf("softmax %.10f\n", qk_smem[0]); - // } + // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + { + // Before modifying the parameters below, make sure to read the following + // description of the CUDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + // matrix C: qk_prods (current req only) + // matrix C's layout: [num_new_tokens, total_tokens, num_q_heads] + DT *C = static_cast
(m->qk_prods); + // matrix C_softmax: qk_prods_softmax (current req only) + // matrix C_softmax's layout: [num_new_tokens, total_tokens, num_q_heads] + DT *C_softmax = static_cast
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(cudnnSoftmaxForward(m->handle.peft_dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax)); + // Copy C_softmax to m->softmax_activation_buffer for PEFT backward + int max_peft_tokens = BatchConfig::max_sequence_length(); + int max_dataset_entry_size = bc->requestsInfo[req_idx].max_length; + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + assert(activation_size_needed == m->allocated_peft_buffer_size2); + int parallelism = m->num_q_heads * total_tokens * num_new_tokens; + store_softmax_activation<<>>( + static_cast
(m->qk_prods_softmax), + static_cast
(m->softmax_activation_buffer), + num_new_tokens, + total_tokens, + max_dataset_entry_size, + m->num_q_heads); - // value projection - constexpr int V_VEC_SIZE = 16 / sizeof(DT); - // A vector of V elements for the current timestep. - // using V_vec_k = typename V_vec_k_::Type; - // using V_vec_acum = typename V_vec_acum_fp32_::Type; - - // The value computed by this thread. - int vo = tidx / THREADS_PER_VALUE; - // The hidden dimensions computed by this particular thread. - int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - - Out_sum out; - zero(out); - - // The base pointer for the value in the cache buffer. - DT const *v_cache_batch = value_cache + - batch_config_request_id * max_seq_length * - (per_head_size * num_kv_heads) + - vi; - - if (Dh == Dh_MAX || vi < Dh) { - for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { - // Load the values from the cache. - int const ti_circ = ti % max_seq_length; - - V_vec v = *reinterpret_cast( - v_cache_batch + ti_circ * (per_head_size * num_kv_heads) + - kv_head_idx * per_head_size); - float logit = qk_smem[ti - first_step]; - out = FlexFlow::fma(logit, cast_to_float(v), out); + if (m->inference_debugging) { + std::string fpath = get_fwd_dbg_folder(m, shard_id) + ".qk_prods_softmax"; + save_tensor(static_cast
(m->qk_prods_softmax), + num_new_tokens * total_tokens * m->num_q_heads, + fpath.c_str()); } } - // // Make sure we can start writing to shared memory. - __syncthreads(); - - // Run the final reduction amongst the different groups computing different - // partial outputs. - if (Dh == Dh_MAX || vi < Dh) { -#pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; - active_groups /= 2) { - - // The midpoint in the number of active groups. - int midpoint = active_groups / 2; - - // The upper part of active threads store to shared memory. - if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { - *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = - out; - } - __syncthreads(); - - // The bottom warps update their values. - if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { - out = add(*reinterpret_cast(out_smem + vo * Dh + vi), - out); - } - __syncthreads(); + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ + // softmax(QK.T/sqrt(d_k)).T + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->vProjSize; + int n = num_new_tokens; + int k = total_tokens; + // before transpositions + int lda = m_ * m->num_kv_heads; + int ldb = n; + int ldc = m_ * m->num_q_heads; + // N.B. strides are applied before transpose operations + int strideA = m->vProjSize; + int strideB = num_new_tokens * total_tokens; + int strideC = m->vProjSize; + // matrix A: value cache (peft) + // matrix A's layout: [vProjSize, num_kv_heads, total_tokens] + // To get A, skip over V.T entries from previous requests (all heads + + // padding) + DT *A = static_cast
(m->valueCachePeft); + // matrix B: qk_prods_softmax (current req only) + // matrix B's layout: [num_new_tokens, total_tokens, num_q_heads] + // B points at the start of qk_prods_softmax, which holds entries for the + // current (PEFT) request only, so no offset is needed + DT *B = static_cast
(m->qk_prods_softmax); + // matrix C: attn heads + // matrix C's layout: [vProjSize, num_q_heads, num_new_tokens] + // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous + // requests in the batch + // store the result in attn_heads, also skipping the generation tokens + DT *C = static_cast
(attn_heads) + + (bc->requestsInfo[req_idx].first_token_offset_in_batch) * + m->num_q_heads * m->vProjSize; + run_batched_matmul
(m, + m->handle.peft_blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP, + peft_stream, + m->num_q_heads / m->num_kv_heads, + 1, + 1); + if (m->inference_debugging) { + std::string fpath = get_fwd_dbg_folder(m, shard_id) + ".attn_heads"; + save_tensor(static_cast
(attn_heads), + num_new_tokens * m->num_q_heads * m->vProjSize, + fpath.c_str()); } } - - // Output the final values. - if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { - convert_from_float( - *reinterpret_cast(output_ptr + - request_idx * (per_head_size * num_q_heads) + - head_idx * per_head_size + vi), - out); - } } // only used by MPT model. https://arxiv.org/abs/2108.12409 @@ -1099,10 +828,10 @@ void apply_scaling_and_rotary(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, int shard_id, DT *output_ptr, - cudaStream_t stream) { + cudaStream_t inf_stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + checkCUDA(cublasSetStream(m->handle.blas, inf_stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, inf_stream)); assert(m->qProjSize == m->kProjSize && m->qProjSize == m->vProjSize); int num_tokens = bc->num_active_tokens(); @@ -1112,12 +841,12 @@ void apply_scaling_and_rotary(IncMultiHeadSelfAttentionMeta const *m, scaling_query_kernel<<>>(output_ptr, - m->qProjSize, - num_tokens, - m->num_q_heads, - m->num_kv_heads, - m->scaling_factor); + inf_stream>>>(output_ptr, + m->qProjSize, + num_tokens, + m->num_q_heads, + m->num_kv_heads, + m->scaling_factor); } // Step 3: apply rotary embedding if needed @@ -1130,7 +859,7 @@ void apply_scaling_and_rotary(IncMultiHeadSelfAttentionMeta const *m, apply_rotary_embedding_fwd<<>>( + inf_stream>>>( output_ptr, m->complex_input, m->token_infos, @@ -1148,76 +877,354 @@ void apply_scaling_and_rotary(IncMultiHeadSelfAttentionMeta const *m, } template -void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); +__global__ void update_kv_cache_kernel_flashinfer_kernel( + DT *qkv_proj_array, + half *qTmp_ptr, + half *kvCache_ptr, + int32_t *kv_indptr, + int32_t *kv_page_indices, + bool const *request_completed, + int peft_req_idx, + BatchConfig::PerTokenInfo const *tokenInfos, + int num_q_heads, + int num_kv_heads, + int head_dim, + int num_new_tokens) { + int tot_num_heads = num_q_heads + 2 * num_kv_heads; + // iterate over the whole qkv_proj_array + CUDA_KERNEL_LOOP(i, num_new_tokens * (tot_num_heads * head_dim)) { + // qkv_proj_array: [head_dim, tot_num_heads, num_new_tokens] + // qTmp_ptr: [head_dim, num_q_heads, num_new_tokens] + // kvCache_ptr: [head_dim, num_kv_heads, page_size, 2, max_num_pages] + int proj_offset = i % head_dim; + int head_idx = (i / head_dim) % tot_num_heads; + int token_idx = i / (tot_num_heads * head_dim); + assert(proj_offset < head_dim && "Invalid proj_offset"); + assert(head_idx < tot_num_heads && "Invalid head_idx"); + assert(token_idx < num_new_tokens && "Invalid token_idx"); + + int token_abs_idx = tokenInfos[token_idx].abs_depth_in_request; + int const req_idx = tokenInfos[token_idx].request_index; + + assert(req_idx != peft_req_idx && + "Attempting to use inference KV cache for PEFT tokens"); + + int req_idx_compact = 0; + for (int j = 0; j < req_idx; j++) { + if (!request_completed[j]) { + req_idx_compact++; + } + } + assert(req_idx_compact >= 0 && req_idx_compact <= req_idx && + "Invalid request index"); + + if (head_idx < num_q_heads) { + // copy value into qTmp_ptr + int offset = head_idx * head_dim + proj_offset; + assert(offset >= 0 && offset < num_q_heads * head_dim && + "Q-tmp offset out of bounds"); + qTmp_ptr[token_idx * head_dim * num_q_heads + offset] = qkv_proj_array[i]; + } else { + int logical_page_idx = token_abs_idx / kPagesize; + int 
page_idx = + kv_page_indices[kv_indptr[req_idx_compact] + logical_page_idx]; + int to_k_idx = get_k_entry_offset_verify( + token_abs_idx, page_idx, num_kv_heads, head_dim); + int to_v_idx = get_v_entry_offset_verify( + token_abs_idx, page_idx, num_kv_heads, head_dim); + if (head_idx - num_q_heads < num_kv_heads) { + // key + int offset = (head_idx - num_q_heads) * head_dim + proj_offset; + assert(offset >= 0 && offset < num_kv_heads * head_dim && + "K-cache offset out of bounds"); + kvCache_ptr[to_k_idx + offset] = qkv_proj_array[i]; + } else { + // value + int offset = + (head_idx - num_q_heads - num_kv_heads) * head_dim + proj_offset; + assert(offset >= 0 && offset < num_kv_heads * head_dim && + "V-cache offset out of bounds"); + kvCache_ptr[to_v_idx + offset] = qkv_proj_array[i]; + } + } + } +} + +// template +// __global__ void update_kv_cache_kernel_flashinfer_kernel( +// DT *qkv_proj_array, +// half *qTmp_ptr, +// half *kvCache_ptr, +// int32_t *kv_indptr, +// int32_t *kv_page_indices, +// bool const *request_completed, +// int peft_req_idx, +// BatchConfig::PerTokenInfo const *tokenInfos, +// int num_q_heads, +// int num_kv_heads, +// int head_dim, +// int num_new_tokens) { +// int const q_hidden_size = num_q_heads * head_dim; +// int const kv_hidden_size = num_kv_heads * head_dim; + +// int const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; +// int const token_idx = thread_idx / q_hidden_size; +// int const offset = thread_idx % q_hidden_size; +// if (token_idx >= num_new_tokens) { +// return; +// } +// int const req_idx = tokenInfos[token_idx].request_index; +// int token_abs_idx = tokenInfos[token_idx].abs_depth_in_request; +// // calculate the compact request index in the easiest way +// // TODO: recheck +// int req_idx_compact = -1; +// int cnt = 0; +// while (cnt < req_idx + 1) { +// if (!request_completed[cnt] && cnt != peft_req_idx) { +// req_idx_compact++; +// } +// cnt++; +// } +// assert(req_idx_compact >= 0 && "Invalid request index"); +// size_t from_idx = token_idx * (q_hidden_size + temp_kv_hidden_size * 2); +// qTmp_ptr[token_idx * q_hidden_size + offset] = +// static_cast(qkv_proj_array[from_idx + offset]); +// if (offset < kv_hidden_size) { +// int start = kv_indptr[req_idx_compact]; +// int end = kv_indptr[req_idx_compact + 1] - 1; +// assert(start <= end && "Invalid kv_indptr"); +// assert(start + (token_abs_idx / kPagesize) <= end && "Invalid page +// index"); int page_idx = kv_page_indices[start + (token_abs_idx / +// kPagesize)]; size_t to_k_idx = get_k_entry_offset_verify( +// token_abs_idx, page_idx, num_kv_heads, head_dim), +// to_v_idx = get_v_entry_offset_verify( +// token_abs_idx, page_idx, num_kv_heads, head_dim); +// // key and value cache should be stored interleaved +// int const stride = num_q_heads / num_kv_heads; +// int const kv_offset = +// offset / head_dim * stride * head_dim + offset % head_dim; +// kvCache_ptr[to_k_idx + offset] = +// static_cast(qkv_proj_array[from_idx + q_hidden_size + +// kv_offset]); +// kvCache_ptr[to_v_idx + offset] = +// static_cast(qkv_proj_array[from_idx + q_hidden_size + +// temp_kv_hidden_size + kv_offset]); +// } +// } + +template +void update_kv_cache_kernel_flashinfer(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + // printf("entered update_qkv_in_batch_verify\n"); + int num_new_tokens = bc->num_inference_tokens(); + if (num_new_tokens == 0) { + return; + } int tot_num_heads = m->num_q_heads + 2 * m->num_kv_heads; - assert(m->hidden_size % m->num_q_heads == 0); 
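// Illustrative host-side restatement of the request-index compaction used by
// update_kv_cache_kernel_flashinfer_kernel above: the paged-KV metadata
// (kv_indptr / kv_indices) is indexed by a request's position among the
// active (not completed) requests, so a batch slot index is first compacted
// by skipping completed slots. The helper name and example values below are
// hypothetical, not part of this patch.
inline int compact_request_index(bool const *request_completed, int req_idx) {
  int req_idx_compact = 0;
  for (int j = 0; j < req_idx; j++) {
    if (!request_completed[j]) {
      req_idx_compact++;
    }
  }
  return req_idx_compact;
}
// Example: with request_completed = {false, true, false}, batch slot 2 maps to
// compact index 1, so its pages live in
// kv_indices[kv_indptr[1] .. kv_indptr[2] - 1].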
- int head_dim = m->hidden_size / m->num_q_heads; - assert(head_dim == m->qProjSize); - if (num_tokens > 0) { - int parallelism = head_dim * tot_num_heads * num_tokens; - // devQKVProj has shape [qProjSize, tot_num_heads, num_new_tokens] - store_kv_cache<<>>(static_cast
(m->devQKVProjArray), - static_cast
(m->keyCache), - static_cast
(m->valueCache), - m->token_infos, - num_tokens, - BatchConfig::max_sequence_length(), - head_dim, - m->num_q_heads, - m->num_kv_heads); + int parallelism = m->qProjSize * tot_num_heads * num_new_tokens; + int peft_req_idx = (bc->num_finetuning_fwd_tokens() > 0) + ? bc->finetuning_request_index() + : -1; + int32_t *kv_indptr = m->handle.incr_attention_metadata->kv_indptr; + int32_t *kv_indices = m->handle.incr_attention_metadata->kv_indices; + update_kv_cache_kernel_flashinfer_kernel<<>>( + static_cast
(m->devQKVProjArray), + static_cast(m->queryTmp), + static_cast(m->kvCache), + kv_indptr, + kv_indices, + m->request_completed, + peft_req_idx, + m->token_infos, + m->num_q_heads, + m->num_kv_heads, + m->qProjSize, + num_new_tokens); +} + +template +void update_kv_cache_kernel_peft(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + cudaStream_t stream) { + int num_tokens = bc->num_finetuning_fwd_tokens(); + if (num_tokens <= 0) { + return; } + + int tot_num_heads = m->num_q_heads + 2 * m->num_kv_heads; + int head_dim = m->qProjSize; + int i = bc->finetuning_request_index(); + int tokens_previous_requests = + bc->requestsInfo[i].first_token_offset_in_batch; + DT *qkv_ptr = static_cast
(m->devQKVProjArray) + + m->qProjSize * tot_num_heads * tokens_previous_requests; + + int parallelism = head_dim * tot_num_heads * num_tokens; + // devQKVProj has shape [qProjSize, tot_num_heads, num_new_tokens] + store_kv_cache<<>>(qkv_ptr, + static_cast
(m->keyCachePeft), + static_cast
(m->valueCachePeft), + m->token_infos, + num_tokens, + BatchConfig::max_sequence_length(), + head_dim, + m->num_q_heads, + m->num_kv_heads); } -#define LAUNCH_ATTENTION_SCORE_KERNEL( \ - DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ - smem_sz = smem_size_in_bytes
(m->qProjSize, \ - BatchConfig::max_sequence_length(), \ - THREADS_PER_VALUE, \ - THDS_PER_BLOCK); \ - compute_attention_kernel_generation_kernel \ - <<>>( \ - static_cast
(m->devQKVProjArray), \ - static_cast
(m->keyCache), \ - static_cast
(m->valueCache), \ - output_ptr, \ - scale, \ - BatchConfig::max_sequence_length(), \ - m->qProjSize, \ - m->num_q_heads, \ - m->num_kv_heads, \ - m->request_infos) +template +__global__ void produce_output_kernel(DT const *input_ptr, + DT *output_ptr, + int parallelism) { + CUDA_KERNEL_LOOP(idx, parallelism) { + output_ptr[idx] = static_cast
(input_ptr[idx]); + } +} template -void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - DT *output_ptr, - cudaStream_t stream) { - dim3 grid(m->num_q_heads, bc->num_generation_tokens); - int const per_head_size = m->qProjSize; - float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; - size_t smem_sz; - if (per_head_size == 64) { - constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; - LAUNCH_ATTENTION_SCORE_KERNEL( - DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); - } else if (per_head_size == 128) { - constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; - LAUNCH_ATTENTION_SCORE_KERNEL( - DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); - } else { - assert(false && "a unsupported head size"); +void produce_output(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + cudaStream_t stream) { + int const num_tokens = bc->num_inference_tokens(); + if (num_tokens == 0) { + return; } + int parallelism = m->vProjSize * m->num_q_heads * num_tokens; + produce_output_kernel<<>>( + static_cast
(m->outputTmp), output_ptr, parallelism); +} + +template +void flashinfer_incr_attention(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + cudaStream_t stream) { + + // global constant parameters + uint32_t const num_q_heads = m->num_q_heads; + uint32_t const num_kv_heads = m->num_kv_heads; + uint32_t const head_dim = m->qProjSize; + uint32_t const batch_size = bc->num_inference_requests(); + float const sm_scale = + (*m->qk_prod_scaling) ? 1.0f / sqrt(m->qProjSize) : 1.0f; + assert(batch_size > 0); + assert(num_q_heads > 0); + assert(num_kv_heads > 0); + assert(head_dim > 0); + assert(bc->num_inference_tokens() > 0); + + half *q = static_cast(m->queryTmp), + *kv = static_cast(m->kvCache), + *o = static_cast(m->outputTmp); + assert(q != nullptr && "q is null!"); + assert(kv != nullptr && "kv is null!"); + assert(o != nullptr && "o is null!"); + assert(m->handle.incr_attention_metadata->q_indptr != nullptr && + "q_indptr is null!"); + assert(m->handle.incr_attention_metadata->kv_indices != nullptr && + "kv_indices is null!"); + assert(m->handle.incr_attention_metadata->kv_indptr != nullptr && + "kv_indptr is null!"); + assert(m->handle.incr_attention_metadata->kv_last_page_len != nullptr && + "kv_last_page_len is null!"); + paged_kv_t paged_kv( + num_kv_heads, + kPagesize, + head_dim, + batch_size, + QKVLayout::kNHD, + kv, + m->handle.incr_attention_metadata->kv_indices, + m->handle.incr_attention_metadata->kv_indptr, + m->handle.incr_attention_metadata->kv_last_page_len); + + if (m->inference_debugging) { + bc->save_to_file(get_fwd_dbg_folder(m, shard_id) + ".batch_config"); + std::string fpath = get_fwd_dbg_folder(m, shard_id) + ".q_indptr"; + save_tensor( + static_cast(m->handle.incr_attention_metadata->q_indptr), + batch_size + 1, + fpath.c_str()); + fpath = get_fwd_dbg_folder(m, shard_id) + ".kv_indptr"; + save_tensor( + static_cast(m->handle.incr_attention_metadata->kv_indptr), + batch_size + 1, + fpath.c_str()); + fpath = get_fwd_dbg_folder(m, shard_id) + ".kv_indices"; + + int num_pages; + checkCUDA( + cudaMemcpy(&num_pages, + m->handle.incr_attention_metadata->kv_indptr + batch_size, + sizeof(int), + cudaMemcpyDeviceToHost)); + save_tensor( + static_cast(m->handle.incr_attention_metadata->kv_indices), + num_pages, + fpath.c_str()); + fpath = get_fwd_dbg_folder(m, shard_id) + ".kv_last_page_len"; + save_tensor(static_cast( + m->handle.incr_attention_metadata->kv_last_page_len), + batch_size, + fpath.c_str()); + } + + assert(m->handle.incr_attention_metadata->prompt_handler_collections.count( + batch_size) != 0 && + "Handler is not initialized"); + void *handler = + m->handle.incr_attention_metadata->prompt_handler_collections[batch_size]; + // printf("obtained handler\n"); + assert(sizeof(DT) == 2 && "FlashInfer only supports half precision"); + DISPATCH_HEADDIM(head_dim, HEAD_DIM, { + // printf("Launching BatchPrefillWithPagedKVCacheWrapperDispatched\n"); + cudaError_t result = + BatchPrefillWithPagedKVCacheWrapperDispatched( + static_cast(handler), + q, + m->handle.incr_attention_metadata->q_indptr, + /*q_offset=*/nullptr, + paged_kv, + /*custom_mask=*/nullptr, + /*qk_indptr=*/nullptr, + o, + /*lse=*/nullptr, + num_q_heads, + /*window_left=*/-1, + /*logits_soft_cap=*/0.f, + sm_scale, + /*rope_scale=*/1.f, + /*rope_theta=*/static_cast(1e4), + stream); + if (result != cudaSuccess) { + throw std::runtime_error("Failed to run " + "IncrementalDecodingAttentionForwardKernel: " + + std::string(cudaGetErrorString(result))); + } + }); + + 
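// A minimal host-side sketch of the paged-KV metadata consumed by the
// BatchPrefillWithPagedKVCacheWrapperDispatched call above, assuming the
// standard FlashInfer CSR-style convention: kv_indptr (batch_size + 1 entries)
// gives each request's page range in kv_indices, and kv_last_page_len gives
// the number of valid slots in the request's final page. The helper name, the
// toy sequential page allocator, and the example values are hypothetical.
#include <cstdint>
#include <vector>

struct PagedKVMetadata {
  std::vector<int32_t> kv_indptr;        // size: batch_size + 1
  std::vector<int32_t> kv_indices;       // one entry per allocated page
  std::vector<int32_t> kv_last_page_len; // size: batch_size
};

PagedKVMetadata build_paged_kv_metadata(std::vector<int> const &seq_lens,
                                        int page_size /* kPagesize */) {
  PagedKVMetadata md;
  md.kv_indptr.push_back(0);
  int32_t next_free_page = 0; // toy allocator: pages handed out sequentially
  for (int len : seq_lens) {
    int num_pages = (len + page_size - 1) / page_size; // ceil division
    for (int p = 0; p < num_pages; p++) {
      md.kv_indices.push_back(next_free_page++);
    }
    md.kv_indptr.push_back(md.kv_indptr.back() + num_pages);
    int last_len = len % page_size;
    md.kv_last_page_len.push_back(last_len == 0 ? page_size : last_len);
  }
  return md;
}
// Example: seq_lens = {5, 3} with page_size = 4 gives kv_indptr = {0, 2, 3},
// kv_indices = {0, 1, 2}, kv_last_page_len = {1, 3}.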
produce_output(m, bc, output_ptr, stream); } template @@ -1226,7 +1233,8 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, int shard_id, DT const *qkv_ptr, DT *output_ptr, - cudaStream_t stream) { + cudaStream_t inf_stream, + cudaStream_t peft_stream) { // phase 0: copy calculated qkv into devQKVProjArray // [qProjSize, tot_num_heads, num_new_tokens] @@ -1238,7 +1246,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, qkv_ptr, qkv_proj_size * sizeof(DT), cudaMemcpyDeviceToDevice, - stream); + inf_stream); if (m->inference_debugging) { std::string fpath = get_fwd_dbg_folder(m, shard_id) + ".devQKVProjArray"; @@ -1249,7 +1257,7 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, // phase 1: Implement kernel to apply rotary embedding and scaling apply_scaling_and_rotary( - m, bc, shard_id, static_cast
(m->devQKVProjArray), stream); + m, bc, shard_id, static_cast
(m->devQKVProjArray), inf_stream); if (m->inference_debugging) { std::string fpath = get_fwd_dbg_folder(m, shard_id) + ".post_rope"; @@ -1258,31 +1266,18 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, fpath.c_str()); } - update_kv_cache_kernel
(m, bc, stream); - - if (m->inference_debugging) { - size_t key_cache_size = m->kProjSize * m->num_kv_heads * - BatchConfig::max_sequence_length() * - BatchConfig::max_requests_per_batch(); - std::string fpath = get_fwd_dbg_folder(m, shard_id) + ".key_cache"; - save_tensor( - static_cast
(m->keyCache), key_cache_size, fpath.c_str()); - fpath = get_fwd_dbg_folder(m, shard_id) + ".value_cache"; - save_tensor( - static_cast
(m->valueCache), key_cache_size, fpath.c_str()); - } - - if (bc->num_generation_tokens > 0) { - // phase 3: Compute attention score for generation tokens - compute_attention_kernel_generation
(m, bc, output_ptr, stream); - } + // peft-stream work can only start after the shared prep on inf_stream is done + if (bc->num_finetuning_fwd_tokens() > 0) { + // wait until copy to devQKVProjArray and application of scaling & rotary + // have finished + cudaEvent_t prep_done; + cudaEventCreate(&prep_done); + cudaEventRecord(prep_done, inf_stream); + cudaStreamWaitEvent(peft_stream, prep_done, 0); - if (bc->num_tokens > bc->num_generation_tokens) { - // phase 4: Compute attention score for prompt tokens; - compute_attention_kernel_prompt
(m, bc, output_ptr, shard_id, stream); - } + update_kv_cache_kernel_peft
(m, bc, peft_stream); + compute_attention_kernel_peft
(m, bc, output_ptr, shard_id, peft_stream); - if (bc->num_finetuning_fwd_tokens() > 0) { assert(m->peft_token_infos != nullptr); assert(m->peft_token_infos_size == sizeof(BatchConfig::PerTokenInfo) * BatchConfig::max_sequence_length()); @@ -1296,6 +1291,27 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta *m, bc->tokensInfo[tokens_previous_requests + j]; } } + + // flashinfer sdpa + assert(bc->num_finetuning_fwd_tokens() >= 0 && + bc->num_finetuning_bwd_tokens() >= 0); + if (bc->num_inference_tokens() > 0) { + update_kv_cache_kernel_flashinfer
(m, bc, inf_stream); + flashinfer_incr_attention
(m, bc, shard_id, output_ptr, inf_stream); + } + + // if (m->inference_debugging) { + // size_t key_cache_size = m->kProjSize * m->num_kv_heads * + // BatchConfig::max_sequence_length() * + // BatchConfig::max_requests_per_batch(); + // std::string fpath = get_fwd_dbg_folder(m, shard_id) + ".key_cache"; + // save_tensor( + // static_cast
(m->keyCache), key_cache_size, fpath.c_str()); + // fpath = get_fwd_dbg_folder(m, shard_id) + ".value_cache"; + // save_tensor( + // static_cast
(m->valueCache), key_cache_size, + // fpath.c_str()); + // } } std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, @@ -1382,10 +1398,10 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int shard_id, DT *input_grad_ptr, DT const *output_grad_ptr, - cudaStream_t stream) { + cudaStream_t peft_stream) { assert(!m->offload); - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + checkCUDA(cublasSetStream(m->handle.peft_blas, peft_stream)); + checkCUDNN(cudnnSetStream(m->handle.peft_dnn, peft_stream)); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); @@ -1402,6 +1418,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(num_tokens == num_total_tokens); assert(num_total_tokens == bc->requestsInfo[i].max_length); assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + assert(bc->requestsInfo[i].first_token_offset_in_batch == 0); // Step 1: copy gradient before final projection into workspace { @@ -1414,7 +1431,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->oProjSize, m_ * n_ * sizeof(DT), cudaMemcpyDeviceToDevice, - stream); + peft_stream); if (m->inference_debugging) { // save result to file for checking std::string filename = @@ -1436,14 +1453,14 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // note that we first need to compute the gradients wrt each q_heads, then // we can sum the gradients corresponding to each group of q_heads to obtain // the gradients wrt each value head - DT *C = static_cast
(m->devQKVProjArray) + + DT *C = static_cast
(m->devQKVProjArrayBWD) + 2 * num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved // for Q and K gradients // after transpositions - int m_ = num_tokens; // total_tokens - int n_ = m->vProjSize; // num_new_tokens - int k_ = num_tokens; // num_new_tokens + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; + int k_ = num_tokens; // num_new_tokens // before transpositions int lda = num_tokens; // num_new_tokens int ldb = m->vProjSize * m->num_q_heads; @@ -1492,10 +1509,9 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // matrix A's layout: [vProjSize * num_q_heads, num_new_tokens] DT const *A = static_cast
(m->handle.workSpace); // matrix B: value cache - // matrix B's layout: [vProjSize * num_kv_heads, max_num_tokens, num_req] - DT const *B = - static_cast
(m->valueCache) + - i * m->vProjSize * m->num_kv_heads * BatchConfig::max_sequence_length(); + // matrix B's layout: [vProjSize * num_kv_heads, max_num_tokens, 1] + DT const *B = static_cast
(m->valueCachePeft); + // matrix C: qk_prods_softmax gradients // matrix C's layout: [num_new_tokens, total_tokens, num_q_heads] DT *C = static_cast
(m->qk_prods_softmax); @@ -1512,7 +1528,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens run_batched_matmul
(m, - m->handle.blas, + m->handle.peft_blas, CUBLAS_OP_T, CUBLAS_OP_N, m_, @@ -1535,7 +1551,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP, - stream, + peft_stream, 1, m->num_q_heads / m->num_kv_heads, 1, @@ -1566,7 +1582,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, c_param, h_param, w_param)); - checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + checkCUDNN(cudnnSoftmaxBackward(m->handle.peft_dnn, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, @@ -1598,12 +1614,13 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, fill_entries_above_diagonal<<>>(static_cast
(m->qk_prods), - num_tokens, - num_tokens, - m->num_q_heads, - entries_above_diagonal, - DT(0.0f)); + peft_stream>>>( + static_cast
(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); } if (m->inference_debugging) { DT *C = static_cast
(m->qk_prods); @@ -1625,12 +1642,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // matrix B: query activation (in query_activation_buffer) // matrix B's layout: [m->qProjSize * num_q_heads, num_new_tokens] DT const *B = static_cast
(m->query_activation_buffer); - // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C: gradients for key (saved as part of m->devQKVProjArrayBWD) // matrix C's layout: [num_tokens, qProjsize * num_q_heads, 3] // note that we first need to compute the gradients wrt each q_heads, then // we can sum the gradients corresponding to each group of q_heads to obtain // the gradients wrt each key head - DT *C = static_cast
(m->devQKVProjArray) + + DT *C = static_cast
(m->devQKVProjArrayBWD) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients @@ -1645,7 +1662,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int strideA = num_tokens * num_tokens; int strideB = m->kProjSize; int strideC = num_tokens * m->kProjSize; - checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + checkCUDA(cublasGemmStridedBatchedEx(m->handle.peft_blas, CUBLAS_OP_T, CUBLAS_OP_T, m_, @@ -1690,12 +1707,10 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, DT const *A = static_cast
(m->qk_prods); // matrix B: key cache // matrix B's layout: [vProjSize * num_kv_heads, max_num_tokens, num_req] - DT const *B = - static_cast
(m->keyCache) + - i * m->kProjSize * m->num_kv_heads * BatchConfig::max_sequence_length(); - // matrix C: gradients for query (saved as part of m->devQKVProjArray) + DT const *B = static_cast
(m->keyCachePeft); + // matrix C: gradients for query (saved as part of m->devQKVProjArrayBWD) // matrix C's layout: [num_tokens, qProjsize * num_q_heads, 3] - DT *C = static_cast
(m->devQKVProjArray); + DT *C = static_cast
(m->devQKVProjArrayBWD); // after transposition & striding int m_ = num_tokens; // num_new_tokens int n_ = m->qProjSize; @@ -1708,7 +1723,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, int strideB = m->qProjSize; int strideC = num_tokens * m->qProjSize; run_batched_matmul
(m, - m->handle.blas, + m->handle.peft_blas, CUBLAS_OP_N, CUBLAS_OP_T, m_, @@ -1731,7 +1746,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP, - stream, + peft_stream, 1, m->num_q_heads / m->num_kv_heads, 1, @@ -1751,8 +1766,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, m->peft_token_infos, m->peft_token_infos_size, cudaMemcpyHostToDevice, - stream)); - assert(m->hidden_size == m->qProjSize * m->num_q_heads); + peft_stream)); assert(m->qProjSize == m->kProjSize); /*q&k*/ int half_proj = m->qProjSize / 2; @@ -1762,8 +1776,8 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, apply_rotary_embedding_bwd<<>>( - static_cast
(m->devQKVProjArray), + peft_stream>>>( + static_cast
(m->devQKVProjArrayBWD), m->complex_input, m->peft_token_infos_device, m->rotary_embedding_meta->rope_theta, @@ -1776,7 +1790,7 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, num_tokens, m->num_q_heads, m->num_kv_heads); - DT *C = static_cast
(m->devQKVProjArray); + DT *C = static_cast
(m->devQKVProjArrayBWD); if (m->inference_debugging) { std::string filename = get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; @@ -1786,9 +1800,9 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, } } - // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C: gradients for key (saved as part of m->devQKVProjArrayBWD) // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] - DT *C = static_cast
(m->devQKVProjArray) + + DT *C = static_cast
(m->devQKVProjArrayBWD) + num_tokens * (m->qProjSize * m->num_q_heads); // skip over regions reserved for Q gradients @@ -1806,13 +1820,12 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, beta = 1.0f; } // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) - // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] - DT const *B = static_cast
(m->devQKVProjArray); + // matrix B's layout: [num_tokens, qProjsize * tot_num_heads] + DT const *B = static_cast
(m->devQKVProjArrayBWD); // matrix C: gradients w.r.t. input - // matrix C's layout: [m->qSize, num_tokens] - DT *C = input_grad_ptr + - bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; - // int m_ = m->qSize; + // matrix C's layout: [qProjsize * tot_num_heads, num_tokens] + DT *C = input_grad_ptr + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads; int n_ = num_tokens; int k_ = m->qProjSize * (m->num_q_heads + 2 * m->num_kv_heads); @@ -1820,12 +1833,13 @@ void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, // do further calculation in a way different than the usual dense layer, // they are off by a transpose. So an explicit transpose is needed here. // The add here is just for gradient accumulation. - transposeAdd(C, B, n_, k_, alpha, beta, stream); + transposeAdd(C, B, n_, k_, alpha, beta, peft_stream); if (m->inference_debugging) { std::string filename = get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; - save_tensor(C, num_tokens * m->qSize, filename.c_str()); + save_tensor( + C, num_tokens * m->qProjSize * m->num_q_heads, filename.c_str()); } } } @@ -1842,37 +1856,49 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( int shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - - cudaEvent_t t_start, t_end; - if (m->profiling) { - cudaEventCreate(&t_start); - cudaEventCreate(&t_end); - cudaEventRecord(t_start, stream); - } + cudaStream_t inf_stream; + checkCUDA(get_legion_stream(&inf_stream)); + cudaStream_t peft_stream; + checkCUDA(get_legion_stream(&peft_stream)); + + // cudaEvent_t t_start, t_end; + // if (m->profiling) { + // cudaEventCreate(&t_start); + // cudaEventCreate(&t_end); + // cudaEventRecord(t_start, stream); + // } assert(input.data_type == output.data_type); if (input.data_type == DT_HALF) { - Kernels::IncMultiHeadAttention::inference_kernel( - m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); + Kernels::IncMultiHeadAttention::inference_kernel(m, + bc, + shard_id, + input.get_half_ptr(), + output.get_half_ptr(), + inf_stream, + peft_stream); } else if (input.data_type == DT_FLOAT) { - Kernels::IncMultiHeadAttention::inference_kernel( - m, bc, shard_id, input.get_float_ptr(), output.get_float_ptr(), stream); + Kernels::IncMultiHeadAttention::inference_kernel(m, + bc, + shard_id, + input.get_float_ptr(), + output.get_float_ptr(), + inf_stream, + peft_stream); } else { assert(false && "Unspported data type"); } - if (m->profiling) { - cudaEventRecord(t_end, stream); - checkCUDA(cudaEventSynchronize(t_end)); - float elapsed = 0; - checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); - cudaEventDestroy(t_start); - cudaEventDestroy(t_end); - printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed); - } + // if (m->profiling) { + // cudaEventRecord(t_end, stream); + // checkCUDA(cudaEventSynchronize(t_end)); + // float elapsed = 0; + // checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + // cudaEventDestroy(t_start); + // cudaEventDestroy(t_end); + // printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed); + // } } /*static*/ @@ -1928,15 +1954,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, MemoryAllocator &gpu_mem_allocator, - int num_samples, int _num_q_heads, int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, INC_DECODING_MODE, attn, - attn->qSize, - 
attn->kSize, - attn->vSize, attn->qProjSize, attn->kProjSize, attn->vProjSize, @@ -1947,11 +1969,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( attn->position_bias, attn->scaling_factor, gpu_mem_allocator, - num_samples, attn->num_q_heads, attn->num_kv_heads, _num_q_heads, _num_kv_heads, + attn->num_kv_cache_pages, attn->quantization_type, attn->offload) {} @@ -1959,9 +1981,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, InferenceMode infer_mode, Op const *attn, - int _qSize, - int _kSize, - int _vSize, int _qProjSize, int _kProjSize, int _vProjSize, @@ -1972,11 +1991,11 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( bool _position_bias, float _scaling_factor, MemoryAllocator &gpu_mem_allocator, - int num_samples, int _global_num_q_heads, int _global_num_kv_heads, int _num_q_heads, int _num_kv_heads, + int _num_kv_cache_pages, DataType _quantization_type, bool _offload) : OpMeta(handler, attn) { @@ -1984,17 +2003,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); checkCUDNN(cudnnCreateTensorDescriptor(&qk_tensor)); - qSize = _qSize; - kSize = _kSize; - vSize = _vSize; // assume dimensions match for now - assert(qSize == kSize); - assert(kSize == vSize); qProjSize = _qProjSize; kProjSize = _kProjSize; - assert(qProjSize == kProjSize); // required for attention QK.T matmul vProjSize = _vProjSize; oProjSize = _oProjSize; + assert(qProjSize == kProjSize && + kProjSize == vProjSize); // required for attention QK.T matmul size_t size_of_dt = data_type_size(attn->data_type); quantization_type = _quantization_type; offload = _offload; @@ -2003,7 +2018,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( global_num_kv_heads = _global_num_kv_heads; num_q_heads = _num_q_heads; num_kv_heads = _num_kv_heads; - hidden_size = num_q_heads * qProjSize; rotary_embedding_meta = (RotaryEmbeddingMeta *)calloc(1, sizeof(RotaryEmbeddingMeta)); @@ -2016,59 +2030,96 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( position_bias = (bool *)calloc(1, sizeof(bool)); *position_bias = _position_bias; - assert(num_q_heads % num_kv_heads == 0 && - "num_q_heads must be divisible by num_kv_heads"); - if (num_q_heads > num_kv_heads) { - // grouped query attention - assert(attn->data_type == DT_FLOAT || - attn->data_type == DT_HALF && "Unsupported data type"); - gqa_ptr_array_size = num_q_heads * sizeof(void *); + num_kv_cache_pages = _num_kv_cache_pages; + assert(num_kv_cache_pages > 0 || enable_peft_finetuning); + + // spec decoding and peft finetuning are mutually exclusive + if (enable_peft_finetuning) { + assert(infer_mode == INC_DECODING_MODE); } - // allocate memory for the seqArray and reserve space + size_t totalSize = 0; + + // Compute total GPU memory size needed { - int max_tokens_per_batch = infer_mode == TREE_VERIFY_MODE + // 1. GQA pointers for batch matmul. Used by PEFT and spec_inc if + // num_q_heads > num_kv_heads + if (num_q_heads > num_kv_heads && + (enable_peft_finetuning || infer_mode == BEAM_SEARCH_MODE)) { + assert(num_q_heads % num_kv_heads == 0 && + "num_q_heads must be divisible by num_kv_heads"); + assert(attn->data_type == DT_FLOAT || + attn->data_type == DT_HALF && "Unsupported data type"); + gqa_ptr_array_size = num_q_heads * sizeof(void *); + totalSize += 3 * gqa_ptr_array_size; // fwd + if (enable_peft_finetuning) { + totalSize += 3 * gqa_ptr_array_size; // bwd + } + } + + // 2. 
KV cache + key_cache_size = value_cache_size = + num_kv_heads * kProjSize * kPagesize * num_kv_cache_pages; + if (infer_mode == BEAM_SEARCH_MODE || infer_mode == TREE_VERIFY_MODE) { + // a K-ary tree max node is (k^n - 1) / 2 + assert(key_cache_size == value_cache_size); + assert(key_cache_size >= + num_kv_heads * kProjSize * + BeamSearchBatchConfig::max_requests_per_batch() * + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num())); + } + totalSize += (key_cache_size + value_cache_size) * size_of_dt; + if (enable_peft_finetuning) { + // add kv cache for single sequence + peft_key_cache_size = peft_value_cache_size = + num_kv_heads * kProjSize * BatchConfig::max_sequence_length(); + totalSize += (peft_key_cache_size + peft_value_cache_size) * size_of_dt; + } + + // 3. buffers for intermediate results + int tot_num_heads = num_q_heads + 2 * num_kv_heads; + int max_tokens_per_batch = (infer_mode == TREE_VERIFY_MODE) ? BatchConfig::max_verify_tokens_per_batch() : BatchConfig::max_tokens_per_batch(); - size_t qkv_max_proj_size = - max_tokens_per_batch * - (qProjSize * num_q_heads + kProjSize * num_kv_heads + - vProjSize * num_kv_heads); - size_t query_tmp_size = 0, key_cache_size = 0, value_cache_size = 0; - switch (infer_mode) { - case INC_DECODING_MODE: { - key_cache_size = num_kv_heads * kProjSize * - BatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length(); - value_cache_size = num_kv_heads * vProjSize * - BatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length(); - break; - } - case BEAM_SEARCH_MODE: - case TREE_VERIFY_MODE: { - // a K-ary tree max node is (k^n - 1) / 2 - key_cache_size = num_kv_heads * kProjSize * - BeamSearchBatchConfig::max_requests_per_batch() * - (BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num()); - value_cache_size = num_kv_heads * vProjSize * - BeamSearchBatchConfig::max_requests_per_batch() * - (BatchConfig::max_sequence_length() + - BatchConfig::max_spec_tree_token_num()); - break; - } - default: - assert(false && "Unkown inference mode"); + // devQKVProjArray + qkv_max_proj_size = qProjSize * tot_num_heads * max_tokens_per_batch; + totalSize += qkv_max_proj_size * size_of_dt; + if (enable_peft_finetuning) { + qkv_max_proj_size_bwd = + qProjSize * tot_num_heads * BatchConfig::max_sequence_length(); + totalSize += qkv_max_proj_size_bwd * size_of_dt; + } + // queryTmp and outputTmp: only for paged attention + if (infer_mode == INC_DECODING_MODE) { + query_tmp_size = num_q_heads * qProjSize * max_tokens_per_batch; + output_tmp_size = max_tokens_per_batch * num_q_heads * vProjSize; + totalSize += (query_tmp_size + output_tmp_size) * size_of_dt; + } + // complex_input & complex_input_bwd + complex_size = max_tokens_per_batch * qProjSize * + (num_q_heads + num_kv_heads) / + 2; // only used for Q and K, not V + totalSize += complex_size * sizeof(cuFloatComplex); + if (enable_peft_finetuning) { + complex_size_bwd = BatchConfig::max_sequence_length() * qProjSize * + (num_q_heads + num_kv_heads) / + 2; // only used for Q and K, not V + totalSize += complex_size_bwd * sizeof(cuFloatComplex); } - size_t requestinfo_size = BatchConfig::max_requests_per_batch(); - // size_t tokeninfo_size = max_tokens_per_batch; - size_t qk_prod_size = - max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; - size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; - size_t complex_size = (max_tokens_per_batch * (qProjSize * num_q_heads + - kProjSize * num_kv_heads)) 
/ - 2; + // QK prods and QK prods (softmax) + if (infer_mode == BEAM_SEARCH_MODE) { + qk_prod_size = max_tokens_per_batch * BatchConfig::max_sequence_length() * + num_q_heads; + totalSize += 2 * qk_prod_size * size_of_dt; + } else if (enable_peft_finetuning) { + // only need one copy as they can be reused by PEFT fwd and PEFT bwd, as + // they never run concurrently + qk_prod_size = BatchConfig::max_sequence_length() * + BatchConfig::max_sequence_length() * num_q_heads; + totalSize += 2 * qk_prod_size * size_of_dt; + } + // PEFT partial results buffers if (enable_peft_finetuning) { allocated_peft_buffer_size1 = BatchConfig::max_sequence_length() * num_q_heads * qProjSize * size_of_dt; @@ -2081,66 +2132,22 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::max_sequence_length()); peft_token_infos_size = sizeof(BatchConfig::PerTokenInfo) * BatchConfig::max_sequence_length(); - } else { - allocated_peft_buffer_size1 = 0; - allocated_peft_buffer_size2 = 0; - peft_token_infos = nullptr; - peft_token_infos_size = 0; - } - size_t totalSize = (qkv_max_proj_size + query_tmp_size + key_cache_size + - value_cache_size + 2 * qk_prod_size + attn_heads_size) * - size_of_dt + - complex_size * sizeof(cuFloatComplex) + - 3 * gqa_ptr_array_size; - if (enable_peft_finetuning) { totalSize += allocated_peft_buffer_size1 + allocated_peft_buffer_size2; totalSize += peft_token_infos_size; - totalSize += 3 * gqa_ptr_array_size; - } - if (offload) { - // assert that we have enough reserved work space left - size_t totalSharedSize = - infer_mode == TREE_VERIFY_MODE - ? totalSize - (query_tmp_size + key_cache_size + - value_cache_size + qkv_max_proj_size) * - size_of_dt - : totalSize - - (query_tmp_size + key_cache_size + value_cache_size) * - size_of_dt; - - size_t instance_size = - size_of_dt * - (infer_mode == TREE_VERIFY_MODE - ? query_tmp_size + key_cache_size + value_cache_size + - qkv_max_proj_size - : query_tmp_size + key_cache_size + value_cache_size); - - assert(gpu_mem_allocator.reserved_total_size - - gpu_mem_allocator.reserved_allocated_size >= - totalSharedSize); - gpu_mem_allocator.create_legion_instance( - reserveInst, instance_size, "IncMultiHeadSelfAttentionMeta"); - } else { - gpu_mem_allocator.create_legion_instance( - reserveInst, totalSize, "IncMultiHeadSelfAttentionMeta"); } - // in tree_verify, enable devQKVProjArray; - if (!offload || infer_mode == TREE_VERIFY_MODE) { - devQKVProjArray = gpu_mem_allocator.allocate_instance_untyped( - qkv_max_proj_size * size_of_dt); - } else { - devQKVProjArray = gpu_mem_allocator.allocate_reserved_untyped( - qkv_max_proj_size * size_of_dt); - // offset += qkv_max_proj_size * size_of_dt; + // 4. offload: TBD + if (offload) { + assert(false && "TODO"); } + } - // use key value cache in all mode. 
- keyCache = gpu_mem_allocator.allocate_instance_untyped(key_cache_size * - size_of_dt); - valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * - size_of_dt); + // Allocate chunk of memory + gpu_mem_allocator.create_legion_instance( + reserveInst, totalSize, "IncMultiHeadSelfAttentionMeta"); + // Assign pointers from chunk of memory + { // gqa pointers if (num_q_heads > num_kv_heads) { assert(num_q_heads % num_kv_heads == 0 && @@ -2161,31 +2168,58 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( } } - token_infos = static_cast( - handler.batch_config_metadata->tokens_info); - request_infos = static_cast( - handler.batch_config_metadata->requestsInfo); - - if (offload) { - qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * + // KV cache + if (infer_mode == INC_DECODING_MODE) { + kvCache = gpu_mem_allocator.allocate_instance_untyped( + (key_cache_size + value_cache_size) * size_of_dt); + keyCache = valueCache = nullptr; + } else { + kvCache = nullptr; + keyCache = gpu_mem_allocator.allocate_instance_untyped(key_cache_size * size_of_dt); - qk_prods_softmax = gpu_mem_allocator.allocate_reserved_untyped( - qk_prod_size * size_of_dt); - attn_heads = gpu_mem_allocator.allocate_reserved_untyped(attn_heads_size * - size_of_dt); - complex_input = - gpu_mem_allocator.allocate_reserved(complex_size); + valueCache = gpu_mem_allocator.allocate_instance_untyped( + value_cache_size * size_of_dt); + } + if (enable_peft_finetuning) { + assert(infer_mode == INC_DECODING_MODE); + keyCachePeft = gpu_mem_allocator.allocate_instance_untyped( + peft_key_cache_size * size_of_dt); + valueCachePeft = gpu_mem_allocator.allocate_instance_untyped( + peft_value_cache_size * size_of_dt); } else { + keyCachePeft = valueCachePeft = nullptr; + } + + // intermediate buffers + // devQKVProjArray: used to store QKV proj so that we can modify them (apply + // rope, etc) + devQKVProjArray = gpu_mem_allocator.allocate_instance_untyped( + qkv_max_proj_size * size_of_dt); + // devQKVProjArrayBWD + if (enable_peft_finetuning) { + devQKVProjArrayBWD = gpu_mem_allocator.allocate_instance_untyped( + qkv_max_proj_size_bwd * size_of_dt); + } + // queryTmp and outputTmp: only for paged attention + if (infer_mode == INC_DECODING_MODE) { + queryTmp = gpu_mem_allocator.allocate_instance_untyped(query_tmp_size * + size_of_dt); + outputTmp = gpu_mem_allocator.allocate_instance_untyped(output_tmp_size * + size_of_dt); + } + // complex input + complex_input = + gpu_mem_allocator.allocate_instance(complex_size); + complex_input_bwd = + gpu_mem_allocator.allocate_instance(complex_size_bwd); + // qk_prods, qk_prods_softmax + if (infer_mode == BEAM_SEARCH_MODE || enable_peft_finetuning) { qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( qk_prod_size * size_of_dt); - attn_heads = gpu_mem_allocator.allocate_instance_untyped(attn_heads_size * - size_of_dt); - complex_input = - gpu_mem_allocator.allocate_instance(complex_size); } - + // peft partial result buffers if (enable_peft_finetuning) { query_activation_buffer = gpu_mem_allocator.allocate_instance_untyped( allocated_peft_buffer_size1); @@ -2196,6 +2230,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( peft_token_infos_size); } + token_infos = static_cast( + handler.batch_config_metadata->tokens_info); + request_infos = static_cast( + handler.batch_config_metadata->requestsInfo); + request_completed = + 
static_cast(handler.batch_config_metadata->request_completed); + // allocate more size for quantization data if (quantization_type != DT_NONE) { assert(offload); @@ -2206,6 +2247,18 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( } } + // ensure we have consumed the allocated memory + assert(gpu_mem_allocator.reserved_total_size == + gpu_mem_allocator.reserved_allocated_size); + + // set attention constants + // std::cerr << "Enabling incr attention metadata for handler incr meta: " + // << handler.incr_attention_metadata << std::endl; + handler.incr_attention_metadata->set_enabled(true); + handler.incr_attention_metadata->set_num_q_heads(num_q_heads); + handler.incr_attention_metadata->set_num_kv_heads(num_kv_heads); + handler.incr_attention_metadata->set_head_dim(qProjSize); + cudaStreamSynchronize(stream); } @@ -2277,32 +2330,60 @@ template void Kernels::IncMultiHeadAttention::run_batched_matmul( int batch_ratio_c, bool bwd); +template void Kernels::IncMultiHeadAttention::apply_scaling_and_rotary( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + cudaStream_t inf_stream); + +template void Kernels::IncMultiHeadAttention::apply_scaling_and_rotary( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + cudaStream_t inf_stream); + template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + Kernels::IncMultiHeadAttention::update_kv_cache_kernel_flashinfer( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, - float *output_ptr, cudaStream_t stream); template void - Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + Kernels::IncMultiHeadAttention::update_kv_cache_kernel_flashinfer( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, - half *output_ptr, cudaStream_t stream); -template void Kernels::IncMultiHeadAttention::apply_scaling_and_rotary( +template void Kernels::IncMultiHeadAttention::produce_output( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, - int shard_id, float *output_ptr, cudaStream_t stream); -template void Kernels::IncMultiHeadAttention::apply_scaling_and_rotary( +template void Kernels::IncMultiHeadAttention::produce_output( IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, - int shard_id, half *output_ptr, cudaStream_t stream); +template __global__ void + Kernels::IncMultiHeadAttention::apply_position_bias_qkprd( + float *input_ptr, + int num_tokens, + int num_total_tokens, + int num_heads, + int global_num_q_heads, + int shard_id); + +template __global__ void + Kernels::IncMultiHeadAttention::apply_position_bias_qkprd( + half *input_ptr, + int num_tokens, + int num_total_tokens, + int num_heads, + int global_num_q_heads, + int shard_id); + }; // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index e365a6c8b..d246c84dd 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -569,8 +569,8 @@ void peft_bwd_kernel(LinearMeta const *m, int in_dim, int out_dim, ffStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + checkCUDA(cublasSetStream(m->handle.peft_blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.peft_dnn, stream)); assert( bc->peft_bwd_applies_to_this_layer(m->layer_guid.transformer_layer_id)); @@ -619,7 +619,7 @@ void peft_bwd_kernel(LinearMeta const *m, } if (input_grad_ptr != NULL) { - 
checkCUDA(cublasGemmEx(m->handle.blas, + checkCUDA(cublasGemmEx(m->handle.peft_blas, CUBLAS_OP_N, CUBLAS_OP_N, in_dim, diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 4cde72686..b07746fcc 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -313,8 +313,8 @@ void peft_bwd_kernel(Context ctx, int in_dim, int out_dim, ffStream_t stream) { - checkCUDA(cublasSetStream(m->handle.blas, stream)); - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + checkCUDA(cublasSetStream(m->handle.peft_blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.peft_dnn, stream)); cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == output_type); @@ -363,7 +363,7 @@ void peft_bwd_kernel(Context ctx, lora_config.rank * num_peft_tokens, filename.c_str()); } - checkCUDA(cublasGemmEx(m->handle.blas, + checkCUDA(cublasGemmEx(m->handle.peft_blas, CUBLAS_OP_N, CUBLAS_OP_T, lora_config.rank, @@ -388,7 +388,7 @@ void peft_bwd_kernel(Context ctx, // low_rank_activation { DT alpha = 1.0f, beta = 0.0f; - checkCUDA(cublasGemmEx(m->handle.blas, + checkCUDA(cublasGemmEx(m->handle.peft_blas, CUBLAS_OP_N, CUBLAS_OP_N, lora_config.rank, @@ -415,7 +415,7 @@ void peft_bwd_kernel(Context ctx, DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) ? 0.0f : 1.0f; - checkCUDA(cublasGemmEx(m->handle.blas, + checkCUDA(cublasGemmEx(m->handle.peft_blas, CUBLAS_OP_N, CUBLAS_OP_T, in_dim, @@ -440,7 +440,7 @@ void peft_bwd_kernel(Context ctx, if (input_grad_ptr != nullptr) { DT alpha = 1.0f; DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; - checkCUDA(cublasGemmEx(m->handle.blas, + checkCUDA(cublasGemmEx(m->handle.peft_blas, CUBLAS_OP_N, CUBLAS_OP_N, in_dim, @@ -493,7 +493,7 @@ void peft_bwd_kernel(Context ctx, w1_num_elements, nccl_data_type, ncclSum, - m->handle.ncclComm, + m->handle.ncclCommPeft, stream)); runtime->concurrent_task_barrier(ctx); #else diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 0f5f49cf0..7186d5f36 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -53,40 +53,6 @@ bool SpecIncMultiHeadSelfAttentionParams::is_valid( } Tensor FFModel::spec_inc_multihead_self_attention( - Tensor const input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - RotaryEmbeddingMeta rotary_embedding_meta, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { - return spec_inc_multiquery_self_attention(input, - embed_dim, - num_heads, - num_heads, - kdim, - vdim, - dropout, - add_zero_attn, - data_type, - kernel_initializer, - rotary_embedding_meta, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - name); -} - -Tensor FFModel::spec_inc_multiquery_self_attention( Tensor const input, int embed_dim, int num_q_heads, @@ -161,6 +127,7 @@ Tensor FFModel::spec_inc_multiquery_self_attention( li->add_float_property("scaling_factor", scaling_factor); li->add_int_property("qk_prod_scaling", qk_prod_scaling); li->add_int_property("position_bias", position_bias); + li->add_int_property("num_kv_cache_pages", get_num_kv_cache_pages()); layers.push_back(li); return li->outputs[0]; } @@ -206,6 +173,8 @@ Op 
*SpecIncMultiHeadSelfAttention::create_operator_from_layer( bool qk_prod_scaling = (bool)value; layer->get_int_property("position_bias", value); bool position_bias = (bool)value; + layer->get_int_property("num_kv_cache_pages", value); + int num_kv_cache_pages = (int)value; return new SpecIncMultiHeadSelfAttention(model, layer->layer_guid, @@ -222,6 +191,7 @@ Op *SpecIncMultiHeadSelfAttention::create_operator_from_layer( scaling_factor, qk_prod_scaling, position_bias, + num_kv_cache_pages, layer->name); } @@ -241,6 +211,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, + int _num_kv_cache_pages, char const *name) : Op(model, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, @@ -252,10 +223,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), add_zero_attn(_add_zero_attn), - rotary_embedding_meta(_rotary_embedding_meta), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), + rotary_embedding_meta(_rotary_embedding_meta), qProjSize(_kdim), + kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) { @@ -274,6 +243,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); + + num_kv_cache_pages = _num_kv_cache_pages; } SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( @@ -291,6 +262,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( float _scaling_factor, bool _qk_prod_scaling, bool _position_bias, + int _num_kv_cache_pages, char const *name) : Op(model, OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, @@ -302,10 +274,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), add_zero_attn(_add_zero_attn), - rotary_embedding_meta(_rotary_embedding_meta), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), + rotary_embedding_meta(_rotary_embedding_meta), qProjSize(_kdim), + kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias) { @@ -319,6 +289,8 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( // Currently require no parallelism along this dim assert(dims[0].degree == 1); + num_kv_cache_pages = _num_kv_cache_pages; + outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); } @@ -342,6 +314,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( other.scaling_factor, other.qk_prod_scaling, other.position_bias, + other.num_kv_cache_pages, other.name) {} SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( @@ -364,6 +337,7 @@ SpecIncMultiHeadSelfAttention::SpecIncMultiHeadSelfAttention( params.scaling_factor, params.qk_prod_scaling, params.position_bias, + params.num_kv_cache_pages, params.name) {} 
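For context on how the new num_kv_cache_pages parameter threaded through these operators is sized: compute_num_kv_cache_pages_needed() in src/runtime/page_manager.cc (added further below in this patch) rounds max_seq_len * batch_size up to whole pages via round_up_pages(), which by its usage divides a token count by kPagesize tokens per page, rounding up. A minimal sketch of that arithmetic, assuming an illustrative kPagesize of 64 (the real constant is defined in the page-manager headers, not shown in this hunk):

// Illustrative sketch only; mirrors round_up_pages() /
// compute_num_kv_cache_pages_needed() from src/runtime/page_manager.cc.
constexpr int kPagesizeExample = 64; // assumed tokens-per-page for this sketch

int round_up_pages_example(int num_tokens) {
  // ceiling division: number of whole pages needed to hold num_tokens
  return (num_tokens + kPagesizeExample - 1) / kPagesizeExample;
}

int num_kv_cache_pages_example(int max_seq_len, int batch_size) {
  // e.g. max_seq_len = 2048, batch_size = 8 -> 16384 tokens -> 256 pages
  return round_up_pages_example(max_seq_len * batch_size);
}

The resulting count is presumably what FFModel::get_num_kv_cache_pages() returns when the attention layers record it as an int property in the hunks above.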
void SpecIncMultiHeadSelfAttention::init_inference( @@ -466,7 +440,6 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( ctx, runtime); - int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); int num_q_heads = attn->num_q_heads; @@ -477,7 +450,7 @@ OpMeta *SpecIncMultiHeadSelfAttention::init_task( MemoryAllocator gpu_mem_allocator(gpu_mem); // We don't do offloading for SSMs (small speculative models) SpecIncMultiHeadSelfAttentionMeta *m = new SpecIncMultiHeadSelfAttentionMeta( - handle, attn, gpu_mem_allocator, num_samples, num_q_heads, num_kv_heads); + handle, attn, gpu_mem_allocator, num_q_heads, num_kv_heads); // assert that we didn't over allocate memory assert(gpu_mem_allocator.instance_allocated_size == gpu_mem_allocator.instance_total_size); @@ -628,7 +601,8 @@ bool operator==(SpecIncMultiHeadSelfAttentionParams const &lhs, lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && - lhs.position_bias == rhs.position_bias; + lhs.position_bias == rhs.position_bias && + lhs.num_kv_cache_pages == rhs.num_kv_cache_pages; } SpecIncMultiHeadSelfAttentionParams @@ -647,6 +621,7 @@ SpecIncMultiHeadSelfAttentionParams params.scaling_factor = this->scaling_factor; params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; + params.num_kv_cache_pages = this->num_kv_cache_pages; if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } @@ -680,6 +655,7 @@ size_t hash::operator()( hash_combine(key, params.scaling_factor); hash_combine(key, params.qk_prod_scaling); hash_combine(key, params.position_bias); + hash_combine(key, params.num_kv_cache_pages); return key; } }; // namespace std diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index f152fb30d..e0b27d1bc 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -394,9 +394,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipStream_t stream) { int num_tokens = bc->num_active_tokens(); int tot_num_heads = m->num_q_heads + 2 * m->num_kv_heads; - assert(m->hidden_size % m->num_q_heads == 0); - int head_dim = m->hidden_size / m->num_q_heads; - assert(head_dim == m->qProjSize); + int head_dim = m->qProjSize; int curr_depth = bc->beamRequestsInfo[0].current_depth; if (num_tokens > 0) { int parallelism = head_dim * tot_num_heads * num_tokens; @@ -843,15 +841,11 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, MemoryAllocator &gpu_mem_allocator, - int num_samples, int _num_q_heads, int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, BEAM_SEARCH_MODE, attn, - attn->qSize, - attn->kSize, - attn->vSize, attn->qProjSize, attn->kProjSize, attn->vProjSize, @@ -862,11 +856,11 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->position_bias, attn->scaling_factor, gpu_mem_allocator, - num_samples, attn->num_q_heads, attn->num_kv_heads, _num_q_heads, _num_kv_heads, + attn->num_kv_cache_pages, DT_NONE, false) { hipStream_t stream; diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index b111e9f47..c23ddf339 100644 --- 
a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -370,9 +370,7 @@ void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, cudaStream_t stream) { int num_tokens = bc->num_active_tokens(); int tot_num_heads = m->num_q_heads + 2 * m->num_kv_heads; - assert(m->hidden_size % m->num_q_heads == 0); - int head_dim = m->hidden_size / m->num_q_heads; - assert(head_dim == m->qProjSize); + int head_dim = m->qProjSize; int curr_depth = bc->beamRequestsInfo[0].current_depth; if (num_tokens > 0) { int parallelism = head_dim * tot_num_heads * num_tokens; @@ -818,15 +816,11 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( FFHandler handler, SpecIncMultiHeadSelfAttention const *attn, MemoryAllocator &gpu_mem_allocator, - int num_samples, int _num_q_heads, int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, BEAM_SEARCH_MODE, attn, - attn->qSize, - attn->kSize, - attn->vSize, attn->qProjSize, attn->kProjSize, attn->vProjSize, @@ -837,11 +831,11 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( attn->position_bias, attn->scaling_factor, gpu_mem_allocator, - num_samples, attn->num_q_heads, attn->num_kv_heads, _num_q_heads, _num_kv_heads, + attn->num_kv_cache_pages, DT_NONE, false) { cudaStream_t stream; diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index d3e1a5c37..42f816404 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -55,40 +55,6 @@ bool TreeIncMultiHeadSelfAttentionParams::is_valid( } Tensor FFModel::inc_multihead_self_attention_verify( - const Tensor input, - int embed_dim, - int num_heads, - int kdim, - int vdim, - float dropout, - bool add_zero_attn, - DataType data_type, - Initializer *kernel_initializer, - RotaryEmbeddingMeta rotary_embedding_meta, - bool scaling_query, - float scaling_factor, - bool qk_prod_scaling, - bool position_bias, - char const *name) { - return inc_multiquery_self_attention_verify(input, - embed_dim, - num_heads, - num_heads, - kdim, - vdim, - dropout, - add_zero_attn, - data_type, - kernel_initializer, - rotary_embedding_meta, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - name); -} - -Tensor FFModel::inc_multiquery_self_attention_verify( const Tensor input, int embed_dim, int num_q_heads, @@ -169,6 +135,7 @@ Tensor FFModel::inc_multiquery_self_attention_verify( li->add_int_property("offload", offload); li->add_int_property("tensor_parallelism_degree", config.tensor_parallelism_degree); + li->add_int_property("num_kv_cache_pages", get_num_kv_cache_pages()); layers.push_back(li); return li->outputs[0]; } @@ -218,6 +185,8 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( bool offload = (bool)value; layer->get_int_property("tensor_parallelism_degree", value); int tensor_parallelism_degree = (int)value; + layer->get_int_property("num_kv_cache_pages", value); + int num_kv_cache_pages = (int)value; return new TreeIncMultiHeadSelfAttention(model, layer->layer_guid, inputs[0], @@ -236,6 +205,7 @@ Op *TreeIncMultiHeadSelfAttention::create_operator_from_layer( quantization_type, offload, tensor_parallelism_degree, + num_kv_cache_pages, layer->name); } @@ -258,6 +228,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, + int _num_kv_cache_pages, char const *name) : Op(model, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, @@ 
-269,10 +240,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), add_zero_attn(_add_zero_attn), - rotary_embedding_meta(_rotary_embedding_meta), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), + rotary_embedding_meta(_rotary_embedding_meta), qProjSize(_kdim), + kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), @@ -294,6 +263,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); + num_kv_cache_pages = _num_kv_cache_pages; + /* // Check correctness */ /* assert(check_output_input_weight_parallel_dims()); */ } @@ -316,6 +287,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( DataType _quantization_type, bool _offload, int _tensor_parallelism_degree, + int _num_kv_cache_pages, char const *name) : Op(model, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, @@ -327,10 +299,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( _input), num_q_heads(_num_q_heads), num_kv_heads(_num_kv_heads), dropout(_dropout), add_zero_attn(_add_zero_attn), - rotary_embedding_meta(_rotary_embedding_meta), - qSize(_input->dims[0].size), kSize(_input->dims[0].size), - vSize(_input->dims[0].size), qProjSize(_kdim), kProjSize(_kdim), - vProjSize(_vdim), oProjSize(_embed_dim), + rotary_embedding_meta(_rotary_embedding_meta), qProjSize(_kdim), + kProjSize(_kdim), vProjSize(_vdim), oProjSize(_embed_dim), qoSeqLength(_input->dims[1].size), kvSeqLength(_input->dims[1].size), scaling_query(_scaling_query), scaling_factor(_scaling_factor), qk_prod_scaling(_qk_prod_scaling), position_bias(_position_bias), @@ -350,6 +320,8 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( outputs[0] = model.create_parallel_tensor_legion_ordering( _input->num_dims, dims, this->data_type, this); + num_kv_cache_pages = _num_kv_cache_pages; + // Check correctness /* assert(check_output_input_weight_parallel_dims()); */ } @@ -376,6 +348,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( other.quantization_type, other.offload, other.tensor_parallelism_degree, + other.num_kv_cache_pages, other.name) {} TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( @@ -401,6 +374,7 @@ TreeIncMultiHeadSelfAttention::TreeIncMultiHeadSelfAttention( params.quantization_type, params.offload, params.tensor_parallelism_degree, + params.num_kv_cache_pages, params.name) {} void TreeIncMultiHeadSelfAttention::init_inference( @@ -503,7 +477,6 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( ctx, runtime); - int num_samples = input.domain.hi()[2] - input.domain.lo()[2] + 1; assert(attn->qoSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); assert(attn->kvSeqLength == input.domain.hi()[1] - input.domain.lo()[1] + 1); @@ -521,7 +494,7 @@ OpMeta *TreeIncMultiHeadSelfAttention::init_task( handle.offload_reserve_space, handle.offload_reserve_space_size); } TreeIncMultiHeadSelfAttentionMeta *m = new TreeIncMultiHeadSelfAttentionMeta( - handle, attn, gpu_mem_allocator, num_samples, num_q_heads, num_kv_heads); + handle, attn, gpu_mem_allocator, num_q_heads, 
num_kv_heads); if (!attn->offload) { // assert that we didn't over allocate memory assert(gpu_mem_allocator.reserved_allocated_size == @@ -677,7 +650,8 @@ bool operator==(TreeIncMultiHeadSelfAttentionParams const &lhs, lhs.scaling_query == rhs.scaling_query && lhs.scaling_factor == rhs.scaling_factor && lhs.qk_prod_scaling == rhs.qk_prod_scaling && - lhs.position_bias == rhs.position_bias; + lhs.position_bias == rhs.position_bias && + lhs.num_kv_cache_pages == rhs.num_kv_cache_pages; } TreeIncMultiHeadSelfAttentionParams @@ -697,6 +671,7 @@ TreeIncMultiHeadSelfAttentionParams params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; params.tensor_parallelism_degree = this->tensor_parallelism_degree; + params.num_kv_cache_pages = this->num_kv_cache_pages; if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } @@ -732,6 +707,7 @@ size_t hash::operator()( hash_combine(key, params.quantization_type); hash_combine(key, params.offload); hash_combine(key, params.tensor_parallelism_degree); + hash_combine(key, params.num_kv_cache_pages); return key; } }; // namespace std diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index d6d258de1..482d197d2 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -405,8 +405,7 @@ template void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, hipStream_t stream) { - int head_dim = m->hidden_size / m->num_q_heads; - assert(head_dim == m->qProjSize); + int head_dim = m->qProjSize; // int tot_num_heads = m->num_q_heads + 2 * m->num_kv_heads; int num_tokens_to_commit = bc->num_tokens_to_commit; if (num_tokens_to_commit > 0) { @@ -534,8 +533,7 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, // update the kv cache // update K-V cache - int head_dim = m->hidden_size / m->num_q_heads; - assert(head_dim == m->qProjSize); + int head_dim = m->qProjSize; int num_new_tokens = bc->num_active_tokens(); int parallelism = head_dim * m->num_kv_heads * num_new_tokens; hipLaunchKernelGGL(HIP_KERNEL_NAME(update_tree_branch_kv_cache_fused), @@ -660,15 +658,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( FFHandler handler, TreeIncMultiHeadSelfAttention const *attn, MemoryAllocator &gpu_mem_allocator, - int num_samples, int _num_q_heads, int _num_kv_heads) : IncMultiHeadSelfAttentionMeta(handler, TREE_VERIFY_MODE, attn, - attn->qSize, - attn->kSize, - attn->vSize, attn->qProjSize, attn->kProjSize, attn->vProjSize, @@ -679,11 +673,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->position_bias, attn->scaling_factor, gpu_mem_allocator, - num_samples, attn->num_q_heads, attn->num_kv_heads, _num_q_heads, _num_kv_heads, + attn->num_kv_cache_pages, attn->quantization_type, attn->offload), num_active_infr_tokens(0) { diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 535998a21..d9626f660 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -383,8 +383,7 @@ template void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, TreeVerifyBatchConfig const *bc, cudaStream_t stream) { - int head_dim = m->hidden_size / m->num_q_heads; - assert(head_dim == m->qProjSize); + int head_dim = m->qProjSize; // int tot_num_heads = m->num_q_heads + 2 * m->num_kv_heads; int 
num_tokens_to_commit = bc->num_tokens_to_commit; if (num_tokens_to_commit > 0) { @@ -510,8 +509,7 @@ void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, // update the kv cache // update K-V cache - int head_dim = m->hidden_size / m->num_q_heads; - assert(head_dim == m->qProjSize); + int head_dim = m->qProjSize; int num_new_tokens = bc->num_active_tokens(); int parallelism = head_dim * m->num_kv_heads * num_new_tokens; update_tree_branch_kv_cache_fused<<qSize, - attn->kSize, - attn->vSize, attn->qProjSize, attn->kProjSize, attn->vProjSize, @@ -653,11 +647,11 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( attn->position_bias, attn->scaling_factor, gpu_mem_allocator, - num_samples, attn->num_q_heads, attn->num_kv_heads, _num_q_heads, _num_kv_heads, + attn->num_kv_cache_pages, attn->quantization_type, attn->offload), num_active_infr_tokens(0) { diff --git a/src/parallel_ops/kernels/parallel_identity_kernels.cu b/src/parallel_ops/kernels/parallel_identity_kernels.cu index 2099347fa..dc8513729 100644 --- a/src/parallel_ops/kernels/parallel_identity_kernels.cu +++ b/src/parallel_ops/kernels/parallel_identity_kernels.cu @@ -84,7 +84,7 @@ void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, num_elements, nccl_data_type, ncclSum, - m->handle.ncclComm, + m->handle.ncclCommPeft, stream)); #else assert(false && "Must enable FF_USE_NCCL to use ParallelIdentity operators"); diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index c391f2043..275a7f2b7 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -115,6 +115,18 @@ int BatchConfig::num_active_tokens() const { return num_tokens; } +int BatchConfig::num_inference_tokens() const { + int num_ft_fwd_tokens = num_finetuning_fwd_tokens(); + assert(num_tokens >= 0 && num_ft_fwd_tokens >= 0 && + num_tokens >= num_ft_fwd_tokens); + return num_tokens - num_ft_fwd_tokens; +} + +int BatchConfig::num_inference_requests() const { + return num_active_requests() - num_finetuning_fwd_requests() - + num_finetuning_bwd_requests(); +} + int BatchConfig::finetuning_request_index() const { assert(max_requests_per_batch() > 0); return max_requests_per_batch() - 1; diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index 1f086cc1a..fbfbb4d48 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -2350,6 +2350,7 @@ GraphOptimalViewSerialized sez.serialize(attn->offload); sez.serialize(attn->num_kv_heads); sez.serialize(attn->tensor_parallelism_degree); + sez.serialize(attn->num_kv_cache_pages); sez.serialize(strlen(attn->name)); sez.serialize(attn->name, strlen(attn->name)); break; @@ -2381,6 +2382,7 @@ GraphOptimalViewSerialized sez.serialize(attn->qk_prod_scaling); sez.serialize(attn->position_bias); sez.serialize(attn->num_kv_heads); + sez.serialize(attn->num_kv_cache_pages); sez.serialize(strlen(attn->name)); sez.serialize(attn->name, strlen(attn->name)); break; @@ -2415,6 +2417,7 @@ GraphOptimalViewSerialized sez.serialize(attn->offload); sez.serialize(attn->num_kv_heads); sez.serialize(attn->tensor_parallelism_degree); + sez.serialize(attn->num_kv_cache_pages); sez.serialize(strlen(attn->name)); sez.serialize(attn->name, strlen(attn->name)); break; @@ -2842,7 +2845,7 @@ void FFModel::deserialize_graph_optimal_view( case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, - tensor_parallelism_degree; + tensor_parallelism_degree, num_kv_cache_pages; float dropout, scaling_factor; bool 
add_zero_attn, scaling_query, qk_prod_scaling, offload, position_bias; @@ -2878,6 +2881,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(offload); dez.deserialize(num_kv_heads); dez.deserialize(tensor_parallelism_degree); + dez.deserialize(num_kv_cache_pages); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -2900,13 +2904,15 @@ void FFModel::deserialize_graph_optimal_view( params.offload = offload; params.num_kv_heads = num_kv_heads; params.tensor_parallelism_degree = tensor_parallelism_degree; + params.num_kv_cache_pages = num_kv_cache_pages; strcpy(params.name, name); node = get_or_create_node(inputs[0], params); break; } case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); - int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads; + int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, + num_kv_cache_pages; float dropout, scaling_factor; bool add_zero_attn, scaling_query, qk_prod_scaling, position_bias; RotaryEmbeddingMeta rotary_embedding_meta; @@ -2937,6 +2943,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(qk_prod_scaling); dez.deserialize(position_bias); dez.deserialize(num_kv_heads); + dez.deserialize(num_kv_cache_pages); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -2956,6 +2963,7 @@ void FFModel::deserialize_graph_optimal_view( params.qk_prod_scaling = qk_prod_scaling; params.position_bias = position_bias; params.num_kv_heads = num_kv_heads; + params.num_kv_cache_pages = num_kv_cache_pages; strcpy(params.name, name); node = get_or_create_node(inputs[0], params); @@ -2964,7 +2972,7 @@ void FFModel::deserialize_graph_optimal_view( case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(num_inputs == 1); int embed_dim, num_q_heads, k_dim, v_dim, num_kv_heads, - tensor_parallelism_degree; + tensor_parallelism_degree, num_kv_cache_pages; float dropout, scaling_factor; bool add_zero_attn, scaling_query, qk_prod_scaling, offload, position_bias; @@ -3000,6 +3008,7 @@ void FFModel::deserialize_graph_optimal_view( dez.deserialize(offload); dez.deserialize(num_kv_heads); dez.deserialize(tensor_parallelism_degree); + dez.deserialize(num_kv_cache_pages); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -3022,6 +3031,7 @@ void FFModel::deserialize_graph_optimal_view( params.offload = offload; params.num_kv_heads = num_kv_heads; params.tensor_parallelism_degree = tensor_parallelism_degree; + params.num_kv_cache_pages = num_kv_cache_pages; strcpy(params.name, name); node = get_or_create_node(inputs[0], params); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 9b6b0e785..20ec7d4f3 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -635,6 +635,14 @@ void FFModel::set_transformer_layer_id(int id) { assert(id < MAX_NUM_TRANSFORMER_LAYERS); } +void FFModel::set_num_kv_cache_pages(int num_kv_cache_pages_) { + num_kv_cache_pages = num_kv_cache_pages_; +} + +int FFModel::get_num_kv_cache_pages() const { + return num_kv_cache_pages; +} + void FFModel::set_position_offset(int offset) { assert(offset == 0 || offset == 2); position_offset = offset; @@ -794,6 +802,7 @@ void FFModel::compile_inference() { operators[l]->op_type == OP_PARALLEL_IDENTITY || operators[l]->op_type == OP_LORA || operators[l]->op_type == OP_FUSED) { MachineView view = operators[l]->outputs[0]->machine_view; + // inference if (view_hash_to_nccl_comms.find(view.hash()) == view_hash_to_nccl_comms.end()) { TaskLauncher 
launcher(NCCL_GETUNIQUEID_TASK_ID, TaskArgument(NULL, 0)); @@ -822,6 +831,35 @@ void FFModel::compile_inference() { } view_hash_to_nccl_comms[view.hash()] = nccl_comms; } + // peft + if (view_hash_to_nccl_comms_peft.find(view.hash()) == + view_hash_to_nccl_comms_peft.end()) { + TaskLauncher launcher(NCCL_GETUNIQUEID_TASK_ID, TaskArgument(NULL, 0)); + Future future = runtime->execute_task(ctx, launcher); + ncclUniqueId ncclId = future.get_result(); + IndexSpace task_is = get_or_create_task_is(view); + ArgumentMap argmap; + IndexLauncher index_launcher( + NCCL_INIT_COMMS_TASK_ID, + task_is, + TaskArgument(&ncclId, sizeof(ncclUniqueId)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + view.hash() /*MappingTagID*/); + index_launcher.concurrent = true; + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + int idx = 0; + Domain task_domain = runtime->get_index_space_domain(ctx, task_is); + ncclComm_t *nccl_comms_peft = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * task_domain.get_volume()); + for (Domain::DomainPointIterator it(task_domain); it; it++, idx++) { + nccl_comms_peft[idx] = fm.get_result(*it); + } + view_hash_to_nccl_comms_peft[view.hash()] = nccl_comms_peft; + } } } #endif diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 0b4358008..89f24ac98 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -1218,7 +1218,10 @@ void Op::set_argumentmap_for_init(FFModel const &ff, ArgumentMap &argmap) { if (ff.config.computationMode == COMP_MODE_TRAINING && \ op_type == OP_WEIGHT) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ - handle.ncclComm = nccl_comms[idx++]; \ + handle.ncclComm = nccl_comms[idx]; \ + ncclComm_t *nccl_comms_peft = ff.find_nccl_comms_peft(view); \ + handle.ncclCommPeft = nccl_comms_peft[idx]; \ + idx++; \ } \ argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler))); \ } \ @@ -1264,7 +1267,10 @@ void Op::set_argumentmap_for_init_inference(FFModel const &ff, if (op_type == OP_ALLREDUCE || op_type == OP_LORA || \ op_type == OP_PARALLEL_IDENTITY) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ - handle.ncclComm = nccl_comms[idx++]; \ + handle.ncclComm = nccl_comms[idx]; \ + ncclComm_t *nccl_comms_peft = ff.find_nccl_comms_peft(view); \ + handle.ncclCommPeft = nccl_comms_peft[idx]; \ + idx++; \ } \ argmap.set_point(*it, TaskArgument(&handle, sizeof(FFHandler))); \ } \ @@ -1627,6 +1633,7 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) void FFModel::finish_nccl_comms() { Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; + // finish inference nccl comms for (auto const &comm : view_hash_to_nccl_comms) { // Find the machine view that has the hash MachineView view; @@ -1657,6 +1664,37 @@ void FFModel::finish_nccl_comms() { FutureMap fm = runtime->execute_index_space(ctx, index_launcher); fm.wait_all_results(); } + // finish peft nccl comms + for (auto const &comm : view_hash_to_nccl_comms_peft) { + // Find the machine view that has the hash + MachineView view; + for (size_t l = 0; l < operators.size(); l++) { + view = operators[l]->outputs[0]->machine_view; + if (view.hash() == comm.first) { + break; + } + } + assert(view.hash() == comm.first && "Cannot find the machine view"); + IndexSpace task_is = get_or_create_task_is(view); + Domain domain = runtime->get_index_space_domain(ctx, task_is); + ArgumentMap argmap; + int idx = 0; + for (Domain::DomainPointIterator it(domain); it; it++, idx++) { + argmap.set_point(*it, + TaskArgument(&comm.second[idx], 
sizeof(ncclComm_t))); + } + IndexLauncher index_launcher(NCCL_FINISH_COMMS_TASK_ID, + task_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + comm.first); + index_launcher.concurrent = true; + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + } } #endif @@ -1684,6 +1722,15 @@ ncclComm_t *FFModel::find_nccl_comms(MachineView const &view) const { return it->second; } } +ncclComm_t *FFModel::find_nccl_comms_peft(MachineView const &view) const { + auto const &it = view_hash_to_nccl_comms.find(view.hash()); + if (it == view_hash_to_nccl_comms.end()) { + assert(config.computationMode == COMP_MODE_INFERENCE); + return nullptr; + } else { + return it->second; + } +} #endif template @@ -3920,6 +3967,34 @@ void FFModel::compile(LossType loss_type, } view_hash_to_nccl_comms[view.hash()] = nccl_comms; } + if (view_hash_to_nccl_comms_peft.find(view.hash()) == + view_hash_to_nccl_comms_peft.end()) { + TaskLauncher launcher(NCCL_GETUNIQUEID_TASK_ID, TaskArgument(NULL, 0)); + Future future = runtime->execute_task(ctx, launcher); + ncclUniqueId ncclId = future.get_result(); + IndexSpace task_is = get_or_create_task_is(view); + ArgumentMap argmap; + IndexLauncher index_launcher( + NCCL_INIT_COMMS_TASK_ID, + task_is, + TaskArgument(&ncclId, sizeof(ncclUniqueId)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + view.hash() /*MappingTagID*/); + index_launcher.concurrent = true; + FutureMap fm = runtime->execute_index_space(ctx, index_launcher); + fm.wait_all_results(); + int idx = 0; + Domain task_domain = runtime->get_index_space_domain(ctx, task_is); + ncclComm_t *nccl_comms_peft = + (ncclComm_t *)malloc(sizeof(ncclComm_t) * task_domain.get_volume()); + for (Domain::DomainPointIterator it(task_domain); it; it++, idx++) { + nccl_comms_peft[idx] = fm.get_result(*it); + } + view_hash_to_nccl_comms_peft[view.hash()] = nccl_comms_peft; + } } } #endif diff --git a/src/runtime/model.cu b/src/runtime/model.cu index af6f7d5c9..4ad47a362 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -91,11 +91,25 @@ FFHandler handle.offload_reserve_space_size = info->offload_reserve_space_size; handle.quantization_type = info->quantization_type; handle.allowTensorOpMathConversion = info->allowTensorOpMathConversion; + + // flashinfer + handle.incr_attention_metadata = new AttentionMetaData(); + assert(handle.incr_attention_metadata != nullptr && + "Attention metadata must be allocated"); + + // cublas/dnn handles for inference stream checkCUDA(cublasCreate(&handle.blas)); if (handle.allowTensorOpMathConversion) { checkCUDA(cublasSetMathMode(handle.blas, CUBLAS_TENSOR_OP_MATH)); } checkCUDNN(cudnnCreate(&handle.dnn)); + // cublas/dnn handles for peft stream + checkCUDA(cublasCreate(&handle.peft_blas)); + if (handle.allowTensorOpMathConversion) { + checkCUDA(cublasSetMathMode(handle.peft_blas, CUBLAS_TENSOR_OP_MATH)); + } + checkCUDNN(cudnnCreate(&handle.peft_dnn)); + // #ifdef FF_USE_NCCL // checkNCCL(ncclCommInitRank(&handle.nccl, info->allRanks, info->ncclId, // info->myRank)); fprintf(stderr, "handle.nccl(%p)\n", handle.nccl); @@ -168,9 +182,46 @@ FFHandler } else { handle.batch_config_metadata = nullptr; } + + // std::cout << "handle.batch_config_metadata_size: " + // << handle.batch_config_metadata_size << std::endl; + // std::cout << "handle.incr_attention_metadata->mem_size(): " + // << handle.incr_attention_metadata->mem_size() << std::endl; + if (handle.batch_config_metadata_size + + 
handle.incr_attention_metadata->mem_size()) { + // allocate memory for offload reserve space + Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(handle.batch_config_metadata_size + + handle.incr_attention_metadata->mem_size() - + 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = + static_cast(ptr); + handle.incr_attention_metadata->assign_address( + static_cast(static_cast(ptr) + + handle.batch_config_metadata_size), + handle.incr_attention_metadata->mem_size()); + } else { + handle.batch_config_metadata = nullptr; + handle.incr_attention_metadata->assign_address(nullptr, 0); + } + // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; + handle.ncclCommPeft = NULL; #endif return handle; } diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index d5bfcfc48..1dc6e8321 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -31,7 +31,8 @@ fs::path get_dst_folder(std::string const &subdir, step_substr += "_pre"; } char cwd[PATH_MAX]; - getcwd(cwd, sizeof(cwd)); + char *result = getcwd(cwd, sizeof(cwd)); + assert(result && "getcwd failed"); // char const *ff_cache_path = std::string(std::getenv("FF_DEBUG_PATH")) == // "." ? diff --git a/src/runtime/page_manager.cc b/src/runtime/page_manager.cc new file mode 100644 index 000000000..85549457f --- /dev/null +++ b/src/runtime/page_manager.cc @@ -0,0 +1,321 @@ +/* Copyright 2023 CMU, Stanford, Facebook, LANL + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "flexflow/page_manager.h" + +namespace FlexFlow { + +// For all runtime functions, they share a single page manager for pages +// information +PageManager *page_manager_singleton = nullptr; + +PageManager::PageManager(int tot_num_pages_) : tot_num_pages(tot_num_pages_) { + assert(kPagesize > 0 && tot_num_pages >= 0 && + "Number of tokens per page must be positive and total number of pages " + "must be non-negative"); + for (int i = 0; i < tot_num_pages; i++) { + free_pages.insert(i); + } +} + +PageManager *PageManager::get_page_manager() { + assert(page_manager_singleton != nullptr && "PageManager not initialized"); + return page_manager_singleton; +} + +PageManager *PageManager::get_page_manager(int num_total_pages) { + assert(num_total_pages > 0 && + "attempting to initialize the PageManager with 0 pages"); + assert(page_manager_singleton == nullptr && + "Attempting to initialize PageManager twice"); + printf("page manager singleton is initialized with %d pages\n", + num_total_pages); + page_manager_singleton = new PageManager(num_total_pages); + return page_manager_singleton; +} + +int PageManager::get_tot_num_pages() const { + return tot_num_pages; +} + +int PageManager::get_tokens_per_page() const { + return kPagesize; +} + +int PageManager::get_num_pages_used_by_req( + RequestGuid const &request_guid) const { + assert(requests_info.find(request_guid) != requests_info.end()); + int n = requests_info.at(request_guid).num_used_pages; + if (!(n >= 0 && n <= requests_info.at(request_guid).page_indices.size())) { + std::cerr << "Error: requests_info.at(request_guid).num_used_pages is out " + "of bounds for request " + << request_guid << std::endl; + std::cerr << *this << std::endl; + } + assert(n >= 0 && n <= requests_info.at(request_guid).page_indices.size()); + return n; +} + +std::vector + PageManager::get_req_page_indices(RequestGuid const &request_guid) const { + int n = get_num_pages_used_by_req(request_guid); + return std::vector(requests_info.at(request_guid).page_indices.begin(), + requests_info.at(request_guid).page_indices.begin() + + n); +} + +int PageManager::get_num_tokens_in_last_used_page( + RequestGuid const &request_guid) const { + assert(requests_info.find(request_guid) != requests_info.end()); + int n = requests_info.at(request_guid).num_tokens_in_last_used_page; + if (!(n >= 0 && n <= kPagesize)) { + std::cerr + << "Error: num_tokens_in_last_used_page is out of bounds for request " + << request_guid << std::endl; + std::cerr << *this << std::endl; + } + assert(n >= 0 && n <= kPagesize); + + return n; +} + +bool PageManager::enough_space_to_add_request( + int num_prompt_tokens, + int num_prompt_tokens_in_first_batch, + int max_tokens_per_batch) const { + // there is enough space to add a request if there are enough pages for this + // request's prompt + N decoding steps for all existing requests, where + // N = tot prefilling steps needed to consume the new request's prompt + assert(num_prompt_tokens > 0 && num_prompt_tokens_in_first_batch > 0); + assert(num_prompt_tokens_in_first_batch <= num_prompt_tokens); + + // pages needed to process the new request's prompt alone + int new_pages_needed = round_up_pages(num_prompt_tokens); + + // number of steps to finish prefilling (during which other requests will + // accrue more tokens) + int num_expected_prefill_steps = + ceilDiv(num_prompt_tokens - num_prompt_tokens_in_first_batch, + max_tokens_per_batch - (int)active_requests.size()); + + for (auto req_info_pair : requests_info) { + RequestGuid const &guid 
= req_info_pair.first; + PerRequestPageInfo const &req_info = req_info_pair.second; + // ensure that no other request is an unfinished prompt + if (req_info.num_used_pages < req_info.page_indices.size()) { + // this request is an unfinished prompt + // we cannot add a new request + std::cout << *this << std::endl; + assert(false && "Attempting to add a request with another unfinished " + "prefill present in the batch"); + } + int available_slots = + kPagesize - req_info.num_tokens_in_last_used_page + + ((int)req_info.page_indices.size() - req_info.num_used_pages) * + kPagesize; + if (num_expected_prefill_steps > available_slots) { + new_pages_needed += + round_up_pages(num_expected_prefill_steps - available_slots); + } + } + // printf("new pages needed to add request with %d prompt tokens, %d " + // "tokens in first batch, %d max tokens per batch: %d\n", + // num_prompt_tokens, + // num_prompt_tokens_in_first_batch, + // max_tokens_per_batch, + // new_pages_needed); + // printf("free pages: %ld\n", free_pages.size()); + // printf("total pages: %d\n", tot_num_pages); + // printf("active requests: %ld\n", active_requests.size()); + return free_pages.size() >= new_pages_needed; +} + +bool PageManager::enough_space_to_append_tokens( + std::vector> new_tokens_per_request) const { + + int new_pages_needed = 0; + for (auto const &pair : new_tokens_per_request) { + RequestGuid const &guid = pair.first; + int num_tokens = pair.second; + assert(num_tokens > 0 && "Number of tokens to append must be positive"); + assert(requests_info.find(guid) != requests_info.end() && + "Request does not exist"); + PerRequestPageInfo const &req_info = requests_info.at(guid); + assert((int)req_info.page_indices.size() - req_info.num_used_pages >= 0 && + "Number of used pages must be less than or equal to the number of " + "pages assigned to the request"); + assert(kPagesize - req_info.num_tokens_in_last_used_page >= 0 && + "Number of tokens in last page must be less than or equal to the " + "number of tokens per page"); + int available_slots = + kPagesize - req_info.num_tokens_in_last_used_page + + ((int)req_info.page_indices.size() - req_info.num_used_pages) * + kPagesize; + if (num_tokens > available_slots) { + int num_pages_needed = round_up_pages(num_tokens - available_slots); + new_pages_needed += num_pages_needed; + } + } + return free_pages.size() >= new_pages_needed; +} + +void PageManager::add_request(RequestGuid const &guid, int num_tokens) { + assert(num_tokens > 0 && "Number of tokens to add must be positive"); + assert(requests_info.find(guid) == requests_info.end() && + "Request already exists"); + // assert(enough_space_to_add_request(num_tokens) && + // "Not enough space to add request"); + active_requests.push_back(guid); + // assign pages to the request + assert(!free_pages.empty() && "No free pages available"); + int num_pages_needed = round_up_pages(num_tokens); + std::vector pages; + for (int i = 0; i < num_pages_needed; i++) { + int page = *free_pages.begin(); + free_pages.erase(free_pages.find(page)); + pages.push_back(page); + } + // add the request to the requests info + PerRequestPageInfo req_info; + req_info.guid = guid; + req_info.page_indices = pages; + req_info.num_used_pages = 0; + req_info.num_tokens_in_last_used_page = 0; + requests_info[guid] = req_info; + // printf("adding request %d with %d tokens. 
It allocated %ld new pages\n", + // guid, + // num_tokens, + // pages.size()); +} + +// remove completed request +void PageManager::remove_request(RequestGuid const &request_guid) { + assert(requests_info.find(request_guid) != requests_info.end() && + "Request does not exist"); + PerRequestPageInfo const &req_info = requests_info[request_guid]; + // free the pages assigned to the request + for (auto page : req_info.page_indices) { + free_pages.insert(page); + } + requests_info.erase(request_guid); + // remove the request from the active requests + auto it = + std::find(active_requests.begin(), active_requests.end(), request_guid); + assert(it != active_requests.end() && "Request does not exist"); + active_requests.erase(it); + + assert(requests_info.find(request_guid) == requests_info.end() && + "Removal of request info did not go through"); + assert(std::find(active_requests.begin(), + active_requests.end(), + request_guid) == active_requests.end() && + "Removal of active request did not go through"); +} + +RequestGuid PageManager::evict_request_fifo() { + assert(!active_requests.empty() && "No active requests to evict"); + RequestGuid request_guid = active_requests.back(); + remove_request(request_guid); + return request_guid; +} + +void PageManager::append_tokens(RequestGuid const &request_guid, + int num_tokens) { + assert(num_tokens > 0 && "Number of tokens to append must be positive"); + assert(requests_info.find(request_guid) != requests_info.end() && + "Request does not exist"); + PerRequestPageInfo &req_info = requests_info[request_guid]; + + // std::vector> new_tokens_per_request; + // for (auto const &pair : requests_info) { + // RequestGuid const &guid = pair.first; + // if (guid == request_guid) { + // new_tokens_per_request.push_back(std::make_pair(guid, num_tokens)); + // } else { + // new_tokens_per_request.push_back(std::make_pair(guid, 1)); + // } + // } + // assert(enough_space_to_append_tokens(new_tokens_per_request) && + // "Not enough space to append tokens"); + + int available_slots = + kPagesize - req_info.num_tokens_in_last_used_page + + ((int)req_info.page_indices.size() - req_info.num_used_pages) * kPagesize; + if (num_tokens > available_slots) { + int num_pages_needed = round_up_pages(num_tokens - available_slots); + assert(num_pages_needed <= free_pages.size() && + "Not enough free pages to append new tokens"); + for (int i = 0; i < num_pages_needed; i++) { + int page = *free_pages.begin(); + free_pages.erase(free_pages.find(page)); + req_info.page_indices.push_back(page); + } + } + // update the number of used pages and the number of tokens in the last used + // page + if (req_info.num_tokens_in_last_used_page == 0 && + req_info.num_used_pages == 0) { + req_info.num_used_pages = 1; + } + + req_info.num_tokens_in_last_used_page += num_tokens; + while (req_info.num_tokens_in_last_used_page > kPagesize) { + req_info.num_used_pages += 1; + req_info.num_tokens_in_last_used_page -= kPagesize; + } + + // printf("appending %d tokens to request %d. 
It now has %d tokens in the last + // " + // "used page and %d used pages\n", + // num_tokens, + // request_guid, + // req_info.num_tokens_in_last_used_page, + // req_info.num_used_pages); +} + +std::ostream &operator<<(std::ostream &os, PageManager const &pm) { + os << "PageManager State: {\n"; + os << "\tTotal number of pages: " << pm.tot_num_pages << "\n"; + os << "\tTokens per page: " << kPagesize << "\n"; + os << "\tActive requests: " << pm.active_requests.size() << "\n"; + os << "\tFree pages: " << pm.free_pages.size() << "\n"; + os << "\tRequests info:\n"; + for (auto const &[guid, info] : pm.requests_info) { + os << "\t RequestGuid: " << guid << "\n"; + os << "\t Number of used pages: " << info.num_used_pages << "\n"; + os << "\t Number of tokens in last used page: " + << info.num_tokens_in_last_used_page << "\n"; + os << "\t Page indices: "; + for (int index : info.page_indices) { + os << index << " "; + } + os << "\n}\n"; + } + return os; +} + +int compute_num_kv_cache_pages_needed(int max_seq_len, + int batch_size, + bool is_spec) { + + int num_pages_needed = round_up_pages(max_seq_len * batch_size); + if (!is_spec) { + PageManager *pm = PageManager::get_page_manager(num_pages_needed); + assert(pm->get_tot_num_pages() == num_pages_needed); + } + return num_pages_needed; +} + +}; // namespace FlexFlow diff --git a/src/runtime/peft_weight_allocator.cc b/src/runtime/peft_weight_allocator.cc index efb72f331..6d5815449 100644 --- a/src/runtime/peft_weight_allocator.cc +++ b/src/runtime/peft_weight_allocator.cc @@ -39,7 +39,7 @@ void PEFTMemoryManager::allocate_inference_memory() { base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char)); if (log_instance_creation) { log_peft_mem_allocator.print( - "Created instance in memory_kind: %s memory_id: %llx size: %zu " + "Created instance in memory_kind: %s memory_id: %llx size: %u " "(capacity %lu) task_name: %s", Legion::Mapping::Utilities::to_string(gpu_mem.kind()), gpu_mem.id, diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 740d5df4b..cf3d3279c 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -284,6 +284,7 @@ RequestManager::RequestManager() max_tokens_per_batch = -1; max_spec_tree_token_num = -1; max_sequence_length = -1; + step_idx = 0; run_idx = 0; } @@ -338,7 +339,7 @@ int RequestManager::get_max_tokens_per_batch() { } int RequestManager::get_max_spec_tree_token_num() { - assert(max_spec_tree_token_num > 0); + assert(max_spec_tree_token_num >= 0); return max_spec_tree_token_num; } @@ -572,7 +573,7 @@ RequestGuid RequestManager::register_new_request(Request const &request_) { return BatchConfig::INVALID_GUID; } - pending_infr_request_queue.push(request); + pending_infr_request_queue.push_back(request); all_requests[request.guid] = request; { const std::lock_guard lock(request_to_promise_mutex); @@ -754,7 +755,18 @@ bool RequestManager::is_eos_token(int token_id) { return false; } +bool RequestManager::inf_req_evicted(BatchConfig const &old_bc, int i) { + // printf("Entering inf_req_evicted\n"); + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + // if (request.status == Request::EVICTED) { + // printf("Request %zu cannot continue because it is now in evicted + // state...\n", old_bc.requestsInfo[i].request_guid); + // } + return request.status == Request::EVICTED; +} + bool RequestManager::inf_req_completed(BatchConfig const &old_bc, int i) { + // printf("Entering inf_req_completed\n"); Request &request = 
all_requests[old_bc.requestsInfo[i].request_guid]; bool request_completed = false; // printf("model_type = %d\n", this->model_type); @@ -767,6 +779,32 @@ bool RequestManager::inf_req_completed(BatchConfig const &old_bc, int i) { return request_completed; } +bool RequestManager::enough_space_to_add_request( + BatchConfig const &new_bc, int num_concurrent_inf_adapters) { + Request new_request = pending_infr_request_queue.front(); + assert(new_request.req_type == RequestType::REQ_INFERENCE); + + int prefill_tokens_first_batch = + std::min(get_max_tokens_per_batch() - new_bc.num_tokens, + (int)new_request.tokens.size()); + + // if there is not enough space in the page table, don't add it yet + PageManager *pm = PageManager::get_page_manager(); + if (!pm->enough_space_to_add_request(new_request.tokens.size(), + prefill_tokens_first_batch, + get_max_tokens_per_batch())) { + // printf("not enough space to add request %zu\n", new_request.guid); + return false; + } + + // if the request has peft adapters and we are at capacity, don't add it yet + if (new_request.peft_model_id != PEFTModelID::NO_ID && + num_concurrent_inf_adapters == get_max_concurrent_adapters()) { + return false; + } + return true; +} + void RequestManager::check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc) { int num_incomplete_prompts = 0; @@ -811,6 +849,7 @@ void RequestManager::check_batch(BatchConfig const &old_bc, void RequestManager::add_peft_config_to_request_info( BatchConfig &bc, int req_idx, LoraLinearConfig const &peft_config) { + // printf("Entering add_peft_config_to_request_info\n"); std::memset(bc.requestsInfo[req_idx].peft_model_config_str, 0, BatchConfig::MAX_PEFT_CONFIG_SIZE); @@ -823,6 +862,7 @@ void RequestManager::add_peft_config_to_request_info( void RequestManager::record_decoding_req_profiling_info( BatchConfig const &old_fwd_bc, int req_idx) { + // printf("Entering record_decoding_req_profiling_info\n"); if (old_fwd_bc.request_completed[req_idx]) { return; } @@ -854,6 +894,7 @@ void RequestManager::record_decoding_req_profiling_info( void RequestManager::process_inf_req_progress(BatchConfig const &old_fwd_bc, InferenceResult const &result) { + // printf("Entering process_inf_req_progress\n"); for (int i = 0; i < old_fwd_bc.num_active_tokens(); i++) { size_t guid = old_fwd_bc.requestsInfo[old_fwd_bc.tokensInfo[i].request_index] @@ -893,6 +934,8 @@ void RequestManager::process_inf_req_progress(BatchConfig const &old_fwd_bc, record_decoding_req_profiling_info(old_fwd_bc, req_idx); if (inf_req_completed(old_fwd_bc, req_idx)) { + printf("Request %zu completed...\n", + old_fwd_bc.requestsInfo[req_idx].request_guid); handle_completed_inf_req(old_fwd_bc, req_idx); } } @@ -900,11 +943,16 @@ void RequestManager::process_inf_req_progress(BatchConfig const &old_fwd_bc, void RequestManager::handle_completed_inf_req(BatchConfig const &old_bc, int i) { + // printf("Entering handle_completed_inf_req\n"); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); assert(request.req_type == RequestType::REQ_INFERENCE && "Found misplaced finetuning request"); + // page attention: free the pages + PageManager *page_manager = PageManager::get_page_manager(); + page_manager->remove_request(request.guid); + GenerationResult &gr = request_generation_results[request.guid]; std::vector output_tokens = std::vector( request.tokens.begin() + gr.input_tokens.size(), request.tokens.end()); @@ -931,12 +979,104 @@ void 
RequestManager::handle_completed_inf_req(BatchConfig const &old_bc, profiling_requests[request.guid] = profile_info; } +void RequestManager::evict_requests_if_needed(BatchConfig const &old_bc, + int inference_batch_size) { + // printf("Entering evict_requests_if_needed\n"); + // compute number of tokens that each request would like to run in the next + // step + std::vector> planned_tokens_per_request; + int tot_num_planned_tokens = 0; + for (int i = 0; i < inference_batch_size; i++) { + if (!old_bc.request_completed[i] && !inf_req_completed(old_bc, i)) { + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + assert(request.req_type == RequestType::REQ_INFERENCE && + "Found misplaced finetuning request"); + int processed_tokens = + old_bc.requestsInfo[i].first_token_depth_in_request + + old_bc.requestsInfo[i].num_tokens_in_batch; + + int num_planned_tokens = 0; + if (processed_tokens + 1 == request.tokens.size()) { + // incr decoding phase, planning to process 1 token in the next batch + num_planned_tokens = 1; + } else { + // Prompt phase + assert(old_bc.requestsInfo[i].prompt_phase == true); + int space_for_incr_dec_requests = 0; + // If the prompt can't fit in the batch, compute how much space we + // need to leave out for incomplete requests in decoding phase at + // higher indices. + for (int ii = i + 1; ii < inference_batch_size; ii++) { + if (old_bc.request_completed[ii]) { + continue; + } + Request &old_request = + all_requests[old_bc.requestsInfo[ii].request_guid]; + bool req_completed = inf_req_completed(old_bc, ii); + if (!req_completed) { + space_for_incr_dec_requests++; + } + } + num_planned_tokens = + std::min(get_max_tokens_per_batch() - tot_num_planned_tokens - + space_for_incr_dec_requests, + (int)request.tokens.size() - processed_tokens); + } + assert(num_planned_tokens > 0); + + planned_tokens_per_request.push_back( + std::make_pair(request.guid, num_planned_tokens)); + + tot_num_planned_tokens += num_planned_tokens; + } + } + assert(tot_num_planned_tokens >= 0 && + tot_num_planned_tokens <= get_max_tokens_per_batch()); + + if (tot_num_planned_tokens == 0) { + return; + } + + // std::cout << "\nplanned tokens per request: " << std::endl; + // for (const auto &pair : planned_tokens_per_request) { + // std::cout << "Request GUID: " << pair.first << ", Planned Tokens: " << + // pair.second << std::endl; + // } + + PageManager *pm = PageManager::get_page_manager(); + // std::cout << "pm state before evicting (if needed): " << *pm << std::endl; + + while (!pm->enough_space_to_append_tokens(planned_tokens_per_request)) { + RequestGuid request_to_evict = pm->evict_request_fifo(); + Request &request = all_requests[request_to_evict]; + request.status = Request::EVICTED; + size_t before = pending_infr_request_queue.size(); + pending_infr_request_queue.push_front(request); + size_t after = pending_infr_request_queue.size(); + // printf("\nEvicting request: %zu\n", request.guid); + // printf("Pending infr request queue size: %zu -> %zu\n", before, after); + // Remove the evicted request from planned_tokens_per_request + // before = planned_tokens_per_request.size(); + planned_tokens_per_request.erase( + std::remove_if( + planned_tokens_per_request.begin(), + planned_tokens_per_request.end(), + [request_to_evict](std::pair const &p) { + return p.first == request_to_evict; + }), + planned_tokens_per_request.end()); + // after = planned_tokens_per_request.size(); + // printf("planned_tokens_per_request size: %zu -> %zu\n", before, after); + } +} + void 
RequestManager::add_continuing_inf_req_to_new_batch( BatchConfig &new_bc, BatchConfig const &old_bc, int &num_active_req, int &num_concurrent_inf_adapters, int i) { + // printf("Entering add_continuing_inf_req_to_new_batch\n"); assert(new_bc.num_tokens < get_max_tokens_per_batch() && "Trying to add a continuing request when the batch is full"); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; @@ -1016,6 +1156,12 @@ void RequestManager::add_continuing_inf_req_to_new_batch( new_bc.tokensInfo[new_bc.num_tokens].token_id = request.tokens[depth]; new_bc.num_tokens++; } + + // record num tokens used in kv cache + PageManager *pm = PageManager::get_page_manager(); + pm->append_tokens(new_bc.requestsInfo[i].request_guid, + new_bc.requestsInfo[i].num_tokens_in_batch); + // Update profiling profiling_requests[new_bc.requestsInfo[i].request_guid].llm_decoding_steps++; } @@ -1024,28 +1170,27 @@ void RequestManager::add_new_inf_req(BatchConfig &new_bc, int &num_active_req, int &num_concurrent_inf_adapters, int i) { + // printf("Entering add_new_inf_req\n"); assert(!pending_infr_request_queue.empty() && "Trying to add a new inference request when there are none"); assert(new_bc.num_tokens < get_max_tokens_per_batch() && "Trying to add a new inference request when the batch is full"); - Request new_request = pending_infr_request_queue.front(); - assert(new_request.req_type == RequestType::REQ_INFERENCE); + assert(enough_space_to_add_request(new_bc, num_concurrent_inf_adapters) && + "Attempting to add a request that does not fit"); - // if the request has peft adapters and we are at capacity, don't add it yet - if (new_request.peft_model_id != PEFTModelID::NO_ID && - num_concurrent_inf_adapters == get_max_concurrent_adapters()) { - return; - } + Request &pq_request = pending_infr_request_queue.front(); + Request &new_request = all_requests[pq_request.guid]; + pending_infr_request_queue.pop_front(); - pending_infr_request_queue.pop(); + int prefill_tokens_first_batch = + std::min(get_max_tokens_per_batch() - new_bc.num_tokens, + (int)new_request.tokens.size()); new_bc.requestsInfo[i].first_token_depth_in_request = 0; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = new_request.guid; - new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, - (int)new_request.tokens.size()); + new_bc.requestsInfo[i].num_tokens_in_batch = prefill_tokens_first_batch; new_bc.requestsInfo[i].max_length = new_request.max_length; new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; if (new_request.peft_model_id != PEFTModelID::NO_ID) { @@ -1058,28 +1203,45 @@ void RequestManager::add_new_inf_req(BatchConfig &new_bc, num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add start time to profile_info for the new request - profiling_requests[new_request.guid].llm_decoding_steps = 1; - profiling_requests[new_request.guid].start_time = - Realm::Clock::current_time_in_microseconds(); + if (new_request.status == Request::EVICTED) { + assert(profiling_requests.find(new_request.guid) != + profiling_requests.end()); + } else { + profiling_requests[new_request.guid].llm_decoding_steps = 1; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); + } + for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; 
new_bc.tokensInfo[new_bc.num_tokens].request_index = i; new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = depth; assert(depth < new_request.tokens.size()); new_bc.tokensInfo[new_bc.num_tokens].token_id = new_request.tokens[depth]; + new_bc.num_tokens++; } - // Record request start time - InferenceReqProfileInfo inf_profile_info; - inf_profile_info.request_guid = new_request.guid; - inf_profile_info.decoding_step_idx = REQ_START_TIME_STEP_IDX; - inf_profile_info.timestamp = Realm::Clock::current_time_in_microseconds(); - inf_req_profile_infos.push_back(inf_profile_info); + if (new_request.status != Request::EVICTED) { + // Record request start time + InferenceReqProfileInfo inf_profile_info; + inf_profile_info.request_guid = new_request.guid; + inf_profile_info.decoding_step_idx = REQ_START_TIME_STEP_IDX; + inf_profile_info.timestamp = Realm::Clock::current_time_in_microseconds(); + inf_req_profile_infos.push_back(inf_profile_info); + } else { + new_request.status = Request::RUNNING; + } + + PageManager *pm = PageManager::get_page_manager(); + pm->add_request(new_request.guid, (int)new_request.tokens.size()); + pm->append_tokens(new_request.guid, + new_bc.requestsInfo[i].num_tokens_in_batch); } void RequestManager::handle_completed_finetuning_req( BatchConfig const &old_finetuning_bc) { + // printf("Entering handle_completed_finetuning_req\n"); if (!inference_finished) { assert( old_finetuning_bc.num_finetuning_bwd_requests() == 1 && @@ -1131,6 +1293,7 @@ void RequestManager::handle_completed_finetuning_req( } void RequestManager::add_finetuning_req_fwd_batch(BatchConfig &new_bc) { + // printf("Entering add_finetuning_req_fwd_batch\n"); assert(enable_peft_finetuning && "PEFT finetuning is not enabled"); assert(!pending_peft_request_queue.empty() && "Trying to add a new finetuning request when there are none"); @@ -1200,6 +1363,7 @@ void RequestManager::add_finetuning_req_fwd_batch(BatchConfig &new_bc) { } void RequestManager::add_finetuning_req_bwd_batch(BatchConfig &new_bc) { + // printf("Entering add_finetuning_req_bwd_batch\n"); assert(enable_peft_finetuning && "PEFT finetuning is not enabled"); assert(!pending_peft_request_queue.empty() && "Trying to add a new finetuning request when there are none"); @@ -1284,6 +1448,7 @@ void RequestManager::add_finetuning_req_bwd_batch(BatchConfig &new_bc) { } bool RequestManager::finetuning_fwd_work_available() { + // printf("Entering finetuning_fwd_work_available\n"); if (pending_peft_request_queue.empty() || inference_finished) { return false; } @@ -1292,6 +1457,7 @@ bool RequestManager::finetuning_fwd_work_available() { } bool RequestManager::finetuning_bwd_work_available() { + // printf("Entering finetuning_bwd_work_available\n"); if (pending_peft_request_queue.empty() || inference_finished) { return false; } @@ -1301,6 +1467,7 @@ bool RequestManager::finetuning_bwd_work_available() { void RequestManager::process_finetuning_req_fwd_progress( BatchConfig const &old_bc, InferenceResult const &result) { + // printf("Entering process_finetuning_req_fwd_progress\n"); assert(old_bc.num_finetuning_fwd_requests() + old_bc.num_finetuning_bwd_requests() <= 1 && @@ -1361,6 +1528,7 @@ void RequestManager::process_finetuning_req_fwd_progress( void RequestManager::process_finetuning_req_bwd_progress( BatchConfig const &old_bc) { + // printf("Entering process_finetuning_req_bwd_progress\n"); assert(old_bc.num_finetuning_fwd_requests() + old_bc.num_finetuning_bwd_requests() <= 1 && @@ -1400,6 +1568,7 @@ void 
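Note: the PageManager::add_request / append_tokens calls introduced above maintain, per request, the list of physical KV-cache pages and the fill level of the last page. The sketch below is a minimal illustration of that bookkeeping, assuming a fixed tokens-per-page granularity; TinyPageTable and its members are illustrative names, not the actual PageManager implementation. These are the same quantities (page count, physical page indices, last-page length) that the FlashInfer metadata construction in request_manager.cu reads back later.

#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <vector>

using Guid = uint64_t;

class TinyPageTable {
public:
  explicit TinyPageTable(int tokens_per_page) : tokens_per_page_(tokens_per_page) {}

  void add_request(Guid guid) {
    assert(entries_.count(guid) == 0);
    entries_[guid] = {};
  }

  // Record num_tokens new KV entries for this request, grabbing a fresh
  // physical page whenever the last page fills up.
  void append_tokens(Guid guid, int num_tokens) {
    Entry &e = entries_.at(guid);
    for (int t = 0; t < num_tokens; t++) {
      if (e.pages.empty() || e.last_page_len == tokens_per_page_) {
        e.pages.push_back(next_free_page_++);
        e.last_page_len = 0;
      }
      e.last_page_len++;
    }
  }

  int num_pages(Guid guid) const { return (int)entries_.at(guid).pages.size(); }
  int last_page_len(Guid guid) const { return entries_.at(guid).last_page_len; }
  std::vector<int32_t> const &page_indices(Guid guid) const {
    return entries_.at(guid).pages;
  }

private:
  struct Entry {
    std::vector<int32_t> pages; // physical page indices, in logical order
    int last_page_len = 0;      // tokens stored in the final page
  };
  int tokens_per_page_;
  int32_t next_free_page_ = 0;
  std::unordered_map<Guid, Entry> entries_;
};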
RequestManager::process_finetuning_req_bwd_progress( } void RequestManager::record_step_profile_info(BatchConfig const &old_bc) { + // printf("Entering record_step_profile_info\n"); StepProfileInfo step_profile_info; step_profile_info.step_idx = step_idx++; step_profile_info.run_idx = run_idx; @@ -1447,6 +1616,7 @@ void RequestManager::record_step_profile_info(BatchConfig const &old_bc) { void RequestManager::process_work_from_old_batch( BatchConfig const &old_bc, InferenceResult const &result) { + // printf("Entering process_work_from_old_batch\n"); const std::lock_guard lock(request_queue_mutex); if (verbose) { @@ -1471,6 +1641,7 @@ void RequestManager::process_work_from_old_batch( } BatchConfig RequestManager::prepare_next_bwd_batch(BatchConfig &new_bc) { + // printf("Entering prepare_next_bwd_batch\n"); const std::lock_guard lock(request_queue_mutex); if (finetuning_bwd_work_available()) { @@ -1488,6 +1659,7 @@ BatchConfig RequestManager::prepare_next_bwd_batch(BatchConfig &new_bc) { BatchConfig RequestManager::prepare_next_fwd_batch(BatchConfig const &old_bc, InferenceResult const &result) { + // printf("\nEntering prepare_next_fwd_batch\n"); const std::lock_guard lock(request_queue_mutex); if (verbose) { @@ -1506,10 +1678,16 @@ BatchConfig BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning; int num_concurrent_inf_adapters = 0; + // Step 2: evict any requests that will not fit in the kv cache + evict_requests_if_needed(old_bc, inference_batch_size); + // Step 2: prepare the next batch for existing inference requests for (int req_idx = 0; req_idx < inference_batch_size; req_idx++) { if (!old_bc.request_completed[req_idx] && - !inf_req_completed(old_bc, req_idx)) { + !inf_req_completed(old_bc, req_idx) && + !inf_req_evicted(old_bc, req_idx)) { + // printf("Adding continuing inference request %zu\n", + // old_bc.requestsInfo[req_idx].request_guid); add_continuing_inf_req_to_new_batch( new_bc, old_bc, num_active_req, num_concurrent_inf_adapters, req_idx); } @@ -1520,11 +1698,17 @@ BatchConfig // Step 3: add new inference requests to the next batch if there is space and // they are available if (!pending_infr_request_queue.empty()) { - for (int req_idx = 0; req_idx < inference_batch_size && - new_bc.num_tokens < get_max_tokens_per_batch() && - !pending_infr_request_queue.empty(); + // printf("pending_infr_request_queue.size(): %zu\n", + // pending_infr_request_queue.size()); + for (int req_idx = 0; + req_idx < inference_batch_size && + new_bc.num_tokens < get_max_tokens_per_batch() && + !pending_infr_request_queue.empty() && + enough_space_to_add_request(new_bc, num_concurrent_inf_adapters); req_idx++) { if (new_bc.request_completed[req_idx]) { + // printf("Adding new inference request %zu\n", + // pending_infr_request_queue.front().guid); add_new_inf_req( new_bc, num_active_req, num_concurrent_inf_adapters, req_idx); } @@ -2021,7 +2205,7 @@ BeamSearchBatchConfig if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { Request new_request = pending_infr_request_queue.front(); - pending_infr_request_queue.pop(); + pending_infr_request_queue.pop_front(); // all_requests[new_request.guid] = new_request; num_active_req++; new_bc.requestsInfo[i].first_token_depth_in_request = 0; diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 343f1dd6e..6644293fa 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -13,12 +13,21 @@ * limitations under the License. 
*/ +#include "flashinfer/decode_attention_decl.cuh" +#include "flashinfer/prefill_attention_decl.cuh" #include "flexflow/request_manager.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { using namespace Legion; +using flashinfer::BatchDecodeHandler; +using flashinfer::BatchPrefillHandler; +using flashinfer::LogitsPostHook; +using flashinfer::paged_kv_t; +using flashinfer::PageStorage; +using flashinfer::PosEncodingMode; +using flashinfer::QKVLayout; void RequestManager::load_tokens_task( Task const *task, @@ -27,6 +36,7 @@ void RequestManager::load_tokens_task( Runtime *runtime) { assert(regions.size() == 1); assert(task->regions.size() == 1); + // printf("Entering load_tokens_task\n"); // BatchConfig const batch_config = *((BatchConfig *)task->args); BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); @@ -78,6 +88,78 @@ void RequestManager::load_tokens_task( stream)); } +void prepare_inference_params_kernel_h( + BatchConfig const *batch_config, + std::vector<int32_t> &q_indptr_h, + std::vector<int32_t> &kv_indptr_h, + std::vector<int32_t> &kv_page_indices_h, + std::vector<int32_t> &kv_last_page_len_h) { + // printf("Entering prepare_inference_params_kernel_h\n"); + + PageManager *pm = PageManager::get_page_manager(); + + // std::cout << "prepare_inference_params_kernel_h: " << *batch_config << + // std::endl; + + q_indptr_h.clear(); + kv_indptr_h.clear(); + kv_page_indices_h.clear(); + kv_last_page_len_h.clear(); + + q_indptr_h.push_back(0); + kv_indptr_h.push_back(0); + + for (int req_idx = 0; req_idx < batch_config->max_requests_per_batch(); + req_idx++) { + if (batch_config->request_completed[req_idx] || + batch_config->requestsInfo[req_idx].finetuning_request) { + continue; + } + + // q_indptr: first token offset in batch, plus one token at the end + // representing the total number of tokens in batch + q_indptr_h.push_back( + q_indptr_h.back() + + batch_config->requestsInfo[req_idx].num_tokens_in_batch); + + // kv_indptr: starting index of KV cache pages for each request in logical + // page table + + int num_pages_used_by_req = pm->get_num_pages_used_by_req( + batch_config->requestsInfo[req_idx].request_guid); + assert(num_pages_used_by_req >= 1); + kv_indptr_h.push_back(kv_indptr_h.back() + num_pages_used_by_req); + + // kv_page_indices_h: physical indices of KV cache pages in use by each + // request (not just the pages used by the tokens in the current batch) + std::vector<int32_t> req_page_indices = pm->get_req_page_indices( + batch_config->requestsInfo[req_idx].request_guid); + kv_page_indices_h.insert(kv_page_indices_h.end(), + req_page_indices.begin(), + req_page_indices.end()); + + // kv_last_page_len_h: number of tokens in the last page in use by each + // request + kv_last_page_len_h.push_back(pm->get_num_tokens_in_last_used_page( + batch_config->requestsInfo[req_idx].request_guid)); + } + + // check sizes + int batch_size = batch_config->num_active_requests() - + batch_config->num_finetuning_fwd_requests() - + batch_config->num_finetuning_bwd_requests(); + assert(batch_size > 0); + // printf("q_indptr_h size: %lu\n", q_indptr_h.size()); + // printf("kv_indptr_h size: %lu\n", kv_indptr_h.size()); + // printf("kv_page_indices_h size: %lu\n", kv_page_indices_h.size()); + // printf("kv_last_page_len_h size: %lu\n", kv_last_page_len_h.size()); + // printf("batch_size: %i\n", batch_size); + assert(q_indptr_h.size() == batch_size + 1); + assert(kv_indptr_h.size() == batch_size + 1); + assert(kv_page_indices_h.size() >= batch_size); + assert(kv_last_page_len_h.size() == batch_size);
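Note: a small worked example may make the four arrays built above concrete. Assume, hypothetically, two active requests: request A holds 3 KV pages with 3 tokens in its last page and contributes 1 query token this step, while request B holds 2 pages with 36 tokens in its last page and contributes 30 query tokens. The physical page ids below are made up; only the layout matters.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int32_t> q_indptr = {0, 1, 31};      // prefix sum of query tokens per request
  std::vector<int32_t> kv_indptr = {0, 3, 5};      // prefix sum of pages per request
  std::vector<int32_t> kv_page_indices = {7, 2, 9, // A's physical pages
                                          0, 4};   // B's physical pages
  std::vector<int32_t> kv_last_page_len = {3, 36}; // tokens in each request's last page

  int batch_size = 2;
  // Same size relationships the asserts above enforce.
  assert((int)q_indptr.size() == batch_size + 1);
  assert((int)kv_indptr.size() == batch_size + 1);
  assert((int)kv_page_indices.size() == kv_indptr.back());
  assert((int)kv_last_page_len.size() == batch_size);
  return 0;
}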
+} + void RequestManager::load_batch_config_task( Task const *task, std::vector const ®ions, @@ -87,6 +169,7 @@ void RequestManager::load_batch_config_task( assert(task->regions.size() == 0); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); + // printf("Entering load_batch_config_task\n"); // BatchConfig const batch_config = *((BatchConfig *)task->args); BatchConfig const *batch_config = BatchConfig::from_future(task->futures[0]); @@ -156,6 +239,106 @@ void RequestManager::load_batch_config_task( cudaMemcpyHostToDevice, stream)); } + + // load attention metadata + int batch_size = batch_config->num_active_requests() - + batch_config->num_finetuning_fwd_requests() - + batch_config->num_finetuning_bwd_requests(); + if (batch_config->get_mode() == INC_DECODING_MODE && batch_size > 0 && + handle.incr_attention_metadata->enabled()) { + // assert(handle.incr_attention_metadata->enabled()); + // printf("Entering here, handler: %p\n", handle.incr_attention_metadata); + std::vector q_indptr_h; + std::vector kv_indptr_h; + std::vector kv_page_indices_h; + std::vector kv_last_page_len_h; + // calculate the attention meta data + prepare_inference_params_kernel_h(batch_config, + q_indptr_h, + kv_indptr_h, + kv_page_indices_h, + kv_last_page_len_h); + checkCUDA(cudaMemcpyAsync(handle.incr_attention_metadata->q_indptr, + q_indptr_h.data(), + sizeof(int32_t) * q_indptr_h.size(), + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(handle.incr_attention_metadata->kv_indptr, + kv_indptr_h.data(), + sizeof(int32_t) * kv_indptr_h.size(), + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(handle.incr_attention_metadata->kv_indices, + kv_page_indices_h.data(), + sizeof(int32_t) * kv_page_indices_h.size(), + cudaMemcpyHostToDevice, + stream)); + checkCUDA(cudaMemcpyAsync(handle.incr_attention_metadata->kv_last_page_len, + kv_last_page_len_h.data(), + sizeof(int32_t) * kv_last_page_len_h.size(), + cudaMemcpyHostToDevice, + stream)); + // prepare attention forward handler + if (handle.incr_attention_metadata->prompt_handler_collections.count( + batch_size) == 0) { + handle.incr_attention_metadata->prompt_handler_collections[batch_size] = + static_cast(new flashinfer::BatchPrefillHandler(true)); + } + BatchPrefillHandler *handler = static_cast( + handle.incr_attention_metadata->prompt_handler_collections[batch_size]); + handler->SetCUDAStream(stream); + // static int step=0; + PageManager *pm = PageManager::get_page_manager(); + // printf("BatchPrefillHandler %p\n", handler); + // std::cout << "STEP " << step << ": " << *pm << std::endl; + // step+=1; + // std::cout << "batch_config: " << *batch_config << std::endl; + // std::cout << "q_indptr_h: "; + // for (int i = 0; i < q_indptr_h.size(); i++) { + // std::cout << q_indptr_h[i] << " "; + // } + // std::cout << std::endl; + // std::cout << "kv_indptr_h: "; + // for (int i = 0; i < kv_indptr_h.size(); i++) { + // std::cout << kv_indptr_h[i] << " "; + // } + // std::cout << std::endl; + // std::cout << "kv_page_indices_h: "; + // for (int i = 0; i < kv_page_indices_h.size(); i++) { + // std::cout << kv_page_indices_h[i] << " "; + // } + // std::cout << std::endl; + // std::cout << "kv_last_page_len_h: "; + // for (int i = 0; i < kv_last_page_len_h.size(); i++) { + // std::cout << kv_last_page_len_h[i] << " "; + // } + // std::cout << std::endl; + // std::cout << "batch_size: " << batch_size << std::endl; + + // std::cout << "num_q_heads: " << + // handle.incr_attention_metadata->num_q_heads() << std::endl; std::cout << + // 
"num_kv_heads: " << handle.incr_attention_metadata->num_kv_heads() << + // std::endl; std::cout << "head_dim: " << + // handle.incr_attention_metadata->head_dim() << std::endl; std::cout << + // "tokens_per_page: " << pm->get_tokens_per_page() << std::endl; std::cout + // << "float_workspace_size: " << + // handle.incr_attention_metadata->float_workspace_size << std::endl; + // std::cout << "int_workspace_size: " << + // handle.incr_attention_metadata->int_workspace_size << std::endl; + + handler->BeginForward( + static_cast(handle.incr_attention_metadata->float_workspace), + handle.incr_attention_metadata->float_workspace_size, + static_cast(handle.incr_attention_metadata->int_workspace), + handle.incr_attention_metadata->int_workspace_size, + static_cast(q_indptr_h.data()), + static_cast(kv_indptr_h.data()), + batch_size, + handle.incr_attention_metadata->num_q_heads(), + handle.incr_attention_metadata->num_kv_heads(), + handle.incr_attention_metadata->head_dim(), + pm->get_tokens_per_page()); + } } void RequestManager::load_positions_task( diff --git a/tests/fine_grained_alignment_test.sh b/tests/fine_grained_alignment_test.sh index 4baaa53ab..c4afa00bd 100755 --- a/tests/fine_grained_alignment_test.sh +++ b/tests/fine_grained_alignment_test.sh @@ -9,7 +9,7 @@ TP_DEGREE=${TP_DEGREE:-2} PP_DEGREE=${PP_DEGREE:-2} CACHE_PATH=${FF_CACHE_PATH:-"~/.cache/flexflow"} NUM_STEPS=${NUM_STEPS:-2} -FULL_PRECISION=${FULL_PRECISION:-true} +FULL_PRECISION=${FULL_PRECISION:-false} FUSION=${FUSION:-true} # Token to access private huggingface models (e.g. LLAMA-2) @@ -55,9 +55,10 @@ then fi MAX_LENGTH=$((PROMPT_LENGTH + NUM_STEPS + 1)) - if [ "$FULL_PRECISION" = "true" ]; then full_precision_flag="--use-full-precision"; else full_precision_flag=""; fi -python ./tests/inference/huggingface_inference.py \ +if [ "$FUSION" = "true" ]; then fusion_flag="--fusion"; else fusion_flag=""; fi + +eval python ./tests/inference/huggingface_inference.py \ --model-name "${MODEL_NAME}" \ --max-length "${MAX_LENGTH}" \ --prompt-file ../../inference/prompt/test.json \ @@ -91,16 +92,17 @@ echo "$json_config" > ./fine_grained_alignment_config.json python ./inference/python/incr_decoding.py -config-file ./fine_grained_alignment_config.json -# # C++ test -# echo "C++ test" -# ./build/inference/incr_decoding/incr_decoding \ -# -ll:gpu 2 -ll:cpu 4 -ll:util 4 \ -# -tensor-parallelism-degree 2 \ -# -ll:fsize 8192 -ll:zsize 12000 \ -# -llm-model $MODEL_NAME \ -# -prompt ./inference/prompt/peft.json \ -# --use-full-precision \ -# --inference-debugging +# C++ test +echo "C++ test" +eval ./build/inference/incr_decoding/incr_decoding \ + -ll:gpu "${NUM_GPUS}" -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree "${TP_DEGREE}" \ + -pipeline-parallelism-degree "${PP_DEGREE}" \ + -ll:fsize "${MEMORY_PER_GPU}" -ll:zsize "${ZCOPY_MEMORY}" \ + -llm-model "${MODEL_NAME}" \ + -prompt ./inference/prompt/test.json \ + --max-length $MAX_LENGTH \ + "${full_precision_flag}" "${fusion_flag}" --inference-debugging # Check alignment python ./tests/inference/inference_alignment_test.py -m "$MODEL_NAME" -tp "$TP_DEGREE" -n "$NUM_STEPS" diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index c8d680a87..2848f0a6a 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -2,264 +2,171 @@ set -x set -e -# Cd into directory holding this script -cd "${BASH_SOURCE[0]%/*}" - -# Enable model parallelism tests in C++, if desired 
-TENSOR_PARALLELISM_TESTS=${TENSOR_PARALLELISM_TESTS:-OFF} - -# Download models -python3 ../../inference/utils/download_hf_model.py "meta-llama/Llama-2-7b-hf" "JackFram/llama-160m" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b" - -############################################################################################### -############################ Speculative inference tests ###################################### -############################################################################################### - -# LLAMA -../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 -# LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 - -# OPT -../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 -# OPT (half precision) -../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 - -# Tensor parallelism tests -if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then - # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - # LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - - # OPT - ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - # OPT (half precision) - ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 -fi - 
-############################################################################################### -############################ Incremental decoding tests ####################################### -############################################################################################### - -# LLAMA (small model) -../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 - -../../build/inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 1 - -# LLAMA (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 - -# LLAMA (big model) -../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 -# LLAMA (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 - -# OPT (small model) -../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 -# OPT (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 - -# OPT (big model) -../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 -# OPT (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 - -# Falcon (full precision) -../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt 
-pipeline-parallelism-degree 4 -# Falcon (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 - -# # StarCoder (full precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 -# # StarCoder (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 - -# Tensor parallelism tests -if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then - # LLAMA (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 - # LLAMA (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 - - # LLAMA (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - # LLAMA (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - - # OPT (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file 
../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 - # OPT (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 - - # OPT (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - # OPT (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 -fi - -############################################################################################### -############################### Alignment and Speed tests ##################################### -############################################################################################### - -##################################### Helper functions ####################################### -function check_partial_token_match { - local file1="$1" - local file2="$2" - local num_tokens_to_match=30 - - # Read the second line of the first file - third_line=$(sed -n '3p' "$file1") - read -r line1 <<< "$third_line" - tokens1=${line1#*: } - IFS=',' read -ra arr1 <<< "$tokens1" - - # Read the second line of the second file - third_line=$(sed -n '3p' "$file2") - read -r line2 <<< "$third_line" - tokens2=${line2#*: } - IFS=',' read -ra arr2 <<< "$tokens2" - - # Compare the first few integers in the two lists - for ((i = 0; i < num_tokens_to_match; i++)); do - if [[ "${arr1[$i]}" != "${arr2[$i]}" ]]; then - echo "The first $num_tokens_to_match tokens in files $file1 and $file2 are not identical." - exit 1 - fi - done - #echo "The first $num_tokens_to_match integers are identical." -} - -function compare_speed_spec_infer_incr_decoding { - local incrDec_file="$1" - local specInf_file="$2" - - # Read the float numbers from the first line of the files - incrDec=$(sed -n '1 s/end-to-end latency: \(.*\)/\1/p' "$incrDec_file") - specInf=$(sed -n '1 s/end-to-end latency: \(.*\)/\1/p' "$specInf_file") - - if ! command -v bc &> /dev/null; then - echo "bc is not installed. Installing..." - sudo apt-get install -y bc +# Cd into root directory of repo +cd "${BASH_SOURCE[0]%/*}/../.." 
+ +# Function to launch specinfer with flags from a JSON config file. +run_cpp_inference() { + local config_file="$1" + + # Check that a config file was provided and exists + if [[ -z "$config_file" ]]; then + echo "Usage: launch_specinfer " + return 1 + fi + if [[ ! -f "$config_file" ]]; then + echo "Config file not found: $config_file" + return 1 + fi + + # Check for mandatory keys in the config file (including model_name) + for req in num_gpus memory_per_gpu zero_copy_memory_per_node llm_model; do + if ! jq -e --arg key "$req" 'has($key)' "$config_file" >/dev/null; then + echo "Error: Missing required parameter: $req" + return 1 fi - - # Perform the comparison - threshold=$(bc <<< "$specInf * 1.5") - if (( $(echo "$incrDec >= $threshold" | bc -l) )); then - #echo "The latency in $specInf_file is at least 1.5x smaller than the latency from $incrDec_file." - : - else - echo "Error: The latency in $specInf_file is not at least 1.5x smaller than the latency in $incrDec_file!" - exit 1 + done + + # Download the model using the model_name key + llm_model=$(jq -r '.llm_model' "$config_file") + echo "Downloading model: $llm_model" + if ! python3 ./inference/utils/download_hf_model.py "$llm_model" --half-precision-only; then + echo "Error: Failed to download model $llm_model" + return 1 + fi + + # Declare an associative array mapping config keys to system flags + declare -A ff_arg_to_sysarg=( + ["num_gpus"]="-ll:gpu" + ["memory_per_gpu"]="-ll:fsize" + ["zero_copy_memory_per_node"]="-ll:zsize" + ["cpu_memory_per_node"]="-ll:csize" + ["num_cpus"]="-ll:cpu" + ["legion_utility_processors"]="-ll:util" + ["data_parallelism_degree"]="-data-parallelism-degree" + ["tensor_parallelism_degree"]="-tensor-parallelism-degree" + ["pipeline_parallelism_degree"]="-pipeline-parallelism-degree" + ["offload"]="-offload" + ["offload_reserve_space_size"]="-offload-reserve-space-size" + ["use_4bit_quantization"]="--4bit-quantization" + ["use_8bit_quantization"]="--8bit-quantization" + ["enable_peft"]="-enable-peft" + ["profiling"]="--profiling" + ["benchmarking"]="--benchmarking" + ["inference_debugging"]="--inference-debugging" + ["fusion"]="--fusion" + ["llm_model"]="-llm-model" + # ["cache_path"]="-cache-folder" + ["full_precision"]="--use-full-precision" + ["prompt"]="-prompt" + ["output_file"]="-output-file" + ["max_seq_length"]="--max-sequence-length" + ["max_requests_per_batch"]="--max-requests-per-batch" + ["max_length"]="--max-length" + ["max_tokens_per_batch"]="--max-tokens-per-batch" + ["log_instance_creation"]="--log-instance-creation" + ["disable_control_replication"]="--disable-control-replication" + ["dataset"]="--dataset" + ["enable_inplace_optimizations"]="--enable-inplace-optimization" + ) + + # Build the command line arguments array + args=() + + # Process keys in the order they appear in the JSON file. + # Use jq to output tab-separated key-value pairs. + while IFS=$'\t' read -r key value; do + # Skip "model_name" (already used for downloading). + if [[ "$key" == "model_name" ]]; then + continue fi -} -function compare_decoding_steps_spec_infer_incr_decoding { - local incrDec_file="$1" - local specInf_file="$2" - - # Read the number of decoding steps from the second line of the files - second_line=$(sed -n '2p' "$incrDec_file") - read -r line <<< "$second_line" - incrDec=${line#*: } - second_line=$(sed -n '2p' "$specInf_file") - read -r line <<< "$second_line" - specInf=${line#*: } - - if ! command -v bc &> /dev/null; then - echo "bc is not installed. Installing..." 
- sudo apt-get install -y bc + # Process the "ssms" block specially. + if [[ "$key" == "ssms" ]]; then + # For each element in the "ssms" array, download and add the -ssm-model flag. + ssm_models=$(jq -r '.ssms[] | .ssm_model' "$config_file") + for ssm_model in $ssm_models; do + echo "Downloading ssm_model: $ssm_model" + if ! python3 ./inference/utils/download_hf_model.py "$ssm_model" --half-precision-only; then + echo "Error: Failed to download ssm_model $ssm_model" + return 1 + fi + args+=("-ssm-model" "$ssm_model") + done + continue fi - - # Perform the comparison - threshold=$(bc <<< "$specInf * 1.5") - if (( $(echo "$incrDec >= $threshold" | bc -l) )); then - #echo "The decoding steps in $specInf_file are at least 1.5x less than those in $incrDec_file." - : - else - echo "Error: The decoding steps in $specInf_file are not at least 1.5x less than those in $incrDec_file!" - exit 1 + + # If the key is recognized in the mapping, add the corresponding flag. + if [[ -n "${ff_arg_to_sysarg[$key]}" ]]; then + flag="${ff_arg_to_sysarg[$key]}" + if [[ "$value" == "true" ]]; then + args+=("$flag") + elif [[ "$value" == "false" ]]; then + continue + else + args+=("$flag" "$value") + fi fi + done < <(jq -r 'to_entries[] | "\(.key)\t\(.value|tostring)"' "$config_file") + + # Determine which executable to run based on file contents: + # Use ./incr_dec if the file contains "incr_dec", else if "spec_infer" is found use ./specinfer. + if [[ $config_file == *"incr_dec"* ]]; then + executable="./build/inference/incr_decoding/incr_decoding" + elif [[ $config_file == *"spec_infer"* ]]; then + executable="./build/inference/spec_infer/spec_infer" + else + echo "Error: Config file does not specify a valid mode (incr_dec or spec_infer)" + return 1 + fi + + # Launch the chosen program with the constructed arguments + $executable "${args[@]}" } -############ Alignment between speculative inference and incremental decoding ################# -# Full precision -diff <(tail -n +3 "../../inference/output/incr_decoding_llama_2_7B.txt") <(tail -n +3 "../../inference/output/spec_inference_llama.txt") -diff <(tail -n +3 "../../inference/output/incr_decoding_opt_6B.txt") <(tail -n +3 "../../inference/output/spec_inference_opt.txt") -# Half precision -check_partial_token_match "../../inference/output/incr_decoding_llama_2_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" -check_partial_token_match "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" - -# Speed test: speculative inference should be at very least 1.5x faster than incremental decoding -# Full precision -#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B.txt" "../../inference/output/spec_inference_llama.txt" -#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B.txt" "../../inference/output/spec_inference_opt.txt" -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B.txt" "../../inference/output/spec_inference_llama.txt" -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B.txt" "../../inference/output/spec_inference_opt.txt" -# Half precision -#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" -#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B_half.txt" 
"../../inference/output/spec_inference_opt_half.txt" -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" - -############ Alignment between tensor model parallelism and pipeline parallelism only ################# -if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then - diff <(tail -n +3 "../../inference/output/spec_inference_llama_tp.txt") <(tail -n +3 "../../inference/output/spec_inference_llama.txt") - diff <(tail -n +3 "../../inference/output/spec_inference_opt_tp.txt") <(tail -n +3 "../../inference/output/spec_inference_opt.txt") - check_partial_token_match "../../inference/output/spec_inference_llama_half_tp.txt" "../../inference/output/spec_inference_llama_half.txt" - check_partial_token_match "../../inference/output/spec_inference_opt_half_tp.txt" "../../inference/output/spec_inference_opt_half.txt" - diff <(tail -n +3 "../../inference/output/incr_decoding_llama_160M_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_160M.txt") - check_partial_token_match "../../inference/output/incr_decoding_llama_160M_half_tp.txt" "../../inference/output/incr_decoding_llama_160M_half.txt" - diff <(tail -n +3 "../../inference/output/incr_decoding_llama_2_7B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_2_7B.txt") - check_partial_token_match "../../inference/output/incr_decoding_llama_2_7B_half_tp.txt" "../../inference/output/incr_decoding_llama_2_7B_half.txt" - diff <(tail -n +3 "../../inference/output/incr_decoding_opt_125M_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_opt_125M.txt") - check_partial_token_match "../../inference/output/incr_decoding_opt_125M_half_tp.txt" "../../inference/output/incr_decoding_opt_125M_half.txt" - diff <(tail -n +3 "../../inference/output/incr_decoding_opt_6B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_opt_6B.txt") - check_partial_token_match "../../inference/output/incr_decoding_opt_6B_half_tp.txt" "../../inference/output/incr_decoding_opt_6B_half.txt" -fi - -######################### Alignment tests with HuggingFace #################################### - -# LLAMA (small model, full precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu - -# LLAMA (small model, half precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu - -# LLAMA (big model, full precision) -python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_2_7B.txt" -# LLAMA (big model, half precision) -python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_2_7B_half.txt" --gpu +############## Create prompt ################################ +# Clean up before test +rm -rf inference/prompt inference/output inference/inf_test_configs || true +# Create test prompt file +mkdir -p ./inference/prompt +echo '["Three 
tips for staying healthy are: "]' > ./inference/prompt/test.json -# OPT (small model, full precision) -python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 -# OPT (small model, half precision) -python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M_half.txt" --gpu --max-length 128 +############## Run inference in flexflow-serve ############## -# OPT (big model, full precision) -python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B.txt" --max-length 128 +echo "Running inference in flexflow-serve (C++)..." -# OPT (big model, half precision) -# python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_6B_half.txt" --gpu --max-length 128 +# Generate test configs +rm -rf ./inference/inf_test_configs/*.json || true +python ./tests/inference/generate_inf_test_configs.py -# Falcon (full precision) -python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 +# Loop through .json files in the ./inference/inf_test_configs dir +for file in ./inference/inf_test_configs/*.json; do + # Run script + run_cpp_inference "$file" +done +############## Run inference in HuggingFace ############## -diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +4 "../../inference/output/incr_decoding_llama_160M.txt") -diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_llama_2_7B.txt" <(tail -n +4 "../../inference/output/incr_decoding_llama_2_7B.txt") -diff <( < ../../inference/output/huggingface_llama_2_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_decoding_llama_2_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +echo "Running inference in huggingface..." 
-diff "../../inference/output/huggingface_opt_125M.txt" <(tail -n +4 "../../inference/output/incr_decoding_opt_125M.txt") -diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +4 "../../inference/output/incr_decoding_opt_6B.txt") -# diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +4 "../../inference/output/incr_decoding_opt_6B_half.txt") -diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +4 "../../inference/output/incr_decoding_falcon_7B.txt") +model_names=( + "meta-llama/Llama-3.1-8B-Instruct" + "meta-llama/Llama-3.2-1B-Instruct" + "facebook/opt-6.7b" + "facebook/opt-125m" +) +for model_name in "${model_names[@]}"; do + # set model_name_ to the content of model_name after the first "/", transformed into lowercase + model_name_=$(echo "$model_name" | cut -d'/' -f2 | tr '[:upper:]' '[:lower:]') + python ./tests/inference/huggingface_inference.py \ + --model-name "$model_name" \ + --max-length 255 \ + --prompt-file "${PWD}/inference/prompt/test.json" \ + --output-file "${PWD}/inference/output/huggingface_$model_name_.json" +done +############## Check alignment between results ############## +echo "Checking alignment of results..." +pytest -v ./tests/inference/test_inference_output.py diff --git a/tests/inference/generate_inf_test_configs.py b/tests/inference/generate_inf_test_configs.py index 15a1af681..982fbfe71 100644 --- a/tests/inference/generate_inf_test_configs.py +++ b/tests/inference/generate_inf_test_configs.py @@ -167,4 +167,4 @@ def gen_spec_configs(prompt_file, output_folder, specinfer_model_pairs, parallel parallelism_settings=[Parallelism(4, 1)], full_precision_settings=[False,], config_output_folder=config_output_folder - ) \ No newline at end of file + ) diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh index 120d3a58b..99f53ecc6 100755 --- a/tests/inference_tests.sh +++ b/tests/inference_tests.sh @@ -46,7 +46,7 @@ export LEGION_BACKTRACE=1 ############## Run inference in flexflow-serve ############## -echo "Running inference in flexflow-serve..." +echo "Running inference in flexflow-serve (python)..." # Generate test configs rm -rf ./inference/inf_test_configs/*.json || true @@ -61,7 +61,7 @@ for file in ./inference/inf_test_configs/*.json; do script="./inference/python/spec_infer.py" fi # Run script - python "$script" -config-file "$file" + python "$script" -config-file "$file" done ############## Run inference in HuggingFace ############## diff --git a/tests/peft_test.sh b/tests/peft_test.sh index e497d4224..f7feebcae 100755 --- a/tests/peft_test.sh +++ b/tests/peft_test.sh @@ -33,7 +33,7 @@ export LEGION_BACKTRACE=1 # Download test model python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora -# Run PEFT in Huggingface to get ground truth tensors +# # Run PEFT in Huggingface to get ground truth tensors python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision -lr 0.001 # Python test