FlexLLM (part 4) (#107)
* add flashinfer dep

* backup

* backup2

* compiles

* lint

* fwd/bwd handlers

* backup

* backup

* cleanup

* fixes

* backup

* fix

* fix

* fix

* restore multi-stream

* fix

* fix peft test

* fix peft test

* update

* multi batch fix

* fixes

* update

* update

* rocm fix

* shellcheck
goliaro authored Mar 6, 2025
1 parent 2e764a9 commit a63cba1
Showing 75 changed files with 4,015 additions and 2,289 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/gpu-ci.yml
@@ -129,7 +129,7 @@ jobs:
          CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }}
        run: |
          source ./build/set_python_envs.sh
-         ./tests/fine_grained_alignment_test.sh
+         # ./tests/fine_grained_alignment_test.sh
          ./tests/inference_tests.sh
      - name: Run PEFT tests
5 changes: 4 additions & 1 deletion .gitmodules
@@ -16,4 +16,7 @@
[submodule "deps/tokenizers-cpp"]
path = deps/tokenizers-cpp
url = https://github.com/mlc-ai/tokenizers-cpp.git
fetchRecurseSubmodules = true
fetchRecurseSubmodules = true
[submodule "deps/flashinfer"]
path = deps/flashinfer
url = https://github.com/flashinfer-ai/flashinfer.git
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -197,6 +197,9 @@ include(variant)
# optional
include(optional)

# flashinfer
list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/flashinfer/include)

if (FF_GPU_BACKEND STREQUAL "cuda")
  list(APPEND FF_CC_FLAGS
    -DFF_USE_CUDA)
@@ -220,7 +223,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
  list(APPEND FF_CC_FLAGS
    -DFF_DEBUG)
  list(APPEND FF_NVCC_FLAGS
-   -DFF_DEBUG)
+   -DFF_DEBUG -lineinfo)
endif()

message(STATUS "FlexFlow MAX_DIM: ${FF_MAX_DIM}")
1 change: 1 addition & 0 deletions deps/flashinfer
Submodule flashinfer added at be6bf5
209 changes: 209 additions & 0 deletions include/flexflow/attention_config.h
@@ -0,0 +1,209 @@
/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _FLEXFLOW_ATTENTION_CONFIG_H_
#define _FLEXFLOW_ATTENTION_CONFIG_H_
#include "flexflow/batch_config.h"

namespace FlexFlow {

constexpr uint32_t kPagesize = 64;

inline int ceilDiv(int const a, int const b) {
  assert(b != 0 && "Attempting to divide by 0");
  assert(a >= 0 && b > 0 && "Expected non-negative numbers");
  return (a + b - 1) / b;
}

inline int round_up_pages(int const num_elements) {
  return ceilDiv(num_elements, kPagesize);
}

#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...)                             \
  switch (head_dim) {                                                         \
    case 64: {                                                                \
      constexpr size_t HEAD_DIM = 64;                                         \
      __VA_ARGS__                                                             \
      break;                                                                  \
    }                                                                         \
    case 128: {                                                               \
      constexpr size_t HEAD_DIM = 128;                                        \
      __VA_ARGS__                                                             \
      break;                                                                  \
    }                                                                         \
    case 256: {                                                               \
      constexpr size_t HEAD_DIM = 256;                                        \
      __VA_ARGS__                                                             \
      break;                                                                  \
    }                                                                         \
    default: {                                                                \
      std::ostringstream err_msg;                                             \
      err_msg << "Unsupported head_dim: " << head_dim;                        \
      throw std::invalid_argument(err_msg.str());                             \
    }                                                                         \
  }

class AttentionMetaData {
public:
  AttentionMetaData() {
    num_q_heads_ = 0;
    num_kv_heads_ = 0;
    head_dim_ = 0;
    q_indptr = nullptr;
    kv_indptr = nullptr;
    kv_indices = nullptr;
    kv_last_page_len = nullptr;
    qk_indptr = nullptr;
    custom_mask = nullptr;
    workspace = nullptr;
    workspace_size = 0;
    float_workspace = nullptr;
    float_workspace_size = 0;
    int_workspace = nullptr;
    int_workspace_size = 0;
    mem_size_ = 0;
    enabled_ = false;
  }
  AttentionMetaData(AttentionMetaData const &rhs) {
    num_q_heads_ = rhs.num_q_heads_;
    num_kv_heads_ = rhs.num_kv_heads_;
    head_dim_ = rhs.head_dim_;
    q_indptr = rhs.q_indptr;
    kv_indptr = rhs.kv_indptr;
    kv_indices = rhs.kv_indices;
    kv_last_page_len = rhs.kv_last_page_len;
    qk_indptr = rhs.qk_indptr;
    custom_mask = rhs.custom_mask;
    workspace = rhs.workspace;
    workspace_size = rhs.workspace_size;
    float_workspace = rhs.float_workspace;
    float_workspace_size = rhs.float_workspace_size;
    int_workspace = rhs.int_workspace;
    int_workspace_size = rhs.int_workspace_size;
    mem_size_ = rhs.mem_size_;
    enabled_ = rhs.enabled_;
    decode_handler_collections = rhs.decode_handler_collections;
    prompt_handler_collections = rhs.prompt_handler_collections;
  }

  size_t mem_size() {
    if (mem_size_ > 0) {
      return mem_size_;
    }
    size_t batch_size = BatchConfig::max_requests_per_batch();
    size_t max_num_pages = round_up_pages(BatchConfig::max_sequence_length());
    size_t indices_size = std::max(
        (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
    size_t custom_mask_size = 0;

    float_workspace_size = 128 * 1024 * 1024; // 128 MB
    int_workspace_size = 8 * 1024 * 1024;     // 8 MB
    workspace_size =
        float_workspace_size + int_workspace_size; // float + int workspace

    mem_size_ = alignTo(sizeof(int32_t) * indices_size +
                            sizeof(uint8_t) * custom_mask_size + workspace_size,
                        16);
    return mem_size_;
  }

  void assign_address(void *ptr, int size) {
    if (ptr == nullptr) {
      q_indptr = nullptr;
      kv_indptr = nullptr;
      kv_indices = nullptr;
      kv_last_page_len = nullptr;
      qk_indptr = nullptr;
      custom_mask = nullptr;
      workspace = nullptr;
      float_workspace = nullptr;
      int_workspace = nullptr;
      return;
    }
    assert(size >= mem_size() &&
           "Insufficient memory size for attention metadata");
    size_t batch_size = BatchConfig::max_requests_per_batch();
    size_t max_num_pages = round_up_pages(BatchConfig::max_sequence_length());
    size_t indices_size = std::max(
        (batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
    size_t custom_mask_size = 0;

    q_indptr = static_cast<int32_t *>(ptr);
    kv_indptr = q_indptr + batch_size + 1;
    kv_indices = kv_indptr + batch_size + 1;
    kv_last_page_len = kv_indices + max_num_pages * batch_size;
    qk_indptr = kv_last_page_len + batch_size + 1;
    custom_mask = static_cast<uint8_t *>(ptr) + sizeof(int32_t) * indices_size;
    workspace = static_cast<void *>(static_cast<uint8_t *>(ptr) +
                                    sizeof(int32_t) * indices_size +
                                    sizeof(uint8_t) * custom_mask_size);
    float_workspace = workspace;
    int_workspace = static_cast<void *>(static_cast<uint8_t *>(workspace) +
                                        float_workspace_size);
  }

  void set_num_q_heads(uint32_t const num_q_heads) {
    num_q_heads_ = num_q_heads;
  }
  void set_num_kv_heads(uint32_t const num_kv_heads) {
    num_kv_heads_ = num_kv_heads;
  }
  void set_head_dim(uint32_t const head_dim) {
    head_dim_ = head_dim;
  }
  uint32_t num_q_heads() const {
    return num_q_heads_;
  }
  uint32_t num_kv_heads() const {
    return num_kv_heads_;
  }
  uint32_t head_dim() const {
    return head_dim_;
  }

  void set_enabled(bool const enabled) {
    enabled_ = enabled;
  }
  bool enabled() const {
    return enabled_;
  }

  uint32_t num_q_heads_;
  uint32_t num_kv_heads_;
  uint32_t head_dim_;

  int32_t *q_indptr;
  int32_t *kv_indptr;
  int32_t *kv_indices;
  int32_t *kv_last_page_len;
  int32_t *qk_indptr;
  uint8_t *custom_mask;
  void *workspace;
  size_t workspace_size;
  void *float_workspace;
  size_t float_workspace_size;
  void *int_workspace;
  size_t int_workspace_size;

  size_t mem_size_;

  bool enabled_;

  // batch size -> handler
  std::unordered_map<int, void *> decode_handler_collections;
  std::unordered_map<int, void *> prompt_handler_collections;
};
} // namespace FlexFlow

#endif // _FLEXFLOW_ATTENTION_CONFIG_H_
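An illustrative usage sketch (not code from this commit) may help: size the metadata, bind it to device memory, then dispatch on the head dimension. The head counts and the cudaMalloc call below are assumptions made for the example; the real system presumably carves this region out of memory it manages itself.

#include "flexflow/attention_config.h"
#include <cuda_runtime.h>

using namespace FlexFlow;

void init_incr_attention(AttentionMetaData *meta) {
  // Example shapes (assumed): 32 query heads, 8 KV heads, head_dim 128.
  meta->set_num_q_heads(32);
  meta->set_num_kv_heads(8);
  meta->set_head_dim(128);

  // mem_size() = indptr/indices arrays (at least 1M int32 elements) +
  // custom mask (currently 0 bytes) + 128 MB float workspace + 8 MB int
  // workspace, rounded up to a multiple of 16 bytes.
  size_t bytes = meta->mem_size();
  void *ptr = nullptr;
  cudaMalloc(&ptr, bytes); // stand-in for FlexFlow's own allocation path
  meta->assign_address(ptr, static_cast<int>(bytes));
  meta->set_enabled(true);

  // With kPagesize = 64, a 1000-token sequence occupies round_up_pages(1000)
  // = 16 pages. DISPATCH_HEADDIM exposes the runtime head_dim as the
  // compile-time constant HEAD_DIM (64, 128, or 256) inside the block:
  DISPATCH_HEADDIM(meta->head_dim(), HEAD_DIM, {
    // e.g. launch a flashinfer kernel templated on HEAD_DIM here
    // (hypothetical): run_attention<HEAD_DIM>(meta);
  });
}

Note how assign_address carves one flat allocation into q_indptr, kv_indptr, kv_indices, kv_last_page_len, qk_indptr, the (currently empty) custom mask, and the float/int workspaces, in that order.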
26 changes: 26 additions & 0 deletions include/flexflow/batch_config.h
@@ -29,6 +29,10 @@

namespace FlexFlow {

inline int alignTo(int x, int y) {
  return ((x + y - 1) / y) * y;
}

class InferenceResult;
class BeamInferenceResult;

@@ -58,19 +62,32 @@ class BatchConfig {
  static const RequestGuid INVALID_GUID = 0;
  using TokenId = int;
  BatchConfig();
  // includes both FWD and BWD finetuning requests
  int num_active_requests() const;
  // returns number of inference and finetuning FWD tokens
  int num_active_tokens() const;

  // returns number of inference-only tokens
  int num_inference_tokens() const;
  int num_inference_requests() const;

  // return the index where the finetuning request would be stored (i.e. last
  // slot of the batch)
  int finetuning_request_index() const;
  // returns the number of finetuning FWD requests, or 0 if there is none
  int num_finetuning_fwd_requests() const;

  int num_finetuning_fwd_tokens() const;
  int num_finetuning_bwd_requests() const;
  int num_finetuning_bwd_tokens() const;

  bool peft_bwd_applies_to_this_layer(int layer) const;
  static int max_requests_per_batch();
  static int max_tokens_per_batch();
  static int max_verify_tokens_per_batch();
  static int max_spec_tree_token_num();
  static int max_sequence_length();

  friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
  void print() const;
  void save_to_file(std::string const &filename) const;
@@ -111,6 +128,15 @@ class BatchConfig {
    int num_tokens_in_batch;
    int max_length;

    // paged attention
    static constexpr size_t request_guid_size = sizeof(RequestGuid);
    static constexpr size_t alignment = 16;
    static constexpr size_t padding_size =
        (alignment - (sizeof(int) * 3 + request_guid_size) % alignment) %
        alignment;
    static constexpr size_t padding_length = padding_size / sizeof(int);
    int padding[padding_length] = {}; // Padding for memory pointer alignment

    // request id in batch config:
    int batch_config_request_id = -1;
    bool prompt_phase = false;
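As a sanity check on the padding arithmetic above, the constants work out as follows (a standalone sketch assuming a 4-byte int and an 8-byte RequestGuid; the guid's actual size is not shown in this hunk):

#include <cstddef>

constexpr std::size_t request_guid_size = 8; // assumed sizeof(RequestGuid)
constexpr std::size_t alignment = 16;
constexpr std::size_t payload = sizeof(int) * 3 + request_guid_size; // 20 bytes
constexpr std::size_t padding_size =
    (alignment - payload % alignment) % alignment; // (16 - 20 % 16) % 16 = 12
constexpr std::size_t padding_length = padding_size / sizeof(int); // 3 ints

// 20 bytes of fields round up to 32, leaving 12 bytes, i.e. three ints of
// padding. The outer % alignment makes the padding 0 when the payload is
// already a multiple of 16.
static_assert(padding_size == 12 && padding_length == 3, "layout check");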
13 changes: 9 additions & 4 deletions include/flexflow/config.h
@@ -16,6 +16,7 @@
#ifndef _FLEXFLOW_CONFIG_H_
#define _FLEXFLOW_CONFIG_H_
#include "ffconst.h"
#include "flexflow/attention_config.h"
#include "flexflow/batch_config.h"
#include "legion.h"
#include <cstring>
@@ -87,16 +88,19 @@ struct CombinedBatchConfigMetaStruct {

struct FFHandler {
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
-  cudnnHandle_t dnn;
-  cublasHandle_t blas;
+  cudnnHandle_t dnn, peft_dnn;
+  cublasHandle_t blas, peft_blas;
#else
-  miopenHandle_t dnn;
-  hipblasHandle_t blas;
+  miopenHandle_t dnn, peft_dnn;
+  hipblasHandle_t blas, peft_blas;
#endif
  void *workSpace;
  size_t workSpaceSize;
  CombinedBatchConfigMetaStruct *batch_config_metadata;

  // flashinfer
  AttentionMetaData *incr_attention_metadata;

  // request info + token info + topology mask info
  size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct);
  void *offload_reserve_space;
@@ -106,6 +110,7 @@
  bool allowTensorOpMathConversion;
#ifdef FF_USE_NCCL
  ncclComm_t ncclComm;
  ncclComm_t ncclCommPeft;
#endif
};

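The duplicated handles (peft_dnn and peft_blas, plus ncclCommPeft) line up with the "restore multi-stream" item in the commit message: PEFT work gets its own handle pair so finetuning kernels can be issued on a separate stream from inference. A minimal sketch of how such a pair might be initialized (the helper and the stream argument are illustrative, not from this diff; error checking omitted):

#include "flexflow/config.h"
#include <cublas_v2.h>
#include <cudnn.h>
#include <cuda_runtime.h>

// Hypothetical helper: bind the PEFT handles to their own non-blocking stream
// so finetuning GEMMs can overlap inference work on the default handles.
void create_peft_handles(FlexFlow::FFHandler *handler,
                         cudaStream_t *peft_stream) {
  cudaStreamCreateWithFlags(peft_stream, cudaStreamNonBlocking);
  cublasCreate(&handler->peft_blas);
  cublasSetStream(handler->peft_blas, *peft_stream);
  cudnnCreate(&handler->peft_dnn);
  cudnnSetStream(handler->peft_dnn, *peft_stream);
}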
(Diffs for the remaining changed files are not shown.)
