
Commit

update
goliaro committed Feb 21, 2025
1 parent 6256118 commit 40a0a53
Showing 19 changed files with 472 additions and 283 deletions.
14 changes: 14 additions & 0 deletions include/flexflow/flexflow_c.h
@@ -92,6 +92,8 @@ int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_);
int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_);

bool flexflow_config_get_enable_peft(flexflow_config_t handle_);
bool flexflow_config_get_enable_peft_finetuning(flexflow_config_t handle_);
void flexflow_config_set_enable_peft_finetuning(flexflow_config_t handle_, bool value);

void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_,
int value);
@@ -1029,12 +1031,24 @@ void flexflow_request_manager_set_max_sequence_length(
int flexflow_request_manager_get_max_sequence_length(
flexflow_request_manager_t handle_);

void flexflow_request_manager_set_max_finetuning_sequence_length(
flexflow_request_manager_t handle_, int max_seq_length);

int flexflow_request_manager_get_max_finetuning_sequence_length(
flexflow_request_manager_t handle_);

void flexflow_request_manager_set_max_concurrent_adapters(
flexflow_request_manager_t handle_, int max_concurrent_adapters);

void flexflow_request_manager_set_enable_peft_finetuning(
flexflow_request_manager_t handle_, bool enable_peft_finetuning_);

void flexflow_request_manager_set_num_transformers_layers(
flexflow_request_manager_t handle_, int num_transformers_layers_);

void flexflow_request_manager_set_num_layers_per_finetuning_step(
flexflow_request_manager_t handle_, int num_layers_per_finetuning_step_);

void flexflow_request_manager_register_tokenizer(
flexflow_request_manager_t handle_,
enum ModelType model_type,
12 changes: 7 additions & 5 deletions include/flexflow/ops/kernels/linear_kernels.h
@@ -66,9 +66,7 @@ void peft_bwd_kernel_wrapper(LinearMeta const *m,
void *output_grad_ptr,
void const *kernel_ptr,
int in_dim,
int out_dim,
int num_infr_tokens,
int num_peft_tokens);
int out_dim);
void backward_kernel_wrapper(LinearMeta const *m,
void const *input_ptr,
void *input_grad_ptr,
@@ -94,15 +92,19 @@ void forward_kernel(LinearMeta const *m,
int batch_size,
ffStream_t stream);
template <typename DT>
void store_peft_activations(LinearMeta const *m,
BatchConfig const *bc,
size_t out_dim,
DT *output_ptr,
cudaStream_t stream);
template <typename DT>
void peft_bwd_kernel(LinearMeta const *m,
BatchConfig const *bc,
void *input_grad_ptr,
void *output_grad_ptr,
void const *kernel_ptr,
int in_dim,
int out_dim,
int num_infr_tokens,
int num_peft_tokens,
ffStream_t stream);
template <typename DT>
void backward_kernel(LinearMeta const *m,
10 changes: 9 additions & 1 deletion include/flexflow/ops/kernels/softmax_kernels.h
@@ -28,6 +28,7 @@ class SoftmaxMeta : public OpMeta {
// PEFT related fields
Realm::RegionInstance reserveInst;
void *output_grad_ptr = nullptr;
BatchConfig::TokenId peft_token_ids[BatchConfig::MAX_NUM_TOKENS];
size_t allocated_peft_buffer_size = 0;
};

@@ -42,7 +43,7 @@ void backward_kernel_wrapper(SoftmaxMeta const *m,
GenericTensorAccessorW const &input_grad,
GenericTensorAccessorR const &output_grad);

void inference_kernel_wrapper(SoftmaxMeta const *m,
void inference_kernel_wrapper(SoftmaxMeta *m,
BatchConfig const *bc,
bool is_last_op,
GenericTensorAccessorR const &input,
@@ -81,6 +82,13 @@ void peft_bwd_kernel(SoftmaxMeta const *m,
int num_classes,
ffStream_t stream);

template <typename DT>
void store_peft_activations(SoftmaxMeta *m,
BatchConfig const *bc,
int num_classes,
DT *output_ptr,
cudaStream_t stream);

} // namespace Internal
} // namespace Softmax
} // namespace Kernels
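
The new store_peft_activations declaration, together with the peft_token_ids array added to SoftmaxMeta, suggests that the softmax outputs (and token ids) belonging to the finetuning request are copied aside during the forward pass so the PEFT backward pass can consume them later. Below is a conceptual sketch of that bookkeeping in plain Python/NumPy; it is purely illustrative, not the CUDA implementation, and the buffer layout is an assumption.

import numpy as np

def store_peft_activations_sketch(output, peft_token_indices, peft_buffer):
    # output: (num_tokens, num_classes) softmax result for the whole batch
    # peft_token_indices: batch positions belonging to the finetuning request
    # peft_buffer: (max_peft_tokens, num_classes) preallocated storage
    #              (cf. allocated_peft_buffer_size in SoftmaxMeta)
    n = len(peft_token_indices)
    peft_buffer[:n] = output[peft_token_indices]
    return peft_buffer
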
4 changes: 2 additions & 2 deletions inference/python/ff_peft.py
@@ -41,14 +41,14 @@ def get_configs():
# Define sample configs
ff_init_configs = {
# required parameters
"num_gpus": 4,
"num_gpus": 1,
"memory_per_gpu": 14000,
"zero_copy_memory_per_node": 10000,
# optional parameters
"num_cpus": 4,
"legion_utility_processors": 4,
"data_parallelism_degree": 1,
"tensor_parallelism_degree": 4,
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 1,
"offload": False,
"offload_reserve_space_size": 8 * 1024, # 8GB
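
This sample config now runs on a single GPU, so tensor_parallelism_degree drops from 4 to 1 along with num_gpus. A small sanity-check sketch of the (assumed) constraint that the parallelism degrees must multiply to the GPU count; the exact rule enforced by FlexFlow may differ:

configs = {
    "num_gpus": 1,
    "data_parallelism_degree": 1,
    "tensor_parallelism_degree": 1,
    "pipeline_parallelism_degree": 1,
}
product = (
    configs["data_parallelism_degree"]
    * configs["tensor_parallelism_degree"]
    * configs["pipeline_parallelism_degree"]
)
assert product == configs["num_gpus"], "parallelism degrees should multiply to the GPU count"
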
28 changes: 28 additions & 0 deletions python/flexflow/core/flexflow_cffi.py
@@ -815,6 +815,16 @@ def python_data_loader_type(self):
@property
def enable_peft(self):
return ffc().flexflow_config_get_enable_peft(self.handle)

@property
def enable_peft_finetuning(self):
return ffc().flexflow_config_get_enable_peft_finetuning(self.handle)

@enable_peft_finetuning.setter
def enable_peft_finetuning(self, value):
if type(value) is not bool:
raise ValueError("enable_peft_finetuning must be specified as a boolean value")
ffc().flexflow_config_set_enable_peft_finetuning(self.handle, value)

@property
def cpu_offload(self):
@@ -1634,6 +1644,24 @@ def set_max_sequence_length(self, max_length):
def get_max_sequence_length(self):
return ffc().flexflow_request_manager_get_max_sequence_length(self.handle)

def set_max_finetuning_sequence_length(self, max_length):
return ffc().flexflow_request_manager_set_max_finetuning_sequence_length(
self.handle, max_length
)

def get_max_finetuning_sequence_length(self):
return ffc().flexflow_request_manager_get_max_finetuning_sequence_length(self.handle)

def set_num_transformers_layers(self, num_layers):
return ffc().flexflow_request_manager_set_num_transformers_layers(
self.handle, num_layers
)
def set_num_layers_per_finetuning_step(self, num_layers):
return ffc().flexflow_request_manager_set_num_layers_per_finetuning_step(
self.handle, num_layers
)


def set_max_concurrent_adapters(self, max_adapters):
return ffc().flexflow_request_manager_set_max_concurrent_adapters(
self.handle, max_adapters
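
The new FFConfig.enable_peft_finetuning property and RequestManager setters above wrap the C entry points added in flexflow_c.h. A minimal sketch of calling them directly, assuming the FlexFlow runtime has already been initialized; the import path and the numeric values are illustrative:

from flexflow.core import FFConfig, RequestManager  # assumed re-exports

ffconfig = FFConfig()
ffconfig.enable_peft_finetuning = True       # -> flexflow_config_set_enable_peft_finetuning

rm = RequestManager()
rm.set_max_finetuning_sequence_length(1024)  # -> flexflow_request_manager_set_max_finetuning_sequence_length
rm.set_num_transformers_layers(32)           # total transformer layers in the model
rm.set_num_layers_per_finetuning_step(4)     # backward layers to run per finetuning step
print(rm.get_max_finetuning_sequence_length())
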
26 changes: 26 additions & 0 deletions python/flexflow/serve/serve.py
@@ -444,6 +444,8 @@ def compile(
max_tokens_per_batch: int = 64,
max_concurrent_adapters: int = 1,
enable_peft_finetuning: bool = False,
max_finetuning_seq_length: int = -1,
num_bwd_layers_per_ft_step: int = -1,
ssms: list = [],
):
"""Compile the LLM for inference and load the weights into memory
Expand All @@ -460,6 +462,10 @@ def compile(
:type max_concurrent_adapters: int, optional
:param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False
:type enable_peft_finetuning: bool, optional
:param max_finetuning_seq_length: The maximum sequence length to allow for finetuning, defaults to -1 (i.e. same as max_seq_length)
:type max_finetuning_seq_length: int, optional
:param num_bwd_layers_per_ft_step: The number of backward layers to run per finetuning step, defaults to -1 (i.e. all layers)
:type num_bwd_layers_per_ft_step: int, optional
:param ssms: The SSMs to use when operating in speculative inference mode, defaults to []
:type ssms: list, optional
"""
@@ -479,6 +485,7 @@
mode = InferenceMode.INC_DECODING_MODE

self.max_seq_length = max_seq_length
self.ffconfig.enable_peft_finetuning = enable_peft_finetuning

# Create request manager and set serving configuration
self.rm = RequestManager()
@@ -487,6 +494,17 @@
self.rm.set_max_sequence_length(max_seq_length)
self.rm.set_max_concurrent_adapters(max_concurrent_adapters)
self.rm.set_enable_peft_finetuning(enable_peft_finetuning)
if max_finetuning_seq_length == -1:
self.rm.set_max_finetuning_sequence_length(max_seq_length)
else:
self.rm.set_max_finetuning_sequence_length(max_finetuning_seq_length)
self.rm.set_num_transformers_layers(self.hf_config.num_hidden_layers)
if num_bwd_layers_per_ft_step != -1:
self.rm.set_num_layers_per_finetuning_step(num_bwd_layers_per_ft_step)
else:
self.rm.set_num_layers_per_finetuning_step(
self.hf_config.num_hidden_layers
)

# Instantiate the relevant model
self.model = self.model_class(
@@ -753,6 +771,8 @@ def compile(
max_tokens_per_batch: int = 2048,
max_concurrent_adapters: int = 1,
enable_peft_finetuning: bool = False,
max_finetuning_seq_length: int = -1,
num_bwd_layers_per_ft_step: int = -1,
ssms: list = [],
):
"""Compile the SSM for inference and load the weights into memory
Expand All @@ -768,6 +788,10 @@ def compile(
:type max_concurrent_adapters: int, optional
:param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False
:type enable_peft_finetuning: bool, optional
:param max_finetuning_seq_length: The maximum sequence length to allow for finetuning, defaults to -1 (i.e. same as max_seq_length)
:type max_finetuning_seq_length: int, optional
:param num_bwd_layers_per_ft_step: The number of backward layers to run per finetuning step, defaults to -1 (i.e. all layers)
:type num_bwd_layers_per_ft_step: int, optional
:param ssms: The SSMs to use when operating in speculative inference mode, defaults to []
:type ssms: list, optional
"""
@@ -778,5 +802,7 @@
max_tokens_per_batch,
max_concurrent_adapters,
enable_peft_finetuning,
max_finetuning_seq_length,
num_bwd_layers_per_ft_step,
ssms,
)
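
A short usage sketch of the two new compile() keyword arguments, following the docstring above. It assumes flexflow.serve has already been initialized with a valid configs dict; the model name and numeric values are placeholders:

import flexflow.serve as ff

# ff.init(...) is assumed to have been called with a valid configs dict.
llm = ff.LLM("meta-llama/Llama-2-7b-hf")  # placeholder model
llm.compile(
    max_seq_length=2048,
    max_tokens_per_batch=128,
    enable_peft_finetuning=True,
    max_finetuning_seq_length=1024,  # -1 (default) means: same as max_seq_length
    num_bwd_layers_per_ft_step=4,    # -1 (default) means: run all layers each step
)
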
37 changes: 37 additions & 0 deletions src/c/flexflow_c.cc
@@ -177,6 +177,14 @@ bool flexflow_config_get_enable_peft(flexflow_config_t handle_) {
FFConfig *handle = FFCObjectWrapper::unwrap(handle_);
return handle->enable_peft;
}
bool flexflow_config_get_enable_peft_finetuning(flexflow_config_t handle_) {
FFConfig *handle = FFCObjectWrapper::unwrap(handle_);
return handle->enable_peft_finetuning;
}
void flexflow_config_set_enable_peft_finetuning(flexflow_config_t handle_, bool value) {
FFConfig *handle = FFCObjectWrapper::unwrap(handle_);
handle->enable_peft_finetuning = value;
}

int flexflow_config_get_python_data_loader_type(flexflow_config_t handle_) {
FFConfig *handle = FFCObjectWrapper::unwrap(handle_);
@@ -2776,6 +2784,19 @@ int flexflow_request_manager_get_max_sequence_length(
return handle->get_max_sequence_length();
}

void flexflow_request_manager_set_max_finetuning_sequence_length(
flexflow_request_manager_t handle_, int max_seq_length) {
RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
handle->set_max_finetuning_sequence_length(max_seq_length);
DEBUG_PRINT("[RequestManager] set max finetuning sequence length %d", max_seq_length);
}

int flexflow_request_manager_get_max_finetuning_sequence_length(
flexflow_request_manager_t handle_) {
RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
return handle->get_max_finetuning_sequence_length();
}

void flexflow_request_manager_set_max_concurrent_adapters(
flexflow_request_manager_t handle_, int max_concurrent_adapters) {
RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
@@ -2792,6 +2813,22 @@ void flexflow_request_manager_set_enable_peft_finetuning(
enable_peft_finetuning_);
}

void flexflow_request_manager_set_num_transformers_layers(
flexflow_request_manager_t handle_, int num_transformers_layers_) {
RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
handle->set_num_transformer_layers(num_transformers_layers_);
DEBUG_PRINT("[RequestManager] set num_transformers_layers %d",
num_transformers_layers_);
}

void flexflow_request_manager_set_num_layers_per_finetuning_step(
flexflow_request_manager_t handle_, int num_layers_per_finetuning_step_) {
RequestManager *handle = FFCObjectWrapper::unwrap(handle_);
handle->set_num_layers_per_finetuning_step(num_layers_per_finetuning_step_);
DEBUG_PRINT("[RequestManager] set num layers per finetuning step %d",
num_layers_per_finetuning_step_);
}

void flexflow_request_manager_register_tokenizer(
flexflow_request_manager_t handle_,
enum ModelType model_type,
4 changes: 1 addition & 3 deletions src/ops/fused.cu
@@ -840,9 +840,7 @@ __host__ bool FusedOp::peft_bwd_task(Task const *task,
my_output_grad_accessor[0].ptr,
my_weight_accessor[0].ptr,
in_dim,
out_dim,
num_infr_tokens,
num_peft_tokens);
out_dim);
break;
}
case OP_LORA: {
(remaining changed files not shown)
