diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
index a509af765..29915bf2d 100644
--- a/include/flexflow/batch_config.h
+++ b/include/flexflow/batch_config.h
@@ -44,6 +44,11 @@ struct OptimizerTasks {
   bool save_updated_weights = false;
 };
 
+struct NewPeftModelPath {
+  PEFTModelID peft_model_id;
+  std::string filepath;
+};
+
 void set_optimizer_tasks(OptimizerTasks &tasks,
                          int max_training_steps,
                          int completed_training_steps,
@@ -135,6 +140,7 @@ class BatchConfig {
   PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
   PerTokenInfo tokensInfo[MAX_NUM_TOKENS];
   PerTokenInfo labelsInfo[MAX_NUM_TOKENS];
+  NewPeftModelPath new_peft_model_paths[MAX_NUM_REQUESTS];
   bool request_completed[MAX_NUM_REQUESTS];
   bool request_running[MAX_NUM_REQUESTS];
 
diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h
index 1cdeb65aa..2d8e5360d 100644
--- a/include/flexflow/ops/lora_linear_params.h
+++ b/include/flexflow/ops/lora_linear_params.h
@@ -17,7 +17,10 @@ namespace FlexFlow {
 class LoraOptimizerConfig {
 public:
   LoraOptimizerConfig();
-  virtual ~LoraOptimizerConfig() {}
+  virtual std::string getType() const = 0;
+  virtual nlohmann::json toJson() const = 0;
+  static std::unique_ptr<LoraOptimizerConfig> fromJson(const nlohmann::json& j);
+  virtual ~LoraOptimizerConfig() = default;
 };
 
 class LoraSGDOptimizerConfig : public LoraOptimizerConfig {
@@ -29,9 +32,25 @@ class LoraSGDOptimizerConfig : public LoraOptimizerConfig {
                          bool weight_decay_ = 0.0f);
   friend std::ostream &operator<<(std::ostream &os,
                                   LoraSGDOptimizerConfig const &llc);
-
-  NLOHMANN_DEFINE_TYPE_INTRUSIVE(
-      LoraSGDOptimizerConfig, lr, momentum, nesterov, weight_decay)
+
+  std::string getType() const override { return "SGD"; }
+
+  nlohmann::json toJson() const override {
+    return {{"type", "SGD"},
+            {"lr", lr},
+            {"momentum", momentum},
+            {"nesterov", nesterov},
+            {"weight_decay", weight_decay}};
+  }
+
+  static std::unique_ptr<LoraSGDOptimizerConfig> fromJson(const nlohmann::json& j) {
+    auto sgd = std::make_unique<LoraSGDOptimizerConfig>();
+    sgd->lr = j["lr"];
+    sgd->momentum = j["momentum"];
+    sgd->nesterov = j["nesterov"];
+    sgd->weight_decay = j["weight_decay"];
+    return sgd;
+  }
 
 public:
   double lr = 0.001f;
@@ -50,9 +69,27 @@ class LoraAdamOptimizerConfig : public LoraOptimizerConfig {
                           double epsilon_ = 1e-8);
   friend std::ostream &operator<<(std::ostream &os,
                                   LoraAdamOptimizerConfig const &llc);
-
-  NLOHMANN_DEFINE_TYPE_INTRUSIVE(
-      LoraAdamOptimizerConfig, alpha, beta1, beta2, weight_decay, epsilon)
+
+  std::string getType() const override { return "Adam"; }
+
+  nlohmann::json toJson() const override {
+    return {{"type", "Adam"},
+            {"alpha", alpha},
+            {"beta1", beta1},
+            {"beta2", beta2},
+            {"weight_decay", weight_decay},
+            {"epsilon", epsilon}};
+  }
+
+  static std::unique_ptr<LoraAdamOptimizerConfig> fromJson(const nlohmann::json& j) {
+    auto adam = std::make_unique<LoraAdamOptimizerConfig>();
+    adam->alpha = j["alpha"];
+    adam->beta1 = j["beta1"];
+    adam->beta2 = j["beta2"];
+    adam->weight_decay = j["weight_decay"];
+    adam->epsilon = j["epsilon"];
+    return adam;
+  }
 
 public:
   // Adam
@@ -63,13 +100,13 @@ class LoraAdamOptimizerConfig {
   double epsilon = 1e-8;
 };
 
-// Serialization helpers
-template <typename T>
-void serialize_to_json_file(T const &obj, fs::path const &filepath);
+std::unique_ptr<LoraOptimizerConfig> LoraOptimizerConfig::fromJson(const nlohmann::json& j) {
+  std::string type = j["type"];
+  if (type == "SGD") return LoraSGDOptimizerConfig::fromJson(j);
+  if (type == "Adam") return LoraAdamOptimizerConfig::fromJson(j);
+  throw std::runtime_error("Unknown optimizer type");
+}
 
-// Function to deserialize JSON from file and create object
-template <typename T>
-std::unique_ptr<T> deserialize_from_json_file(fs::path const &filepath);
 
 class LoraLinearConfig {
 public:
@@ -87,22 +124,54 @@ class LoraLinearConfig {
                    std::vector<std::string> const &target_modules_ = {});
   // constructor used to support std::unordered_map
   LoraLinearConfig();
+  template <typename T>
+  void setOptimizer(T&& opt) {
+    optimizer_config = std::make_unique<T>(std::forward<T>(opt));
+  }
   friend bool operator==(LoraLinearConfig const &lhs,
                          LoraLinearConfig const &rhs);
   friend std::ostream &operator<<(std::ostream &os,
                                   LoraLinearConfig const &llc);
-
-  NLOHMANN_DEFINE_TYPE_INTRUSIVE(LoraLinearConfig,
-                                 cache_folder,
-                                 peft_model_id,
-                                 rank,
-                                 lora_alpha,
-                                 lora_dropout,
-                                 target_modules,
-                                 trainable,
-                                 init_lora_weights,
-                                 base_model_name_or_path,
-                                 precision)
+  void serialize_to_json_file(const std::string& filename) const {
+    json j = {
+        {"cache_folder", cache_folder},
+        {"peft_model_id", peft_model_id},
+        {"rank", rank},
+        {"lora_alpha", lora_alpha},
+        {"lora_dropout", lora_dropout},
+        {"target_modules", target_modules},
+        {"trainable", trainable},
+        {"init_lora_weights", init_lora_weights},
+        {"base_model_name_or_path", base_model_name_or_path},
+        {"precision", precision},
+        {"optimizer_config", optimizer_config ? optimizer_config->toJson() : nullptr}
+    };
+
+    std::ofstream file(filename);
+    file << j.dump(4); // Use 4 spaces for indentation
+  }
+  // Deserialization method
+  static LoraLinearConfig deserialize_from_json_file(const std::string& filename) {
+    std::ifstream file(filename);
+    json j;
+    file >> j;
+    LoraLinearConfig metadata(
+        j["cache_folder"].get<std::string>(),
+        j["peft_model_id"].get<std::string>(),
+        j["rank"].get<int>(),
+        j["lora_alpha"].get<float>(),
+        j["lora_dropout"].get<float>(),
+        j["target_modules"].get<std::vector<std::string>>(),
+        j["trainable"].get<bool>(),
+        j["init_lora_weights"].get<bool>(),
+        j["base_model_name_or_path"].get<std::string>(),
+        j["precision"].get<std::string>()
+    );
+    if (!j["optimizer_config"].is_null()) {
+      metadata.optimizer_config = LoraOptimizerConfig::fromJson(j["optimizer_config"]);
+    }
+    return metadata;
+  }
 
   std::string cache_folder;
   // Huggingface model ID (for download and/or upload)
@@ -116,7 +185,8 @@ class LoraLinearConfig {
   // whether the weights are trainable (fine-tuning scenario) or not
   // (inference-only). If set to true, allocate space for the gradients
   bool trainable = false;
-  LoraOptimizerConfig *optimizer_config;
+  // LoraOptimizerConfig *optimizer_config;
+  std::unique_ptr<LoraOptimizerConfig> optimizer_config;
   // whether to initialize weights randomly (instead of attempting to load them
   // from file)
   bool init_lora_weights;
diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h
index 902865694..7c1bd01ea 100644
--- a/include/flexflow/utils/peft_weight_allocator.h
+++ b/include/flexflow/utils/peft_weight_allocator.h
@@ -95,7 +95,7 @@ class PEFTMemoryManager {
       : max_concurrent_adapters(max_concurrent_adapters_),
         max_lora_size(max_lora_size_), base_ptr(nullptr) {}
   // allocate memory for all the PEFT adapters for a given layer on a given shard
-  void allocate_memory(Memory gpu_mem) {
+  void allocate_inference_memory(Memory gpu_mem) {
     // allocate chunk of memory for all the PEFT adapters
     Realm::Rect<1, coord_t> bounds(
         Realm::Point<1, coord_t>(0),
@@ -111,6 +111,9 @@ class PEFTMemoryManager {
         .wait();
     base_ptr = peftLegionInst.pointer_untyped(0, sizeof(char));
   }
 
+  void allocate_finetuning_memory(Memory gpu_mem) {
+
+  }
   // Returns the slot in memory where the peft model weights are/will be stored.
   // If the model is not in memory (cache miss), set the cache_miss flag to true.
@@ -160,7 +163,7 @@
   int max_concurrent_adapters;
   size_t max_lora_size;
   Realm::RegionInstance peftLegionInst;
-  void *base_ptr;
+  void *base_ptr; void *finetuning_ptr;
   std::unordered_map<PEFTModelID, int> lru_hashtable;
   std::vector<PEFTModelID> lru_list; // head = least recently used, tail=most recently used
   std::unordered_map<PEFTModelID, int> peft2mem_slot;
diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc
index 0277c008c..f4c1ba9c3 100644
--- a/src/ops/lora_linear.cc
+++ b/src/ops/lora_linear.cc
@@ -519,12 +519,17 @@ OpMeta *LoraLinear::init_task(Task const *task,
   std::string lora_layername_substr =
       lora_layername.substr(0, found + searchString.length());
+  // allocate space for lora weights
   size_t max_lora_size =
       data_type_size(dt) * (lora->max_rank * in_dim + lora->max_rank * out_dim);
   m->peft_memory_manager =
       new PEFTMemoryManager(max_lora_size, lora->max_concurrent_adapters);
   Memory gpu_mem = get_proc_mem(Machine::get_machine(), task->target_proc);
-  m->peft_memory_manager->allocate_memory(gpu_mem);
+  m->peft_memory_manager->allocate_inference_memory(gpu_mem);
 
-  for (auto const &kv : lora->peft_configs) {
+  return m;
+}
+
+void load_peft_adapters(BatchConfig const *bc) {
+  for (auto const &kv : bc->peft_configs) {
     PEFTModelID const &model_id = kv.first;
     LoraLinearConfig const &lora_config = kv.second;
@@ -680,7 +685,6 @@ OpMeta *LoraLinear::init_task(Task const *task,
     m->model_state[model_id].cache_folder = lora_config.cache_folder;
     m->model_state[model_id].peft_model_id = lora_config.peft_model_id;
   }
-  return m;
 }
 
 void LoraLinear::forward(FFModel const &ff) {
diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc
index 6e0c60e05..310b6d097 100644
--- a/src/ops/lora_linear_params.cc
+++ b/src/ops/lora_linear_params.cc
@@ -50,38 +50,6 @@ std::ostream &operator<<(std::ostream &os,
                          LoraAdamOptimizerConfig const &llc) {
   return os;
 }
-// Serialization helpers
-template <typename T>
-void serialize_to_json_file(T const &obj, fs::path const &filepath) {
-  json j = obj;
-  std::ofstream file(filepath);
-  file << j.dump(4);
-}
-
-template <typename T>
-std::unique_ptr<T> deserialize_from_json_file(fs::path const &filepath) {
-  std::ifstream file(filepath);
-  json j;
-  file >> j;
-  return std::make_unique<T>(j.get<T>());
-}
-
-template void
-    serialize_to_json_file<LoraLinearConfig>(LoraLinearConfig const &obj,
-                                             fs::path const &filepath);
-template void serialize_to_json_file<LoraSGDOptimizerConfig>(
-    LoraSGDOptimizerConfig const &obj, fs::path const &filepath);
-template void serialize_to_json_file<LoraAdamOptimizerConfig>(
-    LoraAdamOptimizerConfig const &obj, fs::path const &filepath);
-template std::unique_ptr<LoraLinearConfig>
-    deserialize_from_json_file<LoraLinearConfig>(fs::path const &filepath);
-template std::unique_ptr<LoraSGDOptimizerConfig>
-    deserialize_from_json_file<LoraSGDOptimizerConfig>(
-        fs::path const &filepath);
-template std::unique_ptr<LoraAdamOptimizerConfig>
-    deserialize_from_json_file<LoraAdamOptimizerConfig>(
-        fs::path const &filepath);
-
 // ------------------ LoRA configs -------------------
 // ---------------------------------------------------
 const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig("", "");