From 721405f7ce927e50896d258ef87273fd3c6d65c4 Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Thu, 19 Sep 2024 18:43:04 +0800
Subject: [PATCH] skip 7 layers

---
 examples/llava/minicpmv-cli.cpp | 55 +++++++++++++++++++++++----------
 llama.cpp                       | 22 ++++++-------
 2 files changed, 49 insertions(+), 28 deletions(-)

diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index e58b2643242150..8cdfdafbe08ee9 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -53,6 +53,22 @@ static struct llama_model * llava_init(gpt_params * params) {
     return model;
 }
 
+static void load_model2(struct llava_context * ctx, gpt_params * params) {
+    llama_model * model2 = nullptr;
+    if (params->skip_model.size() > 0 && params->skip_layers > 0) {
+        // load the model that supplies the skipped (last) layers
+        llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+        model_params.init_time = true;
+        model_params.has_vocab = false;
+        //llama_model * model2 = llama_load_model_from_file(params->model.c_str(), model_params);
+        //llama_model * model2 = llama_load_model_from_file("/Users/zkh/Downloads/last_16/ggml-model-Q4_0.gguf", model_params);
+        model2 = llama_load_model_from_file(params->skip_model.c_str(), model_params);
+        llama_set_model_skip_layers(model2, params->skip_layers);
+        //llama_add_model_load_times(model, model2);
+        llama_set_model2(ctx->ctx_llama, model2);
+    }
+}
+
 static struct llava_context * llava_init_context(gpt_params * params) {
     auto model = llava_init(params);
     if (model == NULL) {
@@ -76,18 +92,18 @@ static struct llava_context * llava_init_context(gpt_params * params) {
         ctx_params.n_ctx = params->n_ctx;
     }
 
-    llama_model * model2 = nullptr;
-    if(params->skip_model.size() > 0 && params->skip_layers > 0) {
-        //load last model
-        llama_model_params model_params = llama_model_params_from_gpt_params(*params);
-        model_params.init_time = false;
-        model_params.has_vocab = false;
-        //llama_model * model2 = llama_load_model_from_file(params->model.c_str(), model_params);
-        //llama_model * model2 = llama_load_model_from_file("/Users/zkh/Downloads/last_16/ggml-model-Q4_0.gguf", model_params);
-        model2 = llama_load_model_from_file(params->skip_model.c_str(), model_params);
-        llama_set_model_skip_layers(model2, params->skip_layers);
-        //llama_add_model_load_times(model, model2);
-    }
+    // llama_model * model2 = nullptr;
+    // if(params->skip_model.size() > 0 && params->skip_layers > 0) {
+    //     //load last model
+    //     llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+    //     model_params.init_time = false;
+    //     model_params.has_vocab = false;
+    //     //llama_model * model2 = llama_load_model_from_file(params->model.c_str(), model_params);
+    //     //llama_model * model2 = llama_load_model_from_file("/Users/zkh/Downloads/last_16/ggml-model-Q4_0.gguf", model_params);
+    //     model2 = llama_load_model_from_file(params->skip_model.c_str(), model_params);
+    //     llama_set_model_skip_layers(model2, params->skip_layers);
+    //     //llama_add_model_load_times(model, model2);
+    // }
 
     llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
 
@@ -96,9 +112,9 @@ static struct llava_context * llava_init_context(gpt_params * params) {
         return NULL;
     }
 
-    if(params->skip_model.size() > 0 && params->skip_layers > 0) {
-        llama_set_model2(ctx_llama, model2);
-    }
+    // if(params->skip_model.size() > 0 && params->skip_layers > 0) {
+    //     llama_set_model2(ctx_llama, model2);
+    // }
 
     for (unsigned int i = 0; i < params->lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params->lora_adapter[i]);
@@ -347,9 +363,10 @@ int main(int argc, char ** argv) {
 
     if (params.image.size() > 0) {
         auto image = params.image;
-        ctx_llava = minicpmv_init(&params, image, n_past);
+        ctx_llava = minicpmv_init(&params, image, n_past);
         //release vit memory
-        clip_free(ctx_llava->ctx_clip);
+        //clip_free(ctx_llava->ctx_clip);
+        load_model2(ctx_llava, &params);
         if (!params.prompt.empty()) {
             LOG_TEE("%s\n", params.prompt.c_str());
             LOG_TEE("");
@@ -398,7 +415,11 @@ int main(int argc, char ** argv) {
     llama_print_timings(ctx_llava->ctx_llama);
 
     ctx_llava->model = NULL;
+    auto free_start = ggml_time_us();
     llava_free(ctx_llava);
+    auto free_time = ggml_time_us() - free_start;
+    printf("free time: %.4f ms\n", free_time * 1e-3);
+
     // }
 
     return 0;

diff --git a/llama.cpp b/llama.cpp
index ec36da68e12c09..41864401561378 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6767,18 +6767,18 @@ struct llm_build_context {
 
             const llama_model *m = &model;
             int local_il = il;
-            if(il >= n_layer - skip_layers && model2 != nullptr){//TODO: && is_vit
-                m = model2;
-                local_il = skip_idx;
-                skip_idx += 1;
-            }
-            // if(model2 != nullptr){
-            //     auto it = find(skip_list.begin(), skip_list.end(), il);
-            //     if(it != skip_list.end()){
-            //         local_il = it - skip_list.begin();
-            //         m = model2;
-            //     }
+            // if(il >= n_layer - skip_layers && model2 != nullptr){//TODO: && is_vit
+            //     m = model2;
+            //     local_il = skip_idx;
+            //     skip_idx += 1;
             // }
+            if(model2 != nullptr){
+                auto it = find(skip_list.begin(), skip_list.end(), il);
+                if(it != skip_list.end()){
+                    local_il = it - skip_list.begin();
+                    m = model2;
+                }
+            }
 
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
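
Editor's note on the remapping rule the llama.cpp hunk switches to: once a second model is attached via llama_set_model2(), any graph layer whose index il appears in skip_list is built from model2's weights, and because model2 stores only those layers, packed contiguously, the layer index inside model2 is simply the position of il within skip_list. Below is a minimal standalone C++ sketch of just that index mapping, not the patch's actual code path; the layer count and skip_list contents are assumptions for illustration (the commit subject suggests 7 skipped layers).

// Sketch of the skip_list layer remapping (assumed values, not from the patch).
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const int n_layer = 28;                                          // assumed base model depth
    const std::vector<int> skip_list = {21, 22, 23, 24, 25, 26, 27}; // assumed: last 7 layers
    const bool have_model2 = true;                                   // stands in for model2 != nullptr

    for (int il = 0; il < n_layer; ++il) {
        const char * source = "model";   // stands in for: const llama_model *m = &model
        int local_il = il;               // index into whichever model supplies the layer
        if (have_model2) {
            auto it = std::find(skip_list.begin(), skip_list.end(), il);
            if (it != skip_list.end()) {
                // layer il is served by model2; its local index is the
                // position of il within skip_list, as in the patched code
                local_il = (int)(it - skip_list.begin());
                source = "model2";
            }
        }
        std::printf("graph layer %2d -> %s layer %2d\n", il, source, local_il);
    }
    return 0;
}

Compiled and run on its own, this prints which model would serve each graph layer; in the real build_context the same position arithmetic (it - skip_list.begin()) selects the weight tensors from model2.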