save number of parameters and the size in llama_model, fixes #10285 (#10286)

Merged on Nov 16, 2024 (19 commits)
src/llama.cpp: 31 changes (18 additions, 13 deletions)
@@ -2907,9 +2907,15 @@ struct llama_model {
     // for quantize-stats only
     std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
 
-    int64_t t_load_us = 0;
+    int64_t t_load_us  = 0;
     int64_t t_start_us = 0;
 
+    // total number of parameters in the model
+    uint64_t n_elements = 0;
+
+    // total size of all the tensors in the model in bytes
+    size_t n_bytes = 0;
+
     // keep track of loaded lora adapters
     std::set<struct llama_lora_adapter *> lora_adapters;
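For context on where these totals come from: the loader accumulates them once per tensor while the weights are enumerated. Below is a minimal sketch of that accumulation, assuming ggml's `ggml_nelements`/`ggml_nbytes` accessors (the same calls the old per-request loops in `llama_model_size`/`llama_model_n_params` used, see the last hunk); the `sum_tensors` helper is hypothetical, not PR code:

```cpp
// Sketch only, not PR code: accumulate the totals that llm_load_stats
// later copies into llama_model. Compile in-tree so that ggml.h resolves.
#include "ggml.h"

#include <cstddef>
#include <cstdint>
#include <vector>

struct load_stats {
    uint64_t n_elements = 0; // total parameter count
    size_t   n_bytes    = 0; // total tensor payload in bytes
};

// hypothetical helper: walk the tensors once and sum both totals
static load_stats sum_tensors(const std::vector<const ggml_tensor *> & tensors) {
    load_stats s;
    for (const ggml_tensor * t : tensors) {
        s.n_elements += ggml_nelements(t); // elements (parameters) in this tensor
        s.n_bytes    += ggml_nbytes(t);    // bytes, correct for quantized types too
    }
    return s;
}
```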
@@ -4275,8 +4281,8 @@ struct llama_model_loader
     int n_tensors = 0;
     int n_created = 0;
 
-    int64_t n_elements = 0;
-    size_t  n_bytes    = 0;
+    uint64_t n_elements = 0;
+    size_t   n_bytes    = 0;
 
     bool use_mmap = false;
     bool check_tensors;
@@ -5344,6 +5350,11 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     }
 }
 
+static void llm_load_stats(llama_model_loader & ml, llama_model & model) {
+    model.n_elements = ml.n_elements;
+    model.n_bytes = ml.n_bytes;
+}
+
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
     model.arch = ml.get_arch();
     if (model.arch == LLM_ARCH_UNKNOWN) {
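The helper is a plain copy from the loader into the model; the next two hunks call it from the regular load path and the quantize path. A self-contained sketch of that pattern with stub types (hypothetical, for illustration only):

```cpp
// Sketch with stub types standing in for llama_model_loader / llama_model:
// however the loader's totals get populated, one llm_load_stats call makes
// them visible on the model before any consumer reads them.
#include <cstdint>
#include <cstdio>

struct llama_model_loader { uint64_t n_elements = 0; size_t n_bytes = 0; };
struct llama_model        { uint64_t n_elements = 0; size_t n_bytes = 0; };

static void llm_load_stats(llama_model_loader & ml, llama_model & model) {
    model.n_elements = ml.n_elements;
    model.n_bytes    = ml.n_bytes;
}

int main() {
    llama_model_loader ml;
    ml.n_elements = 7241000000ULL; // illustrative values for a ~7B model
    ml.n_bytes    = 4370000000ULL; // assumes a 64-bit size_t

    llama_model model;
    llm_load_stats(ml, model); // same call in the load and quantize paths

    printf("%llu params, %zu bytes\n",
           (unsigned long long) model.n_elements, model.n_bytes);
    return 0;
}
```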
@@ -9256,6 +9267,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
     }
 
+    llm_load_stats(ml, model);
     llm_load_print_meta(ml, model);
 
     if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
@@ -18601,6 +18613,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llama_model model;
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
+    llm_load_stats(ml, model);
 
     struct quantize_state_internal qs(model, params);
 
@@ -19953,19 +19966,11 @@ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t bu
 }
 
 uint64_t llama_model_size(const struct llama_model * model) {
-    uint64_t size = 0;
-    for (const auto & it : model->tensors_by_name) {
-        size += ggml_nbytes(it.second);
-    }
-    return size;
+    return model->n_bytes;
 }
 
 uint64_t llama_model_n_params(const struct llama_model * model) {
-    uint64_t nparams = 0;
-    for (const auto & it : model->tensors_by_name) {
-        nparams += ggml_nelements(it.second);
-    }
-    return nparams;
+    return model->n_elements;
 }
 
 struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
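With the totals cached at load time, both accessors become O(1) field reads instead of loops over `tensors_by_name` on every call. A usage sketch against the public C API as it existed at merge time (the model path is a placeholder; assumes a build that includes this change):

```cpp
// Sketch: query the cached totals through llama.h. "model.gguf" is a
// placeholder path; error handling is minimal on purpose.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    // Both calls now return fields filled in by llm_load_stats at load time.
    printf("parameters: %llu\n", (unsigned long long) llama_model_n_params(model));
    printf("size      : %llu bytes\n", (unsigned long long) llama_model_size(model));

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```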