
Commit

wip
ggerganov committed Aug 15, 2023
1 parent 6c63550 commit 2e07b99
Showing 3 changed files with 59 additions and 76 deletions.
10 changes: 5 additions & 5 deletions ggml.h
@@ -1744,12 +1744,12 @@ extern "C" {
GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
GGML_API void * gguf_get_data (struct gguf_context * ctx);

GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);

GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
GGML_API enum gguf_type gguf_get_arr_type (struct gguf_context * ctx, int i);
GGML_API void gguf_get_val (struct gguf_context * ctx, int i, void * val);
GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);

GGML_API const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i);
GGML_API float gguf_get_arr_f32(struct gguf_context * ctx, int key_id, int i);
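The declarations above are the key-value half of the GGUF C API. Below is a minimal sketch of how they might be strung together to dump a file's metadata. It assumes the gguf_init_from_file / gguf_init_params usage shown in the gguf-llama.cpp changes further down, a matching gguf_free for cleanup, and the typed getter gguf_get_val_str; treat it as illustrative rather than the exact surface of this revision.

```cpp
#include <cstdio>

#include "ggml.h"

// Illustrative only: enumerate the key-value pairs stored in a GGUF file.
static void dump_gguf_kv(const char * fname) {
    struct ggml_context * ctx_data = NULL;

    struct gguf_init_params params = {
        /*.no_alloc = */ true,   // metadata only, do not allocate tensor data
        /*.ctx      = */ &ctx_data,
    };

    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load %s\n", fname);
        return;
    }

    const int n_kv = gguf_get_n_kv(ctx);
    for (int i = 0; i < n_kv; i++) {
        const char *         key  = gguf_get_key(ctx, i);
        const enum gguf_type type = gguf_get_kv_type(ctx, i);

        if (type == GGUF_TYPE_STRING) {
            printf("%s = %s\n", key, gguf_get_val_str(ctx, i));
        } else {
            printf("%s (type %d)\n", key, (int) type);
        }
    }

    gguf_free(ctx);
}
```

The same no_alloc = true pattern is what the gguf_file_loader constructor below relies on to read only the metadata before deciding how the tensor data will be mapped.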
124 changes: 53 additions & 71 deletions gguf-llama.cpp
@@ -573,20 +573,19 @@ struct gguf_file_loader {

struct ggml_context * ctx_data = NULL;

gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map)
: file(fname, "rb") {
gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map) : file(fname, "rb") {
fprintf(stderr, "llama.cpp: loading model from %s\n", fname);

struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_data,
};
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_data,
};

gguf_ctx = gguf_init_from_file(fname, params);
file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
gguf_ctx = gguf_init_from_file(fname, params);
file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);

read_hparams();
read_vocab();
read_hparams();
read_vocab();
read_tensor_metadata(tensors_map);
}

@@ -636,18 +635,18 @@ struct gguf_file_loader {

void read_vocab() {
vocab.id_to_token.resize(hparams.n_vocab);
int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");

const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
if (token_idx == -1) {
throw std::runtime_error("cannot find token list in GGUF file\n");
}

int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
const int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
if (score_idx == -1) {
throw std::runtime_error("cannot find token scores list in GGUF file\n");
}

for (uint32_t i = 0; i < hparams.n_vocab; i++) {

std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);

vocab.token_to_id[word] = i;
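The lookup-then-iterate pattern above reduces to a small self-contained sketch. gguf_find_key, gguf_get_arr_str, and gguf_get_arr_f32 come from the ggml.h excerpt above; simple_vocab and the n_vocab parameter are simplified stand-ins for the loader's real types.

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

#include "ggml.h"

// Simplified stand-in for the loader's vocab handling (illustrative only).
struct simple_vocab {
    std::unordered_map<std::string, uint32_t> token_to_id;
    std::vector<std::string>                  id_to_token;
    std::vector<float>                        scores;
};

static simple_vocab read_vocab_sketch(struct gguf_context * ctx, uint32_t n_vocab) {
    const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
    if (token_idx == -1) {
        throw std::runtime_error("cannot find token list in GGUF file");
    }

    const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
    if (score_idx == -1) {
        throw std::runtime_error("cannot find token scores list in GGUF file");
    }

    simple_vocab vocab;
    vocab.id_to_token.resize(n_vocab);
    vocab.scores.resize(n_vocab);

    for (uint32_t i = 0; i < n_vocab; i++) {
        const std::string word = gguf_get_arr_str(ctx, token_idx, i);

        vocab.token_to_id[word] = i;
        vocab.id_to_token[i]    = word;
        vocab.scores[i]         = gguf_get_arr_f32(ctx, score_idx, i);
    }

    return vocab;
}
```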
@@ -786,7 +785,7 @@ struct gguf_file_saver {
gguf_type arr_type;
int n_arr;

switch(vtype) {
switch (vtype) {
case GGUF_TYPE_BOOL:
bool_val = gguf_get_val_bool(fl->gguf_ctx, i);
file.write_val<bool>(key, GGUF_TYPE_BOOL, bool_val);
@@ -809,7 +808,7 @@ struct gguf_file_saver {
break;
case GGUF_TYPE_STRING:
str_val = gguf_get_val_str(fl->gguf_ctx, i);
file.write_val<std::string>(key, GGUF_TYPE_STRING, str_val);
file.write_str(key, GGUF_TYPE_STRING, str_val);
break;
case GGUF_TYPE_UINT16:
u16_val = gguf_get_val_u16(fl->gguf_ctx, i);
@@ -825,7 +824,7 @@ struct gguf_file_saver {
break;
case GGUF_TYPE_ARRAY:
arr_type = gguf_get_arr_type(fl->gguf_ctx, i);
n_arr = gguf_get_arr_n(fl->gguf_ctx, i);
n_arr = gguf_get_arr_n (fl->gguf_ctx, i);
if (arr_type == GGUF_TYPE_FLOAT32) {
write_hparam_arr_f32(key, arr_type, i, n_arr);
} else if (arr_type == GGUF_TYPE_STRING) {
@@ -922,20 +921,6 @@ struct llama_model_loader {
}
}

struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
auto it = tensors_map.name_to_idx.find(name);
if (it == tensors_map.name_to_idx.end()) {
throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
}
gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
if (lt.ne != ne) {
throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
}

return get_tensor_for(lt, backend);
}

struct ggml_tensor * get_tensor_for(gguf_load_tensor & lt, ggml_backend backend) {
struct ggml_tensor * tensor;
if (backend != GGML_BACKEND_CPU) {
@@ -959,16 +944,41 @@ struct llama_model_loader {
return tensor;
}

struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
auto it = tensors_map.name_to_idx.find(name);
if (it == tensors_map.name_to_idx.end()) {
throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
}
gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
if (lt.ne != ne) {
throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
}

return get_tensor_for(lt, backend);
}

void done_getting_tensors() const {
if (num_ggml_tensors_created != tensors_map.tensors.size()) {
throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
}
}

void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
size_t data_size = 0;
void load_data_for(gguf_load_tensor & lt) const {
if (use_mmap) {
lt.data = (uint8_t *) mapping->addr + lt.file_off;
} else {
gguf_file & file = file_loader->file;
file.seek(lt.file_off, SEEK_SET);
file.read_raw(lt.data, lt.size);
}
}

void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
size_t data_size = 0;
size_t prefetch_size = 0;
size_t lock_size = 0;
size_t lock_size = 0;

for (const gguf_load_tensor & lt : tensors_map.tensors) {
data_size += lt.size;
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -1030,31 +1040,6 @@ struct llama_model_loader {
done_size += lt.size;
}
}

void load_data_for(gguf_load_tensor & lt) {
if (use_mmap) {
lt.data = (uint8_t *) mapping->addr + lt.file_off;
} else {
gguf_file & file = file_loader->file;
file.seek(lt.file_off, SEEK_SET);
file.read_raw(lt.data, lt.size);
}

if (0) {
print_checksum(lt);
}
}

static void print_checksum(gguf_load_tensor & lt) {
uint32_t sum = 0;
for (size_t i = 0; i < lt.size; i++) {
uint8_t byte = lt.data[i];
sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
}
fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
llama_format_tensor_shape(lt.ne).c_str(), lt.size);
}

};
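The relocated load_data_for() above either points a tensor straight into the mmap'd file or seeks to its offset and reads the raw bytes. Here is a reduced sketch of that pattern; tensor_slot, load_tensor_data, and mapping_addr are hypothetical stand-ins for gguf_load_tensor, the member function, and the loader's mapping.

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for gguf_load_tensor (illustrative only).
struct tensor_slot {
    size_t    file_off = 0;        // byte offset of the tensor data in the file
    size_t    size     = 0;        // number of bytes
    uint8_t * data     = nullptr;  // must point at a large-enough buffer for the read path
};

static bool load_tensor_data(tensor_slot & lt, bool use_mmap, uint8_t * mapping_addr, FILE * fp) {
    if (use_mmap) {
        // zero-copy: the tensor simply points into the mapped file
        lt.data = mapping_addr + lt.file_off;
        return true;
    }

    // otherwise read the bytes into the caller-provided buffer
    // (a real loader would use a 64-bit seek for files larger than 2 GiB)
    if (fseek(fp, (long) lt.file_off, SEEK_SET) != 0) {
        return false;
    }
    return fread(lt.data, 1, lt.size, fp) == lt.size;
}
```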

//
@@ -1187,15 +1172,15 @@ int64_t llama_time_us() {
// model loading
//

static const char *gguf_file_version_name(gguf_file_version version) {
static const char * gguf_file_version_name(gguf_file_version version) {
switch (version) {
case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
}
}

return "unknown";
}

static const char *llama_ftype_name(enum llama_ftype ftype) {
static const char * llama_ftype_name(enum llama_ftype ftype) {
switch (ftype) {
case LLAMA_FTYPE_ALL_F32: return "all F32";
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1220,10 +1205,10 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
}
}

static const char *llama_model_type_name(e_model type) {
static const char * llama_model_type_name(e_model type) {
switch (type) {
case MODEL_3B: return "3B";
case MODEL_7B: return "7B";
case MODEL_3B: return "3B";
case MODEL_7B: return "7B";
case MODEL_13B: return "13B";
case MODEL_30B: return "30B";
case MODEL_65B: return "65B";
@@ -2996,9 +2981,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
}
}

const auto rejects =
llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
for (auto & reject : rejects) {
const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
for (const auto & reject : rejects) {
candidates->data[reject.index].logit = -INFINITY;
}
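Setting a rejected candidate's logit to -INFINITY is enough to exclude it from sampling, because exp(-inf) contributes exactly zero probability once the logits go through softmax. A tiny standalone illustration of that effect (not llama.cpp code):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> logits = {2.0f, 1.0f, 0.5f};
    logits[1] = -INFINITY; // "reject" the second candidate

    float max_logit = -INFINITY;
    for (float l : logits) max_logit = std::max(max_logit, l);

    float sum = 0.0f;
    std::vector<float> probs(logits.size());
    for (size_t i = 0; i < logits.size(); i++) {
        probs[i] = std::exp(logits[i] - max_logit); // exp(-inf) == 0
        sum += probs[i];
    }
    for (size_t i = 0; i < logits.size(); i++) {
        printf("p[%zu] = %.4f\n", i, probs[i] / sum); // p[1] comes out as 0.0000
    }
    return 0;
}
```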

@@ -3725,7 +3709,7 @@ void llama_free(struct llama_context * ctx) {
int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
const llama_model_quantize_params *params) {
const llama_model_quantize_params * params) {
try {
llama_model_quantize_internal(fname_inp, fname_out, params);
return 0;
@@ -4343,8 +4327,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
GGML_UNUSED(n_token_capacity);
GGML_UNUSED(n_token_count_out);


// TODO: implement with GGUF format
// TODO: implement with GGUF format
return true;
}

@@ -4389,7 +4372,6 @@ int llama_eval(
return 0;
}


int llama_eval_embd(
struct llama_context * ctx,
const float * embd,
1 change: 1 addition & 0 deletions gguf-util.h
@@ -122,6 +122,7 @@ struct gguf_file {

template<typename T>
void write_val(const std::string & key, enum gguf_type type, const T & val) {
static_assert(std::is_fundamental<T>::value, "T must be a primitive type");
write_str(key);
fwrite((const char *) &type, sizeof(type), 1, fp);
fwrite((const char *) &val, sizeof(val), 1, fp);
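The added static_assert restricts write_val to primitive types, which lines up with the gguf-llama.cpp change above that routes GGUF_TYPE_STRING values through write_str instead of write_val<std::string>: calling fwrite on the address of a std::string would dump the object representation rather than the character data. Below is a reduced sketch of that split; the length-prefixed layout and the exact write_str overloads are assumptions, not the file's real implementation.

```cpp
#include <cstdint>
#include <cstdio>
#include <string>
#include <type_traits>

#include "ggml.h" // for enum gguf_type

// Reduced sketch of the write_val / write_str split (illustrative only).
struct gguf_file_sketch {
    FILE * fp = nullptr;

    // length-prefixed string (layout assumed for the sketch)
    void write_str(const std::string & s) {
        const uint32_t n = (uint32_t) s.size();
        fwrite(&n, sizeof(n), 1, fp);
        fwrite(s.data(), 1, n, fp);
    }

    template<typename T>
    void write_val(const std::string & key, enum gguf_type type, const T & val) {
        // rejects std::string and other non-primitive types at compile time
        static_assert(std::is_fundamental<T>::value, "T must be a primitive type");
        write_str(key);
        fwrite((const char *) &type, sizeof(type), 1, fp);
        fwrite((const char *) &val,  sizeof(val),  1, fp);
    }

    // strings take the separate, length-prefixed path
    void write_str(const std::string & key, enum gguf_type type, const std::string & val) {
        write_str(key);
        fwrite((const char *) &type, sizeof(type), 1, fp);
        write_str(val);
    }
};
```

With this split, an accidental write_val(key, GGUF_TYPE_STRING, some_std_string) now fails at compile time instead of silently writing garbage.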
