diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6bb1a75dc4c335..4c0df74e9243c5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -20,6 +20,7 @@ add_library(llama
             llama-kv-cache.cpp
             llama-mmap.cpp
             llama-model.cpp
+            llama-model-loader.cpp
             llama-sampling.cpp
             llama-vocab.cpp
             unicode.h
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index a793b287c29ed6..3a76b71a4e18eb 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -69,12 +69,13 @@ struct llama_hparams {
     uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size        = 0;
 
-    float rope_attn_factor = 1.0f;
-    float rope_freq_base_train;
-    float rope_freq_scale_train;
-    uint32_t n_ctx_orig_yarn;
-    float rope_yarn_log_mul;
-    int rope_sections[4]; // TODO: actually this should be std::array (I was wrong)
+    float    rope_attn_factor = 1.0f;
+    float    rope_freq_base_train;
+    float    rope_freq_scale_train;
+    uint32_t n_ctx_orig_yarn;
+    float    rope_yarn_log_mul;
+
+    std::array<int, 4> rope_sections;
 
     // for State Space Models
     uint32_t ssm_d_conv = 0;
diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp
index f8659796f0d655..ca9f1ea8e7e38c 100644
--- a/src/llama-impl.cpp
+++ b/src/llama-impl.cpp
@@ -2,9 +2,11 @@
 #include "llama.h"
 
+#include <cinttypes>
 #include 
 #include 
 #include 
+#include <sstream>
 
 struct llama_logger_state {
     ggml_log_callback log_callback = llama_log_callback_default;
@@ -89,3 +91,75 @@ std::string format(const char * fmt, ...) {
     va_end(ap);
     return std::string(buf.data(), size);
 }
+
+std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
+    for (size_t i = 1; i < ne.size(); i++) {
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
+    }
+    return buf;
+}
+
+std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
+    char buf[256];
+    snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
+    }
+    return buf;
+}
+
+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+        default:                return format("unknown type %d", type);
+    }
+}
+
+std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\");
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???";
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", ";
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+    }
+}
diff --git a/src/llama-impl.h b/src/llama-impl.h
index fbf88039d3b99f..12d1fb0828d3a9 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -3,6 +3,7 @@
 #include "ggml.h" // for ggml_log_level
 
 #include <string>
+#include <vector>
 
 #ifdef __GNUC__
 #ifdef __MINGW32__
@@ -33,6 +34,12 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 // helpers
 //
 
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
 struct time_meas {
     time_meas(int64_t & t_acc, bool disable = false);
     ~time_meas();
@@ -47,3 +54,8 @@ void replace_all(std::string & s, const std::string & search, const std::string
 // TODO: rename to llama_format ?
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
 std::string format(const char * fmt, ...);
+
+std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
+std::string llama_format_tensor_shape(const struct ggml_tensor * t);
+
+std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
new file mode 100644
index 00000000000000..87ffdc5995ecc3
--- /dev/null
+++ b/src/llama-model-loader.cpp
@@ -0,0 +1,1008 @@
+#include "llama-model-loader.h"
+
+#include "ggml.h"
+
+#include 
+#include 
+
+const char * llama_file_version_name(llama_fver version) {
+    switch (version) {
+        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
+        case GGUF_FILE_VERSION_V2: return "GGUF V2";
+        case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
+    }
+
+    return "unknown";
+}
+
+namespace GGUFMeta {
+    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
+    struct GKV_Base_Type {
+        static constexpr gguf_type gt = gt_;
+
+        static T getter(const gguf_context * ctx, const int kid) {
+            return gfun(ctx, kid);
+        }
+    };
+
+    template<typename T> struct GKV_Base;
+
+    template<> struct GKV_Base<bool        >: GKV_Base_Type<bool,         GGUF_TYPE_BOOL,    gguf_get_val_bool> {};
+    template<> struct GKV_Base<uint8_t     >: GKV_Base_Type<uint8_t,      GGUF_TYPE_UINT8,   gguf_get_val_u8  > {};
+    template<> struct GKV_Base<uint16_t    >: GKV_Base_Type<uint16_t,     GGUF_TYPE_UINT16,  gguf_get_val_u16 > {};
+    template<> struct GKV_Base<uint32_t    >: GKV_Base_Type<uint32_t,     GGUF_TYPE_UINT32,  gguf_get_val_u32 > {};
+    template<> struct GKV_Base<uint64_t    >: GKV_Base_Type<uint64_t,     GGUF_TYPE_UINT64,  gguf_get_val_u64 > {};
+    template<> struct GKV_Base<int8_t      >: GKV_Base_Type<int8_t,       GGUF_TYPE_INT8,    gguf_get_val_i8  > {};
+    template<> struct GKV_Base<int16_t     >: GKV_Base_Type<int16_t,      GGUF_TYPE_INT16,   gguf_get_val_i16 > {};
+    template<> struct GKV_Base<int32_t     >: GKV_Base_Type<int32_t,      GGUF_TYPE_INT32,   gguf_get_val_i32 > {};
+    template<> struct GKV_Base<int64_t     >: GKV_Base_Type<int64_t,      GGUF_TYPE_INT64,   gguf_get_val_i64 > {};
+    template<> struct GKV_Base<float       >: GKV_Base_Type<float,        GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
+    template<> struct GKV_Base<double      >: GKV_Base_Type<double,       GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
+    template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING,  gguf_get_val_str > {};
+
+    template<> struct GKV_Base<std::string> {
+        static constexpr gguf_type gt = GGUF_TYPE_STRING;
+
+        static std::string getter(const gguf_context * ctx, const int kid) {
+            return gguf_get_val_str(ctx, kid);
+        }
+    };
+
+    struct ArrayInfo {
+        const gguf_type gt;
+        const size_t length;
+        const void * data;
+    };
+
+    template<> struct GKV_Base<ArrayInfo> {
+        public:
+        static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
+        static ArrayInfo getter(const gguf_context *ctx, const int k) {
+            return ArrayInfo {
+                gguf_get_arr_type(ctx, k),
+                size_t(gguf_get_arr_n(ctx, k)),
+                gguf_get_arr_data(ctx, k),
+            };
+        }
+    };
+
+    template<typename T>
+    class GKV : public GKV_Base<T> {
+        GKV() = delete;
+
+        public:
+        static T get_kv(const gguf_context * ctx, const int k) {
+            const enum gguf_type kt = gguf_get_kv_type(ctx, k);
+
+            if (kt != GKV::gt) {
+                throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
+                    gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
+            }
+            return GKV::getter(ctx, k);
+        }
+
+        static const char * override_type_to_str(const llama_model_kv_override_type ty) {
+            switch (ty) {
+                case LLAMA_KV_OVERRIDE_TYPE_BOOL:  return "bool";
+                case LLAMA_KV_OVERRIDE_TYPE_INT:   return "int";
+                case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
+                case LLAMA_KV_OVERRIDE_TYPE_STR:   return "str";
+            }
+            return "unknown";
+        }
+
+        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
+            if (!ovrd) { return false; }
+            if (ovrd->tag == expected_type) {
+                LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
+                    __func__, override_type_to_str(ovrd->tag), ovrd->key);
+                switch (ovrd->tag) {
+                    case LLAMA_KV_OVERRIDE_TYPE_BOOL:  {
+                        LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
+                    } break;
+                    case LLAMA_KV_OVERRIDE_TYPE_INT:   {
+                        LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
+                    } break;
+                    case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
+                        LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
+                    } break;
+                    case LLAMA_KV_OVERRIDE_TYPE_STR:   {
+                        LLAMA_LOG_INFO("%s\n", ovrd->val_str);
+                    } break;
+                    default:
+                        // Shouldn't be possible to end up here, but just in case...
+ throw std::runtime_error( + format("Unsupported attempt to override %s type for metadata key %s\n", + override_type_to_str(ovrd->tag), ovrd->key)); + } + return true; + } + LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", + __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); + return false; + } + + template + static typename std::enable_if::value, bool>::type + try_override(OT & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) { + target = ovrd->val_bool; + return true; + } + return false; + } + + template + static typename std::enable_if::value && std::is_integral::value, bool>::type + try_override(OT & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { + target = ovrd->val_i64; + return true; + } + return false; + } + + template + static typename std::enable_if::value, bool>::type + try_override(T & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) { + target = ovrd->val_f64; + return true; + } + return false; + } + + template + static typename std::enable_if::value, bool>::type + try_override(T & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) { + target = ovrd->val_str; + return true; + } + return false; + } + + static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + if (try_override(target, ovrd)) { + return true; + } + if (k < 0) { return false; } + target = get_kv(ctx, k); + return true; + } + + static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + return set(ctx, gguf_find_key(ctx, key), target, ovrd); + } + + static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + return set(ctx, key.c_str(), target, ovrd); + } + }; +} + + template + typename std::enable_if::value, bool>::type + llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) { + const int kid = gguf_find_key(meta.get(), key.c_str()); + + if (kid < 0) { + if (required) { + throw std::runtime_error(format("key not found in model: %s", key.c_str())); + } + return false; + } + + struct GGUFMeta::ArrayInfo arr_info = + GGUFMeta::GKV::get_kv(meta.get(), kid); + + + result = arr_info.length; + return true; + } + + template + typename std::enable_if::value, bool>::type + llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) { + return get_arr_n(llm_kv(kid), result, required); + } + + template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required); + + template + bool llama_model_loader::get_arr(const std::string & key, std::vector & result, bool required) { + const int kid = gguf_find_key(meta.get(), key.c_str()); + + if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) { + if (required) { + throw std::runtime_error(format("array key not found in model: %s", key.c_str())); + } + return false; + } + + struct GGUFMeta::ArrayInfo arr_info = + GGUFMeta::GKV::get_kv(meta.get(), kid); + + switch (arr_info.gt) { + case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; + case GGUF_TYPE_INT32: GGML_ASSERT( + (std::is_same::value) || + (std::is_same::value)); break; 
+ default: + throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str())); + } + + result.resize(arr_info.length); + result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length); + + return true; + } + + template + bool llama_model_loader::get_arr(const std::string & key, std::array & result, bool required) { + const int kid = gguf_find_key(meta.get(), key.c_str()); + + if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) { + if (required) { + throw std::runtime_error(format("array key not found in model: %s", key.c_str())); + } + return false; + } + + struct GGUFMeta::ArrayInfo arr_info = + GGUFMeta::GKV::get_kv(meta.get(), kid); + + switch (arr_info.gt) { + case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; + case GGUF_TYPE_INT32: GGML_ASSERT( + (std::is_same::value) || + (std::is_same::value)); break; + default: + throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str())); + } + + if (arr_info.length > N_MAX) { + throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX)); + } + + std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin()); + + return true; + } + + template + bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) { + return get_arr(llm_kv(kid), result, required); + } + + template + bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { + auto it = kv_overrides.find(key); + + const struct llama_model_kv_override * override = + it != kv_overrides.end() ? &it->second : nullptr; + + const bool found = GGUFMeta::GKV::set(meta.get(), key, result, override); + + if (required && !found) { + throw std::runtime_error(format("key not found in model: %s", key.c_str())); + } + + return found; + } + + template + bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) { + return get_key(llm_kv(kid), result, required); + } + + template bool llama_model_loader::get_key (enum llm_kv kid, bool & result, bool required); + template bool llama_model_loader::get_key (enum llm_kv kid, float & result, bool required); + template bool llama_model_loader::get_key (enum llm_kv kid, uint32_t & result, bool required); + template bool llama_model_loader::get_key(enum llm_kv kid, std::string & result, bool required); + + template<> + bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) { + uint32_t tmp; + const bool found = get_key(kid, tmp, required); + if (found) { + result = (enum llama_pooling_type) tmp; + } else { + result = LLAMA_POOLING_TYPE_UNSPECIFIED; + } + return found; + } + + // get array of n <= N_MAX elements, or a single element repeated n times + template + bool llama_model_loader::get_key_or_arr(const std::string & key, std::array & result, uint32_t n, bool required) { + const int kid = gguf_find_key(meta.get(), key.c_str()); + + if (kid < 0) { + if (required) { + throw std::runtime_error(format("key not found in model: %s", key.c_str())); + } + return false; + } + + if (n > N_MAX) { + throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str())); + } + + if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) { + struct GGUFMeta::ArrayInfo arr_info = + GGUFMeta::GKV::get_kv(meta.get(), kid); + + if (n != arr_info.length) { + throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", 
key.c_str(), n, (uint32_t) arr_info.length)); + } + + return get_arr(key, result, required); + } + + T value; + + bool ok = get_key(key, value, required); + if (!ok) { + return false; + } + + for (uint32_t i = 0; i < n; i++) { + result[i] = value; + } + + return true; + } + + template + bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) { + return get_key_or_arr(llm_kv(kid), result, n, required); + } + + // TODO: this is not very clever - figure out something better + template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); + template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); + +llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) { + int trace = 0; + if (getenv("LLAMA_TRACE")) { + trace = atoi(getenv("LLAMA_TRACE")); + } + + if (param_overrides_p != nullptr) { + for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) { + kv_overrides.insert({std::string(p->key), *p}); + } + } + + struct ggml_context * ctx = NULL; + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + + meta.reset(gguf_init_from_file(fname.c_str(), params)); + if (!meta) { + throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); + } + + get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); + llm_kv = LLM_KV(llm_arch_from_string(arch_name)); + + files.emplace_back(new llama_file(fname.c_str(), "rb")); + contexts.emplace_back(ctx); + + // Save tensors data offset of the main file. + // For subsidiary files, `meta` tensor data offset must not be used, + // so we build a unified tensors index for weights. 
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string tensor_name = std::string(cur->name); + // make sure there is no duplicated tensor names + if (weights_map.find(tensor_name) != weights_map.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); + } + n_elements += ggml_nelements(cur); + n_bytes += ggml_nbytes(cur); + weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur)); + } + uint16_t n_split = 0; + get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); + + // Load additional GGML contexts + if (n_split > 1) { + uint16_t idx = 0; + get_key(llm_kv(LLM_KV_SPLIT_NO), idx); + if (idx != 0) { + throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx)); + } + + std::vector split_prefix(llama_path_max(), 0); + if (!llama_split_prefix(split_prefix.data(), split_prefix.size(), fname.c_str(), idx, n_split)) { + throw std::runtime_error(format("invalid split file: %s", fname.c_str())); + } + + if (trace > 0) { + LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); + } + + std::vector split_path(llama_path_max(), 0); + for (idx = 1; idx < n_split; idx++) { + llama_split_path(split_path.data(), split_path.size(), split_prefix.data(), idx, n_split); + + struct gguf_init_params split_params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + gguf_context_ptr ctx_gguf { gguf_init_from_file(split_path.data(), split_params) }; + if (!ctx_gguf) { + throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path.data())); + } + + files.emplace_back(new llama_file(split_path.data(), "rb")); + contexts.emplace_back(ctx); + + // Save tensors data offset info of the shard. 
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string tensor_name = std::string(cur->name); + // make sure there is no duplicated tensor names + if (weights_map.find(tensor_name) != weights_map.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); + } + n_elements += ggml_nelements(cur); + n_bytes += ggml_nbytes(cur); + weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur)); + } + } + + get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); + + // sanity check + { + const int n_tensors_loaded = (int) weights_map.size(); + if (n_tensors != n_tensors_loaded) { + throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); + } + } + + LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); + } + + n_kv = gguf_get_n_kv(meta.get()); + n_tensors = weights_map.size(); + + fver = (enum llama_fver) gguf_get_version(meta.get()); + + LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", + __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); + + // determine file type based on the number of tensors for each quantization and print meta data + // TODO: make optional + { + std::map n_type; + + uint32_t n_type_max = 0; + enum ggml_type type_max = GGML_TYPE_F32; + + for (const auto & it : weights_map) { + const llama_tensor_weight & w = it.second; + const ggml_tensor * tensor = w.tensor; + + enum ggml_type type = tensor->type; + + n_type[type]++; + + if (n_type_max < n_type[type]) { + n_type_max = n_type[type]; + type_max = type; + } + + if (trace > 0) { + const uint16_t sid = w.idx; + LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); + } + } + + switch (type_max) { + case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; + case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; + case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break; + case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; + case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; + case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; + case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; + case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; + case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; + case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; + case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; + case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; + case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; + case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break; + case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break; + case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; + case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; + case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break; + case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; + case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; + case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; + case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; + case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; + case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; + 
default: + { + LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); + ftype = LLAMA_FTYPE_ALL_F32; + } break; + } + + // this is a way to mark that we have "guessed" the file type + ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); + + { + const int kid = gguf_find_key(meta.get(), "general.file_type"); // TODO: use LLM_KV + if (kid >= 0) { + ftype = (llama_ftype) gguf_get_val_u32(meta.get(), kid); + } + } + + LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + + for (int i = 0; i < n_kv; i++) { + const char * name = gguf_get_key(meta.get(), i); + const enum gguf_type type = gguf_get_kv_type(meta.get(), i); + const std::string type_name = + type == GGUF_TYPE_ARRAY + ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i)) + : gguf_type_name(type); + + std::string value = gguf_kv_to_str(meta.get(), i); + const size_t MAX_VALUE_LEN = 40; + if (value.size() > MAX_VALUE_LEN) { + value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); + } + replace_all(value, "\n", "\\n"); + + LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + } + + // print type counts + for (auto & kv : n_type) { + if (kv.second == 0) { + continue; + } + + LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + } + } + + if (!llama_mmap::SUPPORTED) { + LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); + use_mmap = false; + } + + this->use_mmap = use_mmap; + this->check_tensors = check_tensors; +} + +std::string llama_model_loader::get_arch_name() const { + return arch_name; +} + +enum llm_arch llama_model_loader::get_arch() const { + return llm_kv.arch; +} + +const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const { + auto pos = weights_map.find(name); + if (pos != weights_map.end()) { + return &pos->second; + } + + return nullptr; +} + +const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const { + const llama_tensor_weight * weight = get_weight(name); + if (!weight) { + throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name)); + } + return *weight; +} + +struct ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const { + const auto * weight = get_weight(name); + if (!weight) { + return nullptr; + } + return weight->tensor; +} + +struct ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const { + struct ggml_tensor * tensor = get_tensor_meta(name.c_str()); + if (!tensor) { + throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); + } + return tensor; +} + +const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector & ne, bool required) const { + const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); + + if (cur == NULL) { + if (!required) { + return NULL; + } + throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); + } + + { + bool is_ok = true; + for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { + if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) { + is_ok = false; + break; + } + } + if (!is_ok) { + throw std::runtime_error( + format("%s: tensor '%s' has wrong shape; expected %s, got %s", + __func__, name.c_str(), + 
llama_format_tensor_shape(ne).c_str(), + llama_format_tensor_shape(cur).c_str())); + } + } + + return cur; +} + +struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list & ne, int flags) { + const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED)); + + if (cur == NULL) { + return NULL; + } + + bool duplicated = flags & TENSOR_DUPLICATED; + + struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); + ggml_set_name(tensor, ggml_get_name(cur)); + + if (duplicated) { + size_data += ggml_nbytes(cur); + } else { + n_created++; + } + + return tensor; + +} + +struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list & ne, size_t offset, bool required) { + const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); + + if (cur == NULL) { + return NULL; + } + + if (cur->type != base->type) { + throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type))); + } + + std::array dims; + for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { + dims[i] = i < ne.size() ? ne.begin()[i] : 1; + } + + struct ggml_tensor * tensor = ggml_view_4d(ctx, base, + dims[0], dims[1], dims[2], dims[3], + cur->nb[1], cur->nb[2], cur->nb[3], + offset); + + ggml_set_name(tensor, name.c_str()); + + n_created++; + + return tensor; +} + +void llama_model_loader::done_getting_tensors() const { + if (n_created != n_tensors) { + throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); + } +} + +void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) { + if (use_mmap) { + mappings.reserve(files.size()); + mmaps_used.reserve(files.size()); + for (const auto & file : files) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); + std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? 
-1 : 0, is_numa_fn())); + mmaps_used.emplace_back(mapping->size(), 0); + if (mlock_mmaps) { + std::unique_ptr mlock_mmap(new llama_mlock()); + mlock_mmap->init(mapping->addr()); + mlock_mmaps->emplace_back(std::move(mlock_mmap)); + } + mappings.emplace_back(std::move(mapping)); + } + } + + // compute the total size of all tensors for progress reporting + for (const auto & it : weights_map) { + size_data += ggml_nbytes(it.second.tensor); + } +} + +void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { + GGML_ASSERT(!mappings.empty()); + const auto & mapping = mappings.at(idx); + + *first = mapping->size(); + *last = 0; + *addr = mapping->addr(); + for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { + const auto * weight = get_weight(ggml_get_name(tensor)); + if (!weight || weight->idx != idx) { + continue; + } + *first = std::min(*first, weight->offs); + *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); + } +} + +void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { + const auto & w = require_weight(ggml_get_name(cur)); + + if (use_mmap) { + const auto & mapping = mappings.at(w.idx); + if (cur->data == nullptr) { + cur->data = (uint8_t *)mapping->addr() + w.offs; + } else { + memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur)); + } + } else { + GGML_ASSERT(cur->data != nullptr); + GGML_ASSERT(w.idx < files.size()); + const auto & file = files.at(w.idx); + file->seek(w.offs, SEEK_SET); + file->read_raw(cur->data, ggml_nbytes(cur)); + } + + if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) { + throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); + } +} + +bool llama_model_loader::load_all_data( + struct ggml_context * ctx, + llama_buf_map & bufs, + llama_mlocks * lmlocks, + llama_progress_callback progress_callback, + void * progress_callback_user_data) { + GGML_ASSERT(size_data != 0 && "call init_mappings() first"); + + std::vector> read_buf; + std::vector>> validation_result; + + // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. + // NVMe raid configurations might require more / larger buffers. + constexpr size_t n_buffers = 4; + constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB + + std::vector host_buffers; + std::vector events; + std::vector host_ptrs; + size_t buffer_idx = 0; // buffer to use for async loads + ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t { + if (use_mmap || check_tensors) { + return nullptr; + } + // When not using mmaped io use async uploads from pinned memory to GPU memory. + // First determine if the backend supports the necessary features for async uploads. + auto * buf = bufs.count(0) ? 
bufs.at(0) : nullptr; + if (!buf) { + LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func); + return nullptr; + } + + auto * buft = ggml_backend_buffer_get_type(buf); + auto * dev = ggml_backend_buft_get_device(buft); + if (!dev) { + LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func, + ggml_backend_buft_name(buft)); + return nullptr; + } + + if (buft != ggml_backend_dev_buffer_type(dev)) { + LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func, + ggml_backend_buft_name(buft), ggml_backend_dev_name(dev)); + return nullptr; + } + + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) { + LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func, + ggml_backend_dev_name(dev)); + return nullptr; + } + + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (!host_buft) { + LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func, + ggml_backend_dev_name(dev)); + return nullptr; + } + + // If the backend is supported, create pinned memory buffers and events for synchronisation. + for (size_t idx = 0; idx < n_buffers; ++idx) { + auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); + if (!buf) { + LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, + ggml_backend_dev_name(dev)); + return nullptr; + } + + host_buffers.emplace_back(buf); + host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); + + auto * event = ggml_backend_event_new(dev); + if (!event) { + LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func, + ggml_backend_dev_name(dev)); + return nullptr; + } + + events.emplace_back(event); + } + + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (!backend) { + LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func, + ggml_backend_dev_name(dev)); + return nullptr; + } + + return backend; + }(__func__); + + if (upload_backend) { + LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__, + ggml_backend_dev_name(ggml_backend_get_device(upload_backend)), + ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))), + ggml_backend_name(upload_backend)); + } + + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { + const auto * weight = get_weight(ggml_get_name(cur)); + if (weight == nullptr) { + // this can happen with split experts models + continue; + } + + if (progress_callback) { + if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { + return false; + } + } + + size_t n_size = ggml_nbytes(cur); + + if (use_mmap) { + const auto & mapping = mappings.at(weight->idx); + ggml_backend_buffer_t buf_mmap = nullptr; + if (bufs.count(weight->idx)) { + buf_mmap = bufs.at(weight->idx); + } + uint8_t * data = (uint8_t *) mapping->addr() + weight->offs; + + if (check_tensors) { + validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { + return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); + })); + } + + GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated + if (buf_mmap && cur->data == nullptr) { + ggml_backend_tensor_alloc(buf_mmap, cur, data); + if (lmlocks) { + const 
auto & lmlock = lmlocks->at(weight->idx); + lmlock->grow_to(weight->offs + n_size); + } + + auto & mmap_used = mmaps_used[weight->idx]; + mmap_used.first = std::min(mmap_used.first, weight->offs); + mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); + } else { + ggml_backend_tensor_set(cur, data, 0, n_size); + } + } else { + const auto & file = files.at(weight->idx); + if (ggml_backend_buffer_is_host(cur->buffer)) { + file->seek(weight->offs, SEEK_SET); + file->read_raw(cur->data, n_size); + if (check_tensors) { + validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { + return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); + })); + } + } else { + // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. + if (upload_backend) { + file->seek(weight->offs, SEEK_SET); + + size_t bytes_read = 0; + + while (bytes_read < n_size) { + size_t read_iteration = std::min(buffer_size, n_size - bytes_read); + + ggml_backend_event_synchronize(events[buffer_idx]); + file->read_raw(host_ptrs[buffer_idx], read_iteration); + ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + ggml_backend_event_record(events[buffer_idx], upload_backend); + + bytes_read += read_iteration; + ++buffer_idx; + buffer_idx %= n_buffers; + } + } else { + read_buf.resize(n_size); + file->seek(weight->offs, SEEK_SET); + file->read_raw(read_buf.data(), n_size); + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); + if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { + throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); + } + } + } + } + + size_done += n_size; + } + + // free temporary resources used for async uploads + for (auto * event : events) { + ggml_backend_event_synchronize(event); + ggml_backend_event_free(event); + } + for (auto * buf : host_buffers) { + ggml_backend_buffer_free(buf); + } + ggml_backend_free(upload_backend); + + // check validation results + bool validation_failed = false; + for (auto & future : validation_result) { + auto result = future.get(); + if (!result.second) { + LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first)); + validation_failed = true; + } + } + if (validation_failed) { + throw std::runtime_error("found tensors with invalid data"); + } + + // check if this is the last call and do final cleanup + if (size_done >= size_data) { + // unmap offloaded tensors and metadata + if (use_mmap) { + for (uint32_t idx = 0; idx < mappings.size(); idx++) { + const auto & mmap_used = mmaps_used.at(idx); + auto & mapping = mappings.at(idx); + mapping->unmap_fragment(0, mmap_used.first); + if (mmap_used.second != 0) { + mapping->unmap_fragment(mmap_used.second, mapping->size()); + } + } + } + if (progress_callback) { + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. 
+ return progress_callback(1.0f, progress_callback_user_data); + } + } + + return true; +} diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h new file mode 100644 index 00000000000000..c9f274fc3976c4 --- /dev/null +++ b/src/llama-model-loader.h @@ -0,0 +1,157 @@ +#pragma once + +#include "llama.h" + +#include "llama-impl.h" +#include "llama-arch.h" +#include "llama-mmap.h" + +#include "ggml-cpp.h" + +#include +#include +#include + +using llama_buf_map = std::unordered_map; + +enum llama_fver { + GGUF_FILE_VERSION_V1 = 1, + GGUF_FILE_VERSION_V2 = 2, + GGUF_FILE_VERSION_V3 = 3, +}; + +const char * llama_file_version_name(llama_fver version); + +struct llama_model_loader { + // Holds information on a model weight + struct llama_tensor_weight { + uint16_t idx; // source file index + size_t offs; // tensor data offset in the original file + + ggml_tensor * tensor; + + llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { + const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor)); + if (tensor_idx < 0) { + throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor))); + } + + offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); + if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) { + throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor))); + } + } + }; + + // custom comparator to sort weights more nicely by layer + struct weight_name_comparer { + bool operator()(const std::string & a, const std::string & b) const { + int a_layer = -1; + int b_layer = -1; + sscanf(a.c_str(), "blk.%d.", &a_layer); + sscanf(b.c_str(), "blk.%d.", &b_layer); + if (a_layer != b_layer) { + return a_layer < b_layer; + } + return a < b; + } + }; + + static const int TENSOR_NOT_REQUIRED = 1; + static const int TENSOR_DUPLICATED = 2; + + int n_kv = 0; + int n_tensors = 0; + int n_created = 0; + + uint64_t n_elements = 0; + size_t n_bytes = 0; + + bool use_mmap = false; + bool check_tensors; + + llama_files files; + llama_ftype ftype; + llama_fver fver; + + llama_mmaps mappings; + + std::map weights_map; + std::unordered_map kv_overrides; + + gguf_context_ptr meta; + std::vector contexts; + + std::string arch_name; + LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); + + size_t size_done = 0; + size_t size_data = 0; + std::vector> mmaps_used; + + llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p); + + template + typename std::enable_if::value, bool>::type + get_arr_n(const std::string & key, T & result, bool required = true); + + template + typename std::enable_if::value, bool>::type + get_arr_n(enum llm_kv kid, T & result, bool required = true); + + template + bool get_arr(const std::string & key, std::vector & result, bool required = true); + + template + bool get_arr(const std::string & key, std::array & result, bool required = true); + + template + bool get_arr(enum llm_kv kid, T & result, bool required = true); + + template + bool get_key(const std::string & key, T & result, bool required = true); + + template + bool get_key(enum llm_kv kid, T & result, bool required = true); + + template + bool get_key_or_arr(const std::string & key, std::array & result, uint32_t n, bool required = true); + + template + bool get_key_or_arr(enum llm_kv kid, 
T & result, uint32_t n, bool required = true); + + std::string get_arch_name() const; + + enum llm_arch get_arch() const; + + const llama_tensor_weight * get_weight(const char * name) const; + + const llama_tensor_weight & require_weight(const char * name) const; + + struct ggml_tensor * get_tensor_meta(const char * name) const; + + struct ggml_tensor * require_tensor_meta(const std::string & name) const; + + const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector & ne, bool required) const; + + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list & ne, int flags = 0); + + struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list & ne, size_t offset, bool required = true); + + void done_getting_tensors() const; + + void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr); + + void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const; + + // for backwards compatibility, does not support ggml-backend + void load_data_for(struct ggml_tensor * cur) const; + + // Returns false if cancelled by progress_callback + bool load_all_data( + struct ggml_context * ctx, + llama_buf_map & bufs, + llama_mlocks * lmlocks, + llama_progress_callback progress_callback, + void * progress_callback_user_data); +}; diff --git a/src/llama.cpp b/src/llama.cpp index a09bed7548bacb..07215f76ca736d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6,6 +6,8 @@ #include "llama-vocab.h" #include "llama-sampling.h" #include "llama-kv-cache.h" +#include "llama-model-loader.h" +#include "llama-model.h" #include "unicode.h" // TODO: remove @@ -33,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -81,70 +82,10 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; } -static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { - switch (type) { - case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); - case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); - case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); - case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); - case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); - case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); - case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); - case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); - case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); - case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); - case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? 
"true" : "false"; - default: return format("unknown type %d", type); - } -} - -static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { - const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); - - switch (type) { - case GGUF_TYPE_STRING: - return gguf_get_val_str(ctx_gguf, i); - case GGUF_TYPE_ARRAY: - { - const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); - int arr_n = gguf_get_arr_n(ctx_gguf, i); - const void * data = gguf_get_arr_data(ctx_gguf, i); - std::stringstream ss; - ss << "["; - for (int j = 0; j < arr_n; j++) { - if (arr_type == GGUF_TYPE_STRING) { - std::string val = gguf_get_arr_str(ctx_gguf, i, j); - // escape quotes - replace_all(val, "\\", "\\\\"); - replace_all(val, "\"", "\\\""); - ss << '"' << val << '"'; - } else if (arr_type == GGUF_TYPE_ARRAY) { - ss << "???"; - } else { - ss << gguf_data_to_str(arr_type, data, j); - } - if (j < arr_n - 1) { - ss << ", "; - } - } - ss << "]"; - return ss.str(); - } - default: - return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); - } -} - // // llama helpers // -template -struct no_init { - T value; - no_init() { /* do nothing */ } -}; - // NOTE: avoid ever using this except for building the token_to_piece caches static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) { std::string piece; @@ -178,1094 +119,6 @@ static int llama_get_device_count(const llama_model & model) { // model loading and saving // -enum llama_fver { - GGUF_FILE_VERSION_V1 = 1, - GGUF_FILE_VERSION_V2 = 2, - GGUF_FILE_VERSION_V3 = 3, -}; - -static const char * llama_file_version_name(llama_fver version) { - switch (version) { - case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; - case GGUF_FILE_VERSION_V2: return "GGUF V2"; - case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)"; - } - - return "unknown"; -} - -static std::string llama_format_tensor_shape(const std::vector & ne) { - char buf[256]; - snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0)); - for (size_t i = 1; i < ne.size(); i++) { - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i)); - } - return buf; -} - -static std::string llama_format_tensor_shape(const struct ggml_tensor * t) { - char buf[256]; - snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]); - } - return buf; -} - -namespace GGUFMeta { - template - struct GKV_Base_Type { - static constexpr gguf_type gt = gt_; - - static T getter(const gguf_context * ctx, const int kid) { - return gfun(ctx, kid); - } - }; - - template struct GKV_Base; - - template<> struct GKV_Base: GKV_Base_Type {}; - template<> struct GKV_Base: GKV_Base_Type {}; - template<> struct GKV_Base: GKV_Base_Type {}; - template<> struct GKV_Base: GKV_Base_Type {}; - template<> struct GKV_Base: GKV_Base_Type {}; - template<> struct GKV_Base: GKV_Base_Type {}; - template<> struct GKV_Base: GKV_Base_Type {}; - template<> struct GKV_Base: GKV_Base_Type {}; - template<> struct GKV_Base: GKV_Base_Type {}; - template<> struct GKV_Base: GKV_Base_Type {}; - template<> struct GKV_Base: GKV_Base_Type {}; - template<> struct GKV_Base: GKV_Base_Type {}; - - template<> struct GKV_Base { - static constexpr gguf_type gt = GGUF_TYPE_STRING; - - static std::string getter(const gguf_context * ctx, const int kid) { - return gguf_get_val_str(ctx, kid); - } - }; - - struct ArrayInfo { - const gguf_type gt; - const size_t length; - 
const void * data; - }; - - template<> struct GKV_Base { - public: - static constexpr gguf_type gt = GGUF_TYPE_ARRAY; - static ArrayInfo getter(const gguf_context *ctx, const int k) { - return ArrayInfo { - gguf_get_arr_type(ctx, k), - size_t(gguf_get_arr_n(ctx, k)), - gguf_get_arr_data(ctx, k), - }; - } - }; - - template - class GKV : public GKV_Base { - GKV() = delete; - - public: - static T get_kv(const gguf_context * ctx, const int k) { - const enum gguf_type kt = gguf_get_kv_type(ctx, k); - - if (kt != GKV::gt) { - throw std::runtime_error(format("key %s has wrong type %s but expected type %s", - gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt))); - } - return GKV::getter(ctx, k); - } - - static const char * override_type_to_str(const llama_model_kv_override_type ty) { - switch (ty) { - case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool"; - case LLAMA_KV_OVERRIDE_TYPE_INT: return "int"; - case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float"; - case LLAMA_KV_OVERRIDE_TYPE_STR: return "str"; - } - return "unknown"; - } - - static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { - if (!ovrd) { return false; } - if (ovrd->tag == expected_type) { - LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", - __func__, override_type_to_str(ovrd->tag), ovrd->key); - switch (ovrd->tag) { - case LLAMA_KV_OVERRIDE_TYPE_BOOL: { - LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false"); - } break; - case LLAMA_KV_OVERRIDE_TYPE_INT: { - LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64); - } break; - case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { - LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64); - } break; - case LLAMA_KV_OVERRIDE_TYPE_STR: { - LLAMA_LOG_INFO("%s\n", ovrd->val_str); - } break; - default: - // Shouldn't be possible to end up here, but just in case... 
- throw std::runtime_error( - format("Unsupported attempt to override %s type for metadata key %s\n", - override_type_to_str(ovrd->tag), ovrd->key)); - } - return true; - } - LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", - __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); - return false; - } - - template - static typename std::enable_if::value, bool>::type - try_override(OT & target, const struct llama_model_kv_override * ovrd) { - if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) { - target = ovrd->val_bool; - return true; - } - return false; - } - - template - static typename std::enable_if::value && std::is_integral::value, bool>::type - try_override(OT & target, const struct llama_model_kv_override * ovrd) { - if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { - target = ovrd->val_i64; - return true; - } - return false; - } - - template - static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override * ovrd) { - if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) { - target = ovrd->val_f64; - return true; - } - return false; - } - - template - static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override * ovrd) { - if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) { - target = ovrd->val_str; - return true; - } - return false; - } - - static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { - if (try_override(target, ovrd)) { - return true; - } - if (k < 0) { return false; } - target = get_kv(ctx, k); - return true; - } - - static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { - return set(ctx, gguf_find_key(ctx, key), target, ovrd); - } - - static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { - return set(ctx, key.c_str(), target, ovrd); - } - }; -} - -using llama_buf_map = std::unordered_map; - -struct llama_model_loader { - int n_kv = 0; - int n_tensors = 0; - int n_created = 0; - - uint64_t n_elements = 0; - size_t n_bytes = 0; - - bool use_mmap = false; - bool check_tensors; - - llama_files files; - llama_ftype ftype; - llama_fver fver; - - llama_mmaps mappings; - - // Holds information on a model weight - struct llama_tensor_weight { - uint16_t idx; // source file index - size_t offs; // tensor data offset in the original file - - ggml_tensor * tensor; - - llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { - const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor)); - if (tensor_idx < 0) { - throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor))); - } - - offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); - if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) { - throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor))); - } - } - }; - - // custom comparator to sort weights more nicely by layer - struct weight_name_comparer { - bool operator()(const std::string & a, const std::string & b) const { - int a_layer = -1; - int b_layer = -1; - 
sscanf(a.c_str(), "blk.%d.", &a_layer); - sscanf(b.c_str(), "blk.%d.", &b_layer); - if (a_layer != b_layer) { - return a_layer < b_layer; - } - return a < b; - } - }; - - std::map weights_map; - std::unordered_map kv_overrides; - - gguf_context_ptr meta; - std::vector contexts; - - std::string arch_name; - LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); - - llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) { - int trace = 0; - if (getenv("LLAMA_TRACE")) { - trace = atoi(getenv("LLAMA_TRACE")); - } - - if (param_overrides_p != nullptr) { - for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) { - kv_overrides.insert({std::string(p->key), *p}); - } - } - - struct ggml_context * ctx = NULL; - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx, - }; - - meta.reset(gguf_init_from_file(fname.c_str(), params)); - if (!meta) { - throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); - } - - get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); - llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - - files.emplace_back(new llama_file(fname.c_str(), "rb")); - contexts.emplace_back(ctx); - - // Save tensors data offset of the main file. - // For subsidiary files, `meta` tensor data offset must not be used, - // so we build a unified tensors index for weights. - for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { - std::string tensor_name = std::string(cur->name); - // make sure there is no duplicated tensor names - if (weights_map.find(tensor_name) != weights_map.end()) { - throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); - } - n_elements += ggml_nelements(cur); - n_bytes += ggml_nbytes(cur); - weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur)); - } - uint16_t n_split = 0; - get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); - - // Load additional GGML contexts - if (n_split > 1) { - uint16_t idx = 0; - get_key(llm_kv(LLM_KV_SPLIT_NO), idx); - if (idx != 0) { - throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx)); - } - - std::vector split_prefix(llama_path_max(), 0); - if (!llama_split_prefix(split_prefix.data(), split_prefix.size(), fname.c_str(), idx, n_split)) { - throw std::runtime_error(format("invalid split file: %s", fname.c_str())); - } - - if (trace > 0) { - LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); - } - - std::vector split_path(llama_path_max(), 0); - for (idx = 1; idx < n_split; idx++) { - llama_split_path(split_path.data(), split_path.size(), split_prefix.data(), idx, n_split); - - struct gguf_init_params split_params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx, - }; - gguf_context_ptr ctx_gguf { gguf_init_from_file(split_path.data(), split_params) }; - if (!ctx_gguf) { - throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path.data())); - } - - files.emplace_back(new llama_file(split_path.data(), "rb")); - contexts.emplace_back(ctx); - - // Save tensors data offset info of the shard. 
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { - std::string tensor_name = std::string(cur->name); - // make sure there is no duplicated tensor names - if (weights_map.find(tensor_name) != weights_map.end()) { - throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); - } - n_elements += ggml_nelements(cur); - n_bytes += ggml_nbytes(cur); - weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur)); - } - } - - get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); - - // sanity check - { - const int n_tensors_loaded = (int) weights_map.size(); - if (n_tensors != n_tensors_loaded) { - throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); - } - } - - LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); - } - - n_kv = gguf_get_n_kv(meta.get()); - n_tensors = weights_map.size(); - - fver = (enum llama_fver) gguf_get_version(meta.get()); - - LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", - __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); - - // determine file type based on the number of tensors for each quantization and print meta data - // TODO: make optional - { - std::map n_type; - - uint32_t n_type_max = 0; - enum ggml_type type_max = GGML_TYPE_F32; - - for (const auto & it : weights_map) { - const llama_tensor_weight & w = it.second; - const ggml_tensor * tensor = w.tensor; - - enum ggml_type type = tensor->type; - - n_type[type]++; - - if (n_type_max < n_type[type]) { - n_type_max = n_type[type]; - type_max = type; - } - - if (trace > 0) { - const uint16_t sid = w.idx; - LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); - } - } - - switch (type_max) { - case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; - case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; - case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break; - case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; - case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; - case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; - case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break; - case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break; - case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break; - case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break; - case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; - case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; - case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; - case GGML_TYPE_TQ1_0: ftype = LLAMA_FTYPE_MOSTLY_TQ1_0; break; - case GGML_TYPE_TQ2_0: ftype = LLAMA_FTYPE_MOSTLY_TQ2_0; break; - case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; - case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; - case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break; - case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; - case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; - case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; - case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; - case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; - case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; - 
default: - { - LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); - ftype = LLAMA_FTYPE_ALL_F32; - } break; - } - - // this is a way to mark that we have "guessed" the file type - ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); - - { - const int kid = gguf_find_key(meta.get(), "general.file_type"); // TODO: use LLM_KV - if (kid >= 0) { - ftype = (llama_ftype) gguf_get_val_u32(meta.get(), kid); - } - } - - LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); - - for (int i = 0; i < n_kv; i++) { - const char * name = gguf_get_key(meta.get(), i); - const enum gguf_type type = gguf_get_kv_type(meta.get(), i); - const std::string type_name = - type == GGUF_TYPE_ARRAY - ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i)) - : gguf_type_name(type); - - std::string value = gguf_kv_to_str(meta.get(), i); - const size_t MAX_VALUE_LEN = 40; - if (value.size() > MAX_VALUE_LEN) { - value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); - } - replace_all(value, "\n", "\\n"); - - LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); - } - - // print type counts - for (auto & kv : n_type) { - if (kv.second == 0) { - continue; - } - - LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); - } - } - - if (!llama_mmap::SUPPORTED) { - LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); - use_mmap = false; - } - - this->use_mmap = use_mmap; - this->check_tensors = check_tensors; - } - - template - typename std::enable_if::value, bool>::type - get_arr_n(const std::string & key, T & result, const bool required = true) { - const int kid = gguf_find_key(meta.get(), key.c_str()); - - if (kid < 0) { - if (required) { - throw std::runtime_error(format("key not found in model: %s", key.c_str())); - } - return false; - } - - struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta.get(), kid); - - - result = arr_info.length; - return true; - } - - template - typename std::enable_if::value, bool>::type - get_arr_n(const enum llm_kv kid, T & result, const bool required = true) { - return get_arr_n(llm_kv(kid), result, required); - } - - template - bool get_arr(const std::string & key, std::vector & result, const bool required = true) { - const int kid = gguf_find_key(meta.get(), key.c_str()); - - if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) { - if (required) { - throw std::runtime_error(format("array key not found in model: %s", key.c_str())); - } - return false; - } - - struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta.get(), kid); - - switch (arr_info.gt) { - case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; - case GGUF_TYPE_INT32: GGML_ASSERT( - (std::is_same::value) || - (std::is_same::value)); break; - default: - throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str())); - } - - result.resize(arr_info.length); - result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length); - - return true; - } - - template - bool get_arr(const std::string & key, std::array & result, const bool required = true) { - const int kid = gguf_find_key(meta.get(), key.c_str()); - - if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) { - if (required) { - throw std::runtime_error(format("array key not found in model: %s", 
key.c_str())); - } - return false; - } - - struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta.get(), kid); - - switch (arr_info.gt) { - case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; - case GGUF_TYPE_INT32: GGML_ASSERT( - (std::is_same::value) || - (std::is_same::value)); break; - default: - throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str())); - } - - if (arr_info.length > N_MAX) { - throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX)); - } - - std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin()); - - return true; - } - - template - bool get_arr(const enum llm_kv kid, T & result, const bool required = true) { - return get_arr(llm_kv(kid), result, required); - } - - template - bool get_key(const std::string & key, T & result, const bool required = true) { - auto it = kv_overrides.find(key); - - const struct llama_model_kv_override * override = - it != kv_overrides.end() ? &it->second : nullptr; - - const bool found = GGUFMeta::GKV::set(meta.get(), key, result, override); - - if (required && !found) { - throw std::runtime_error(format("key not found in model: %s", key.c_str())); - } - - return found; - } - - template - bool get_key(const enum llm_kv kid, T & result, const bool required = true) { - return get_key(llm_kv(kid), result, required); - } - - // get array of n <= N_MAX elements, or a single element repeated n times - template - bool get_key_or_arr(const std::string & key, std::array & result, uint32_t n, const bool required = true) { - const int kid = gguf_find_key(meta.get(), key.c_str()); - - if (kid < 0) { - if (required) { - throw std::runtime_error(format("key not found in model: %s", key.c_str())); - } - return false; - } - - if (n > N_MAX) { - throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str())); - } - - if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) { - struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(meta.get(), kid); - - if (n != arr_info.length) { - throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length)); - } - - return get_arr(key, result, required); - } else { - T value; - - bool ok = get_key(key, value, required); - if (!ok) { - return false; - } - - for (uint32_t i = 0; i < n; i++) { - result[i] = value; - } - - return true; - } - } - - template - bool get_key_or_arr(const enum llm_kv kid, T & result, uint32_t n, const bool required = true) { - return get_key_or_arr(llm_kv(kid), result, n, required); - } - - std::string get_arch_name() const { - return arch_name; - } - - enum llm_arch get_arch() const { - return llm_kv.arch; - } - - const llama_tensor_weight * get_weight(const char * name) const { - auto pos = weights_map.find(name); - if (pos != weights_map.end()) { - return &pos->second; - } - - return nullptr; - } - - const llama_tensor_weight & require_weight(const char * name) const { - const llama_tensor_weight * weight = get_weight(name); - if (!weight) { - throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name)); - } - return *weight; - } - - struct ggml_tensor * get_tensor_meta(const char * name) const { - const auto * weight = get_weight(name); - if (!weight) { - return nullptr; - } - return weight->tensor; - } - - struct ggml_tensor * require_tensor_meta(const std::string & name) const { - 
struct ggml_tensor * tensor = get_tensor_meta(name.c_str()); - if (!tensor) { - throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); - } - return tensor; - } - - const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector & ne, bool required) const { - const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); - - if (cur == NULL) { - if (!required) { - return NULL; - } - throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); - } - - { - bool is_ok = true; - for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { - if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) { - is_ok = false; - break; - } - } - if (!is_ok) { - throw std::runtime_error( - format("%s: tensor '%s' has wrong shape; expected %s, got %s", - __func__, name.c_str(), - llama_format_tensor_shape(ne).c_str(), - llama_format_tensor_shape(cur).c_str())); - } - } - - return cur; - } - - static const int TENSOR_NOT_REQUIRED = 1; - static const int TENSOR_DUPLICATED = 2; - - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list & ne, int flags = 0) { - const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED)); - - if (cur == NULL) { - return NULL; - } - - bool duplicated = flags & TENSOR_DUPLICATED; - - struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); - ggml_set_name(tensor, ggml_get_name(cur)); - - if (duplicated) { - size_data += ggml_nbytes(cur); - } else { - n_created++; - } - - return tensor; - - } - - struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list & ne, size_t offset, bool required = true) { - const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); - - if (cur == NULL) { - return NULL; - } - - if (cur->type != base->type) { - throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type))); - } - - std::array dims; - for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { - dims[i] = i < ne.size() ? ne.begin()[i] : 1; - } - - struct ggml_tensor * tensor = ggml_view_4d(ctx, base, - dims[0], dims[1], dims[2], dims[3], - cur->nb[1], cur->nb[2], cur->nb[3], - offset); - - ggml_set_name(tensor, name.c_str()); - - n_created++; - - return tensor; - } - - void done_getting_tensors() const { - if (n_created != n_tensors) { - throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); - } - } - - void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) { - if (use_mmap) { - mappings.reserve(files.size()); - mmaps_used.reserve(files.size()); - for (const auto & file : files) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); - auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); - std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? 
-1 : 0, is_numa_fn())); - mmaps_used.emplace_back(mapping->size(), 0); - if (mlock_mmaps) { - std::unique_ptr mlock_mmap(new llama_mlock()); - mlock_mmap->init(mapping->addr()); - mlock_mmaps->emplace_back(std::move(mlock_mmap)); - } - mappings.emplace_back(std::move(mapping)); - } - } - - // compute the total size of all tensors for progress reporting - for (const auto & it : weights_map) { - size_data += ggml_nbytes(it.second.tensor); - } - } - - void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { - GGML_ASSERT(!mappings.empty()); - const auto & mapping = mappings.at(idx); - - *first = mapping->size(); - *last = 0; - *addr = mapping->addr(); - for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { - const auto * weight = get_weight(ggml_get_name(tensor)); - if (!weight || weight->idx != idx) { - continue; - } - *first = std::min(*first, weight->offs); - *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); - } - } - - // for backwards compatibility, does not support ggml-backend - void load_data_for(struct ggml_tensor * cur) const { - const auto & w = require_weight(ggml_get_name(cur)); - - if (use_mmap) { - const auto & mapping = mappings.at(w.idx); - if (cur->data == nullptr) { - cur->data = (uint8_t *)mapping->addr() + w.offs; - } else { - memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur)); - } - } else { - GGML_ASSERT(cur->data != nullptr); - GGML_ASSERT(w.idx < files.size()); - const auto & file = files.at(w.idx); - file->seek(w.offs, SEEK_SET); - file->read_raw(cur->data, ggml_nbytes(cur)); - } - - if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) { - throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); - } - } - - size_t size_done = 0; - size_t size_data = 0; - std::vector> mmaps_used; - - // Returns false if cancelled by progress_callback - bool load_all_data( - struct ggml_context * ctx, - llama_buf_map & bufs, - llama_mlocks * lmlocks, - llama_progress_callback progress_callback, - void * progress_callback_user_data) { - GGML_ASSERT(size_data != 0 && "call init_mappings() first"); - - std::vector> read_buf; - std::vector>> validation_result; - - // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. - // NVMe raid configurations might require more / larger buffers. - constexpr size_t n_buffers = 4; - constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB - - std::vector host_buffers; - std::vector events; - std::vector host_ptrs; - size_t buffer_idx = 0; // buffer to use for async loads - ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t { - if (use_mmap || check_tensors) { - return nullptr; - } - // When not using mmaped io use async uploads from pinned memory to GPU memory. - // First determine if the backend supports the necessary features for async uploads. - auto * buf = bufs.count(0) ? 
bufs.at(0) : nullptr; - if (!buf) { - LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func); - return nullptr; - } - - auto * buft = ggml_backend_buffer_get_type(buf); - auto * dev = ggml_backend_buft_get_device(buft); - if (!dev) { - LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func, - ggml_backend_buft_name(buft)); - return nullptr; - } - - if (buft != ggml_backend_dev_buffer_type(dev)) { - LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func, - ggml_backend_buft_name(buft), ggml_backend_dev_name(dev)); - return nullptr; - } - - ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) { - LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func, - ggml_backend_dev_name(dev)); - return nullptr; - } - - auto * host_buft = ggml_backend_dev_host_buffer_type(dev); - if (!host_buft) { - LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func, - ggml_backend_dev_name(dev)); - return nullptr; - } - - // If the backend is supported, create pinned memory buffers and events for synchronisation. - for (size_t idx = 0; idx < n_buffers; ++idx) { - auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); - if (!buf) { - LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, - ggml_backend_dev_name(dev)); - return nullptr; - } - - host_buffers.emplace_back(buf); - host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); - - auto * event = ggml_backend_event_new(dev); - if (!event) { - LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func, - ggml_backend_dev_name(dev)); - return nullptr; - } - - events.emplace_back(event); - } - - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (!backend) { - LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func, - ggml_backend_dev_name(dev)); - return nullptr; - } - - return backend; - }(__func__); - - if (upload_backend) { - LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__, - ggml_backend_dev_name(ggml_backend_get_device(upload_backend)), - ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))), - ggml_backend_name(upload_backend)); - } - - for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { - const auto * weight = get_weight(ggml_get_name(cur)); - if (weight == nullptr) { - // this can happen with split experts models - continue; - } - - if (progress_callback) { - if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { - return false; - } - } - - size_t n_size = ggml_nbytes(cur); - - if (use_mmap) { - const auto & mapping = mappings.at(weight->idx); - ggml_backend_buffer_t buf_mmap = nullptr; - if (bufs.count(weight->idx)) { - buf_mmap = bufs.at(weight->idx); - } - uint8_t * data = (uint8_t *) mapping->addr() + weight->offs; - - if (check_tensors) { - validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { - return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); - })); - } - - GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated - if (buf_mmap && cur->data == nullptr) { - ggml_backend_tensor_alloc(buf_mmap, cur, data); - if (lmlocks) { - const 
auto & lmlock = lmlocks->at(weight->idx); - lmlock->grow_to(weight->offs + n_size); - } - - auto & mmap_used = mmaps_used[weight->idx]; - mmap_used.first = std::min(mmap_used.first, weight->offs); - mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); - } else { - ggml_backend_tensor_set(cur, data, 0, n_size); - } - } else { - const auto & file = files.at(weight->idx); - if (ggml_backend_buffer_is_host(cur->buffer)) { - file->seek(weight->offs, SEEK_SET); - file->read_raw(cur->data, n_size); - if (check_tensors) { - validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { - return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); - })); - } - } else { - // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. - if (upload_backend) { - file->seek(weight->offs, SEEK_SET); - - size_t bytes_read = 0; - - while (bytes_read < n_size) { - size_t read_iteration = std::min(buffer_size, n_size - bytes_read); - - ggml_backend_event_synchronize(events[buffer_idx]); - file->read_raw(host_ptrs[buffer_idx], read_iteration); - ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); - ggml_backend_event_record(events[buffer_idx], upload_backend); - - bytes_read += read_iteration; - ++buffer_idx; - buffer_idx %= n_buffers; - } - } else { - read_buf.resize(n_size); - file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); - ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); - if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { - throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); - } - } - } - } - - size_done += n_size; - } - - // free temporary resources used for async uploads - for (auto * event : events) { - ggml_backend_event_synchronize(event); - ggml_backend_event_free(event); - } - for (auto * buf : host_buffers) { - ggml_backend_buffer_free(buf); - } - ggml_backend_free(upload_backend); - - // check validation results - bool validation_failed = false; - for (auto & future : validation_result) { - auto result = future.get(); - if (!result.second) { - LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first)); - validation_failed = true; - } - } - if (validation_failed) { - throw std::runtime_error("found tensors with invalid data"); - } - - // check if this is the last call and do final cleanup - if (size_done >= size_data) { - // unmap offloaded tensors and metadata - if (use_mmap) { - for (uint32_t idx = 0; idx < mappings.size(); idx++) { - const auto & mmap_used = mmaps_used.at(idx); - auto & mapping = mappings.at(idx); - mapping->unmap_fragment(0, mmap_used.first); - if (mmap_used.second != 0) { - mapping->unmap_fragment(mmap_used.second, mapping->size()); - } - } - } - if (progress_callback) { - // Even though the model is done loading, we still honor - // cancellation since we need to free allocations. 
- return progress_callback(1.0f, progress_callback_user_data); - } - } - - return true; - } -}; - -template<> -bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) { - uint32_t tmp; - const bool found = get_key(kid, tmp, required); - if (found) { - result = (enum llama_pooling_type) tmp; - } else { - result = LLAMA_POOLING_TYPE_UNSPECIFIED; - } - return found; -} - - static void llm_load_stats(llama_model_loader & ml, llama_model & model) { model.n_elements = ml.n_elements; model.n_bytes = ml.n_bytes; @@ -1278,9 +131,7 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) { } } -static void llm_load_hparams( - llama_model_loader & ml, - llama_model & model) { +static void llm_load_hparams(llama_model_loader & ml, llama_model & model) { auto & hparams = model.hparams; const gguf_context * ctx = ml.meta.get(); @@ -1616,9 +467,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_QWEN2VL: { - std::array section_dims; - ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true); - std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections)); + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); } // fall through case LLM_ARCH_QWEN2: @@ -2116,9 +965,7 @@ static void llm_load_hparams( hparams.rope_type = llama_rope_type(&model); } -static void llm_load_vocab( - llama_model_loader & ml, - llama_model & model) { +static void llm_load_vocab(llama_model_loader & ml, llama_model & model) { auto & vocab = model.vocab; struct gguf_context * ctx = ml.meta.get();
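The hunks above move llama_model_loader out of llama.cpp essentially unchanged; the sketches below are simplified illustrations of the mechanisms it relies on, not code from the patch, and all helper names in them are illustrative. First, the constructor's metadata-only load: gguf_init_from_file is called with no_alloc set so only tensor metadata is read, and the resulting ggml context is walked to build the name-keyed weights index, rejecting duplicate tensor names. A minimal standalone version of that pattern (assuming the gguf API is visible through ggml.h, as in this tree):

#include "ggml.h"

#include <map>
#include <stdexcept>
#include <string>

// Open a GGUF file for metadata only and index tensor sizes by name.
static std::map<std::string, size_t> index_gguf_tensors(const char * fname) {
    ggml_context * ctx_meta = nullptr;

    gguf_init_params params = {
        /*.no_alloc = */ true,      // do not allocate or read tensor data
        /*.ctx      = */ &ctx_meta,
    };

    gguf_context * ctx_gguf = gguf_init_from_file(fname, params);
    if (!ctx_gguf) {
        throw std::runtime_error(std::string("failed to load model from ") + fname);
    }

    std::map<std::string, size_t> sizes;
    for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
        if (!sizes.emplace(ggml_get_name(cur), ggml_nbytes(cur)).second) {
            throw std::runtime_error("invalid model: duplicated tensor name");
        }
    }

    gguf_free(ctx_gguf);
    ggml_free(ctx_meta);

    return sizes;
}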
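Split models: the loader derives the sibling shard paths from the first file's name via the public llama_split_prefix/llama_split_path helpers, then repeats the same metadata walk per shard, recording the shard index in each llama_tensor_weight. A sketch of just the path derivation (the 1024-byte buffers are an arbitrary choice here; the loader sizes them with llama_path_max()):

#include "llama.h"

#include <string>
#include <vector>

// Given the first split's filename and the split count, list all shard paths.
static std::vector<std::string> list_shards(const std::string & first_split, int n_split) {
    std::vector<char> prefix(1024, 0);
    if (!llama_split_prefix(prefix.data(), prefix.size(), first_split.c_str(), /*split_no=*/0, n_split)) {
        return {first_split}; // not a recognised split filename
    }

    std::vector<std::string> paths;
    std::vector<char> path(1024, 0);
    for (int idx = 0; idx < n_split; ++idx) {
        llama_split_path(path.data(), path.size(), prefix.data(), idx, n_split);
        paths.emplace_back(path.data());
    }
    return paths;
}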
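File type guessing: when general.file_type is absent, the loader tallies the ggml_type of every tensor, maps the most frequent one to an llama_ftype, and sets the LLAMA_FTYPE_GUESSED bit so downstream code can tell the value was inferred rather than read from metadata. A condensed sketch (only a few cases of the full switch are shown):

#include "ggml.h"
#include "llama.h"

#include <cstdint>
#include <map>

// n_type maps each ggml_type to the number of tensors stored with that type.
static llama_ftype guess_ftype(const std::map<ggml_type, uint32_t> & n_type) {
    ggml_type type_max = GGML_TYPE_F32;
    uint32_t  n_max    = 0;
    for (const auto & kv : n_type) {
        if (kv.second > n_max) { n_max = kv.second; type_max = kv.first; }
    }

    llama_ftype ftype;
    switch (type_max) {
        case GGML_TYPE_F16:  ftype = LLAMA_FTYPE_MOSTLY_F16;    break;
        case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0;   break;
        case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
        // ... remaining cases as in the switch above ...
        default:             ftype = LLAMA_FTYPE_ALL_F32;       break;
    }
    // mark the result as guessed, exactly as the loader does
    return (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
}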
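get_key_or_arr(), used above for hparams such as the QWEN2VL rope_sections (now read straight into the std::array field per this patch), accepts either an array of exactly n elements or a scalar that is broadcast to n slots, and never more than N_MAX entries. A simplified sketch of that contract, with the GGUF access replaced by an already-decoded vector plus an is_array flag:

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

// value holds either the decoded array elements (is_array == true) or a single scalar.
template <typename T, size_t N_MAX>
void key_or_arr(const std::vector<T> & value, bool is_array, std::array<T, N_MAX> & out, uint32_t n) {
    if (n > N_MAX) {
        throw std::runtime_error("n > N_MAX");
    }
    if (is_array) {
        if (value.size() != n) {
            throw std::runtime_error("wrong array length");
        }
        std::copy(value.begin(), value.end(), out.begin());
    } else {
        for (uint32_t i = 0; i < n; ++i) {
            out[i] = value.at(0); // scalar: broadcast to all n slots
        }
    }
}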
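check_tensor_dims() accepts an expected shape with fewer than GGML_MAX_DIMS entries; every unlisted dimension of the on-disk tensor must then be exactly 1 for the tensor to be accepted. A sketch of that comparison:

#include "ggml.h"

#include <cstdint>
#include <vector>

// Compare a tensor's full 4-D shape against a possibly shorter expected shape.
static bool shape_matches(const ggml_tensor * cur, const std::vector<int64_t> & ne) {
    for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
        const int64_t expected = i < ne.size() ? ne[i] : 1; // unlisted dims must be 1
        if (cur->ne[i] != expected) {
            return false;
        }
    }
    return true;
}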
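get_mapping_range() reduces each memory-mapped shard to the byte range actually covered by its tensors, which is what later allows unmap_fragment() to release the unused head and tail of the mapping. A sketch of the range computation over hypothetical (offset, size) records standing in for llama_tensor_weight:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

struct weight_span { size_t offs; size_t nbytes; }; // illustrative stand-in for llama_tensor_weight

static std::pair<size_t, size_t> used_range(const std::vector<weight_span> & weights, size_t mapping_size) {
    size_t first = mapping_size; // start at the far end and shrink towards the first tensor
    size_t last  = 0;
    for (const auto & w : weights) {
        first = std::min(first, w.offs);
        last  = std::max(last,  w.offs + w.nbytes);
    }
    return {first, last}; // bytes outside [first, last) can be unmapped once loading is done
}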
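Async uploads are only enabled when mmap and tensor validation are off and the device behind the model buffer advertises the needed capabilities. A sketch of that probe, mirroring the checks in the lambda above:

#include "ggml-backend.h"

// Decide whether a model buffer's device can serve pinned-memory async uploads.
static bool supports_async_uploads(ggml_backend_buffer_t buf) {
    ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
    ggml_backend_dev_t         dev  = ggml_backend_buft_get_device(buft);
    if (!dev || buft != ggml_backend_dev_buffer_type(dev)) {
        return false; // no device, or not the device's default buffer type
    }

    ggml_backend_dev_props props;
    ggml_backend_dev_get_props(dev, &props);

    // need async ops, pinned host buffers and events, plus a host buffer type to allocate from
    return props.caps.async && props.caps.host_buffer && props.caps.events &&
           ggml_backend_dev_host_buffer_type(dev) != nullptr;
}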
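When async uploads are active, each tensor is streamed through a small ring of pinned staging buffers: wait on the buffer's event so the previous copy has drained, refill it from disk, queue ggml_backend_tensor_set_async at the right offset, and record a new event. A sketch of that loop, using plain stdio instead of llama_file and omitting most error handling:

#include "ggml-backend.h"

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <stdexcept>
#include <vector>

static void upload_chunked(ggml_backend_t backend, ggml_tensor * cur, FILE * f, size_t n_size,
                           const std::vector<void *> & host_ptrs,
                           const std::vector<ggml_backend_event_t> & events,
                           size_t buffer_size) {
    size_t done = 0;
    size_t idx  = 0; // which staging buffer/event pair to reuse next
    while (done < n_size) {
        const size_t chunk = std::min(buffer_size, n_size - done);

        ggml_backend_event_synchronize(events[idx]);       // previous upload from this buffer finished?
        if (fread(host_ptrs[idx], 1, chunk, f) != chunk) { // refill the pinned staging buffer from disk
            throw std::runtime_error("short read");
        }
        ggml_backend_tensor_set_async(backend, cur, host_ptrs[idx], done, chunk); // copy into tensor at offset `done`
        ggml_backend_event_record(events[idx], backend);   // remember when this copy completes

        done += chunk;
        idx   = (idx + 1) % events.size();                 // round-robin over the staging buffers
    }
}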