diff --git a/otherarch/ggml_v3.c b/otherarch/ggml_v3.c
index 826f285b0475db..838d7824ca1f4c 100644
--- a/otherarch/ggml_v3.c
+++ b/otherarch/ggml_v3.c
@@ -3161,3 +3161,7 @@
+size_t ggml_v3_row_size(enum ggml_v3_type type, int64_t ne) {
+    return ggml_v3_type_size(type)*ne/ggml_v3_blck_size(type);
+}
+
 double ggml_v3_type_sizef(enum ggml_v3_type type) {
     return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
diff --git a/otherarch/ggml_v3.h b/otherarch/ggml_v3.h
index cd8ed48b138eeb..3dca51f486932d 100644
--- a/otherarch/ggml_v3.h
+++ b/otherarch/ggml_v3.h
@@ -666,3 +666,7 @@ extern "C" {
-    GGML_V3_API double ggml_v3_type_sizef(enum ggml_v3_type type); // ggml_v3_type_size()/ggml_v3_blck_size() as float
+    GGML_V3_API size_t ggml_v3_row_size (enum ggml_v3_type type, int64_t ne); // size in bytes for all elements in a row
+
+    GGML_V3_DEPRECATED(
+    GGML_V3_API double ggml_v3_type_sizef(enum ggml_v3_type type), // ggml_v3_type_size()/ggml_v3_blck_size() as float
+    "use ggml_v3_row_size() instead");
 
     GGML_V3_API const char * ggml_v3_type_name(enum ggml_v3_type type);
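Note on the API change above: ggml_v3_row_size() reports whole rows in exact integer bytes, while the deprecated ggml_v3_type_sizef() reported fractional bytes per element as a double and left callers to multiply and truncate. A minimal standalone sketch of the difference; the 18-byte/32-element block shape is an assumption (roughly a Q4_0-style quantized type), not taken from the patch:

    #include <cstdint>
    #include <cstdio>

    // Stand-ins for the two APIs, using an assumed quantized type with
    // 32-element blocks of 18 bytes each (Q4_0-like).
    static const int64_t kBlckSize = 32;
    static const int64_t kTypeSize = 18;

    static size_t row_size(int64_t ne) {
        // new style: integer math, exact byte count for `ne` elements
        return (size_t)(kTypeSize * ne / kBlckSize);
    }

    static double type_sizef() {
        // deprecated style: fractional bytes per element
        return (double)kTypeSize / (double)kBlckSize;
    }

    int main() {
        const int64_t ne = 4096;
        std::printf("row_size:   %zu bytes\n", row_size(ne));       // 2304
        std::printf("type_sizef: %.1f bytes\n", type_sizef() * ne); // 2304.0
        return 0;
    }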
diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp
index e52d9f213f6188..6b52f52a5b6555 100644
--- a/otherarch/gpt2_v3.cpp
+++ b/otherarch/gpt2_v3.cpp
@@ -136,33 +136,33 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head
         const int kv_dim = kv_heads * head_dim;
 
-        ctx_size += n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // ln_f_b
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd); // ln_f_g
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd); // ln_f_b
 
-        ctx_size += n_vocab*n_embd*ggml_v3_type_sizef(wtype);          // wte
-        ctx_size += n_ctx*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // wpe
-        ctx_size += n_vocab*n_embd*ggml_v3_type_sizef(wtype);          // lm_head
+        ctx_size += ggml_v3_row_size(wtype, n_vocab*n_embd);          // wte
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F32, n_ctx*n_embd); // wpe
+        ctx_size += ggml_v3_row_size(wtype, n_vocab*n_embd);          // lm_head
 
-        ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // ln_1_g
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // ln_1_b
 
-        ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // ln_2_g
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // ln_2_b
 
-        ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_attn_w // TODO:
-        ctx_size += n_layer*(       (n_embd + 2*kv_dim)*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, (n_embd + 2*kv_dim)*n_embd)); // c_attn_attn_w // TODO:
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd + 2*kv_dim)); // c_attn_attn_b
 
-        ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_attn_proj_b
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // c_attn_proj_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // c_mlp_proj_b
 
-        ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F16); // memory_k
-        ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F16); // memory_v
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F16, std::max(origmaxctx,n_ctx)*n_layer*n_embd); // memory_k
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F16, std::max(origmaxctx,n_ctx)*n_layer*n_embd); // memory_v
 
         ctx_size += (6 + 12*n_layer)*1024; // object overhead
diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp
index 653a4055faff11..5b235fd73bbdcc 100644
--- a/otherarch/gptj_v3.cpp
+++ b/otherarch/gptj_v3.cpp
@@ -123,31 +123,31 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
         const int n_ctx = hparams.n_ctx;
         const int n_vocab = hparams.n_vocab;
 
-        ctx_size += n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // ln_f_b
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd); // ln_f_g
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd); // ln_f_b
 
-        ctx_size += n_embd*n_vocab*ggml_v3_type_sizef(wtype); // wte
+        ctx_size += ggml_v3_row_size(wtype, n_embd*n_vocab); // wte
 
-        ctx_size += n_embd*n_vocab*ggml_v3_type_sizef(wtype); // lmh_g
-        ctx_size += n_vocab*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // lmh_b
+        ctx_size += ggml_v3_row_size(wtype, n_embd*n_vocab); // lmh_g
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F32, n_vocab); // lmh_b
 
-        ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // ln_1_g
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // ln_1_b
 
-        ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_q_proj_w
-        ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_k_proj_w
-        ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_v_proj_w
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, n_embd*n_embd)); // c_attn_q_proj_w
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, n_embd*n_embd)); // c_attn_k_proj_w
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, n_embd*n_embd)); // c_attn_v_proj_w
 
-        ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_proj_w
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // c_mlp_proj_b
 
-        ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_v3_type_sizef(memory_type); // memory_k
-        ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_v3_type_sizef(memory_type); // memory_v
+        ctx_size += ggml_v3_row_size(memory_type, std::max(origmaxctx,n_ctx)*n_layer*n_embd); // memory_k
+        ctx_size += ggml_v3_row_size(memory_type, std::max(origmaxctx,n_ctx)*n_layer*n_embd); // memory_v
 
         ctx_size += (5 + 10*n_layer)*512; // object overhead
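Every call site above follows the same mechanical rewrite: count*ggml_v3_type_sizef(type) becomes ggml_v3_row_size(type, count), so the accounting stays in size_t throughout instead of round-tripping through double. A self-contained check of one converted line, using hypothetical GPT-2-small dimensions (n_embd = 768, n_layer = 12) and the same assumed 18-byte/32-element block shape as before:

    #include <cstdint>
    #include <cstdio>

    // Local stand-in for ggml_v3_row_size() with an assumed 18-byte/32-element type.
    static size_t row_size(int64_t ne) { return (size_t)(18 * ne / 32); }

    int main() {
        const int64_t n_embd = 768, n_layer = 12;           // hypothetical
        // mirrors the converted gpt2 line:
        //   ctx_size += n_layer*(ggml_v3_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
        const size_t c_mlp_fc_w = n_layer * row_size(4 * n_embd * n_embd);
        std::printf("c_mlp_fc_w: %zu bytes\n", c_mlp_fc_w); // 15925248
        return 0;
    }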
diff --git a/otherarch/llama-util.h b/otherarch/llama-util.h
index 9470f44c45d7c9..3a81e2e9ad335d 100644
--- a/otherarch/llama-util.h
+++ b/otherarch/llama-util.h
@@ -214,7 +214,7 @@ struct llama_v3_mmap {
     llama_v3_mmap(struct llama_v3_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
         size = file->size;
-        int fd = _fileno(file->fp);
+        int fd = fileno(file->fp);
         int flags = MAP_SHARED;
         // prefetch/readahead impairs performance on NUMA systems
         if (numa) { prefetch = 0; }
#ifdef __linux__
diff --git a/otherarch/llama_v2-util.h b/otherarch/llama_v2-util.h
index 7dc9156e921e82..4161a388e00b9a 100644
--- a/otherarch/llama_v2-util.h
+++ b/otherarch/llama_v2-util.h
@@ -156,7 +156,7 @@ struct llama_v2_mmap {
     llama_v2_mmap(struct llama_v2_file * file, bool prefetch = true) {
         size = file->size;
-        int fd = _fileno(file->fp);
+        int fd = fileno(file->fp);
         int flags = MAP_SHARED;
#ifdef __linux__
         flags |= MAP_POPULATE;
#endif
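The two hunks above settle on the portable fileno() spelling: the surrounding mmap code only compiles on POSIX-style platforms, where the underscore-prefixed _fileno() does not exist. If MSVC (whose CRT deprecates the unprefixed name) ever had to share this path, a small shim would be the usual approach. A hedged sketch, not part of the patch:

    #include <cstdio>

    // Hypothetical helper: pick the CRT-appropriate spelling in one place.
    static int portable_fileno(std::FILE * fp) {
    #if defined(_MSC_VER)
        return _fileno(fp);  // MSVC's CRT deprecates the POSIX name
    #else
        return fileno(fp);   // POSIX name, also available under MinGW
    #endif
    }

    int main() {
        std::printf("stdin fd: %d\n", portable_fileno(stdin)); // usually 0
        return 0;
    }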
diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp
index 361247b18b93c7..e3bf8a627bc51c 100644
--- a/otherarch/mpt_v3.cpp
+++ b/otherarch/mpt_v3.cpp
@@ -126,18 +126,18 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
         const size_t n_layer = hparams.n_layers;
         const size_t n_vocab = hparams.n_vocab;
 
-        ctx_size += n_embd * n_vocab * ggml_v3_type_sizef(wtype); // wte_weight
-        ctx_size += n_embd * ggml_v3_type_sizef(GGML_V3_TYPE_F32); // norm_f_weight
-
-        ctx_size += n_layer * (n_embd * ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_weight
-        ctx_size += n_layer * (3 * n_embd * n_embd * ggml_v3_type_sizef(wtype)); // attn_Wqkv_weight
-        ctx_size += n_layer * (n_embd * n_embd * ggml_v3_type_sizef(wtype)); // attn_out_proj_weight
-        ctx_size += n_layer * (n_embd * ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_2_weight
-        ctx_size += n_layer * (4 * n_embd * n_embd * ggml_v3_type_sizef(wtype)); // mlp_mlp_up_weight
-        ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_v3_type_sizef(wtype)); // mlp_mlp_down_weight
-
-        ctx_size += n_ctx * n_layer * n_embd * ggml_v3_type_sizef(GGML_V3_TYPE_F16); // memory_k
-        ctx_size += n_ctx * n_layer * n_embd * ggml_v3_type_sizef(GGML_V3_TYPE_F16); // memory_v
+        ctx_size += ggml_v3_row_size(wtype, n_embd * n_vocab); // wte_weight
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd); // norm_f_weight
+
+        ctx_size += n_layer * (ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // ln_1_weight
+        ctx_size += n_layer * (ggml_v3_row_size(wtype, 3 * n_embd * n_embd)); // attn_Wqkv_weight
+        ctx_size += n_layer * (ggml_v3_row_size(wtype, n_embd * n_embd)); // attn_out_proj_weight
+        ctx_size += n_layer * (ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // ln_2_weight
+        ctx_size += n_layer * (ggml_v3_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_up_weight
+        ctx_size += n_layer * (ggml_v3_row_size(wtype, n_embd * n_embd * 4)); // mlp_mlp_down_weight
+
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F16, n_ctx * n_layer * n_embd); // memory_k
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F16, n_ctx * n_layer * n_embd); // memory_v
 
         ctx_size += (6 + 6 * n_layer) * 512; // object overhead
diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp
index 3ac7c5fff32d21..d75a285b8d1389 100644
--- a/otherarch/neox_v3.cpp
+++ b/otherarch/neox_v3.cpp
@@ -115,34 +115,34 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
         const size_t n_ctx = hparams.n_ctx;
         const size_t n_vocab = hparams.n_vocab;
 
-        ctx_size += n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // ln_f_b
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd); // ln_f_g
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd); // ln_f_b
 
-        ctx_size += n_embd*n_vocab*ggml_v3_type_sizef(wtype); // wte
+        ctx_size += ggml_v3_row_size(wtype, n_embd*n_vocab); // wte
 
-        ctx_size += n_embd*n_vocab*ggml_v3_type_sizef(wtype); // lmh_g
-        //ctx_size += n_vocab*ggml_v3_type_sizef(GGML_V3_TYPE_F32); // lmh_b
+        ctx_size += ggml_v3_row_size(wtype, n_embd*n_vocab); // lmh_g
+        //ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F32, n_vocab); // lmh_b
 
-        ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // ln_1_g
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // ln_1_b
 
-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, 3*n_embd)); // c_attn_attn_b
 
-        ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_attn_proj_w
-        ctx_size += n_layer*(n_embd*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_attn_proj_b
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd*n_embd)); // c_attn_proj_b
 
-        ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // ln_2_g
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // ln_2_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
 
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_v3_type_sizef(wtype)); // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(ggml_v3_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
+        ctx_size += n_layer*(ggml_v3_row_size(GGML_V3_TYPE_F32, n_embd)); // c_mlp_proj_b
 
-        ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F16); // memory_k
-        ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_v3_type_sizef(GGML_V3_TYPE_F16); // memory_v
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F16, std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd); // memory_k
+        ctx_size += ggml_v3_row_size(GGML_V3_TYPE_F16, std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd); // memory_v
 
         ctx_size += (6 + 16*n_layer)*1024; // object overhead
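One detail worth noting in the neox hunk: n_ctx is size_t there, so origmaxctx needs the (size_t) cast to satisfy std::max, and the KV buffers are sized against the larger of the original training context and the requested one so later context growth cannot overrun them (the mpt loader above, by contrast, sizes against n_ctx alone). A rough feel for the magnitudes, with hypothetical 7B-class dimensions:

    #include <algorithm>
    #include <cstdio>

    int main() {
        const size_t origmaxctx = 2048, n_ctx = 1024; // hypothetical
        const size_t n_layer = 32, n_embd = 4096;     // hypothetical
        // f16 is unquantized (block size 1, 2 bytes), so row_size is just 2*ne
        const size_t ne = std::max(origmaxctx, n_ctx) * n_layer * n_embd;
        std::printf("memory_k = memory_v = %zu MiB each\n", (ne * 2) >> 20); // 512
        return 0;
    }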
diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp
index 48b25ba77804d4..0a3d6e7344e766 100644
--- a/otherarch/rwkv_v3.cpp
+++ b/otherarch/rwkv_v3.cpp
@@ -1378,7 +1378,7 @@ bool rwkv_instance_from_file(const char * file_path, struct rwkv_instance & inst
     RWKV_ASSERT_NULL_MSG(RWKV_ERROR_FILE | RWKV_ERROR_FILE_OPEN, file.file, "Failed to open file %s", file_path);
 
     // Be very careful when changing this code. It must support files larger than 2 GB by using 64-bit functions to get the file length.
-    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_FILE | RWKV_ERROR_FILE_STAT, fstat(_fileno(file.file), &file_stat) == 0, "Failed to stat file %s", file_path);
+    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_FILE | RWKV_ERROR_FILE_STAT, fstat(fileno(file.file), &file_stat) == 0, "Failed to stat file %s", file_path);
     RWKV_ASSERT_NULL_MSG(RWKV_ERROR_FILE, rwkv_fread_file_header(file.file, model.header), "Invalid file header");
 
     struct rwkv_tensor_header tensor_header;
@@ -1799,7 +1799,7 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const
     RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE | RWKV_ERROR_FILE_OPEN, in_file.file, "Failed to open %s for reading", in_path);
 
    // Be very careful when changing this code. It must support files larger than 2 GB by using 64-bit functions to get the file length.
-    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE | RWKV_ERROR_FILE_STAT, fstat(_fileno(in_file.file), &in_stat) == 0, "failed to stat file %s", in_path);
+    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE | RWKV_ERROR_FILE_STAT, fstat(fileno(in_file.file), &in_stat) == 0, "failed to stat file %s", in_path);
 
     struct rwkv_file out_file(fopen(out_path, "wb"));
     RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE | RWKV_ERROR_FILE_OPEN, out_file.file, "Failed to open %s for writing", out_path);
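On the rwkv hunks: the comments insist on 64-bit file-length handling because a 32-bit off_t/st_size silently truncates sizes above 2 GB, and model files routinely exceed that. fstat() with the default struct stat is only 64-bit where the platform makes it so. A hedged sketch of one portable way to honor that comment (hypothetical helper and file name, not from the patch):

    #include <cstdint>
    #include <cstdio>
    #include <sys/stat.h>

    // Hypothetical helper: fetch a file's length as a full 64-bit value.
    static bool file_size_u64(std::FILE * fp, uint64_t & out) {
    #if defined(_WIN32)
        struct _stat64 st; // explicit 64-bit variant in the Windows CRT
        if (_fstat64(_fileno(fp), &st) != 0) return false;
    #else
        struct stat st;    // build with -D_FILE_OFFSET_BITS=64 on 32-bit targets
        if (fstat(fileno(fp), &st) != 0) return false;
    #endif
        out = (uint64_t) st.st_size;
        return true;
    }

    int main() {
        uint64_t size = 0;
        if (std::FILE * fp = std::fopen("model.bin", "rb")) { // hypothetical path
            if (file_size_u64(fp, size)) std::printf("%llu bytes\n", (unsigned long long) size);
            std::fclose(fp);
        }
        return 0;
    }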