From 883f0bc2d3255f42ee81bc11b60ba9f0781b05d1 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 2 Oct 2023 16:07:22 +0200 Subject: [PATCH 01/23] ggml backends interface v1 --- examples/CMakeLists.txt | 6 +- examples/gpt-2/CMakeLists.txt | 10 + examples/gpt-2/main.cpp | 155 +++++-- include/ggml/ggml-alloc.h | 6 + include/ggml/ggml-backend.h | 151 +++++++ include/ggml/ggml.h | 15 +- src/CMakeLists.txt | 5 + src/ggml-alloc.c | 150 +++---- src/ggml-backend.c | 506 ++++++++++++++++++++++ src/ggml-cuda.cu | 507 ++++++++++++++++++---- src/ggml-cuda.h | 5 + src/ggml.c | 790 +++++++++------------------------- tests/CMakeLists.txt | 20 +- 13 files changed, 1489 insertions(+), 837 deletions(-) create mode 100644 include/ggml/ggml-backend.h create mode 100644 src/ggml-backend.c diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e3404fb8b..c0201c131 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -19,11 +19,11 @@ target_link_libraries(common-ggml PRIVATE ggml) target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) add_subdirectory(gpt-2) -add_subdirectory(gpt-j) +#add_subdirectory(gpt-j) add_subdirectory(whisper) add_subdirectory(mnist) -add_subdirectory(gpt-neox) -add_subdirectory(dolly-v2) +#add_subdirectory(gpt-neox) +#add_subdirectory(dolly-v2) add_subdirectory(replit) add_subdirectory(mpt) add_subdirectory(starcoder) diff --git a/examples/gpt-2/CMakeLists.txt b/examples/gpt-2/CMakeLists.txt index 1d9bcdd8a..2307a7dd9 100644 --- a/examples/gpt-2/CMakeLists.txt +++ b/examples/gpt-2/CMakeLists.txt @@ -11,3 +11,13 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) set(TEST_TARGET gpt-2-quantize) add_executable(${TEST_TARGET} quantize.cpp) target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +# +# For GPU offloading + +if (GGML_CUBLAS) + add_compile_definitions(GGML_USE_CUBLAS) +endif() +if (GGML_CLBLAST) + add_compile_definitions(GGML_USE_CLBLAST) +endif() diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 81859ca5c..184eb8e9a 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -1,5 +1,10 @@ #include "ggml/ggml.h" #include "ggml/ggml-alloc.h" +#include "ggml/ggml-backend.h" + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif #include "common.h" #include "common-ggml.h" @@ -70,11 +75,14 @@ struct gpt2_model { // struct ggml_context * ctx; + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer_w; + ggml_backend_buffer_t buffer_kv; std::map tensors; }; // load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) { +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, int n_gpu_layers) { printf("%s: loading model from '%s'\n", __func__, fname.c_str()); auto fin = std::ifstream(fname, std::ios::binary); @@ -155,7 +163,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & auto & ctx = model.ctx; - size_t ctx_size = 0; + size_t buffer_size = 0; { const auto & hparams = model.hparams; @@ -165,46 +173,44 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += 
n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + buffer_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + buffer_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + buffer_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + buffer_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + buffer_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size += (6 + 12*n_layer)*512; // object overhead + buffer_size += (6 + 12*n_layer)*128; // alignment overhead - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: backend buffer size = %6.2f MB\n", __func__, buffer_size/(1024.0*1024.0)); } // create the ggml context { + size_t n_tensors = 2 + 6 + 12*model.hparams.n_layer; struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, + /*.mem_size =*/ ggml_tensor_overhead() * n_tensors, /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, + /*.no_alloc =*/ true, }; model.ctx = ggml_init(params); @@ -214,6 +220,31 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & } } + // initialize the backend +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(); + if (!model.backend) { + fprintf(stderr, "%s: 
ggml_backend_cuda_init() failed\n", __func__); + } + } +#endif + + if (!model.backend) { + // fallback to CPU backend + fprintf(stderr, "%s: using CPU backend\n", __func__); + model.backend = ggml_backend_cpu_init(); + } + + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cpu_init() failed\n", __func__); + return false; + } + + // allocate weights buffer + model.buffer_w = ggml_backend_alloc_buffer(model.backend, buffer_size); + // prepare memory for the weights { const auto & hparams = model.hparams; @@ -299,14 +330,25 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + + // allocate buffer and tensors + model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256); + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); + ggml_allocr_alloc(alloc, model.memory_k); + ggml_allocr_alloc(alloc, model.memory_v); + ggml_allocr_free(alloc); } // load weights { + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_w); + size_t total_size = 0; bool has_lm_head = false; + std::vector read_buf; + while (true) { int32_t n_dims; int32_t length; @@ -336,6 +378,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & } auto tensor = model.tensors[name]; + ggml_set_name(tensor, name.c_str()); if (ggml_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); return false; @@ -360,11 +403,19 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & return false; } - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + // read into a temporary buffer first, then copy to the tensor + // TODO: read directly into the tensor if the backend is CPU + read_buf.resize(ggml_nbytes(tensor)); + fin.read(read_buf.data(), ggml_nbytes(tensor)); + + ggml_allocr_alloc(alloc, tensor); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); // GPT-2 models share the WTE tensor as the LM head if (name == "model/wte" && has_lm_head == false) { - memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + //ggml_allocr_alloc(alloc, model.lm_head); + //ggml_backend_tensor_copy(tensor, model.lm_head); + model.lm_head = tensor; } if (name == "model/lm_head") { @@ -374,6 +425,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & total_size += ggml_nbytes(tensor); } + ggml_allocr_free(alloc); printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); } @@ -416,21 +468,23 @@ struct ggml_cgraph * gpt2_graph( // avoid writing to tensors if we are only measuring the memory usage if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd)); } struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); ggml_allocr_alloc(allocr, position); if (!ggml_allocr_is_measure(allocr)) { for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i; + int32_t v = n_past + i; + ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v)); } } struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_allocr_alloc(allocr, KQ_scale); if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 
1.0f/sqrtf(float(n_embd)/n_head)); + float s = 1.0f/sqrtf(float(n_embd)/n_head); + ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s)); } // wte + wpe @@ -453,7 +507,8 @@ struct ggml_cgraph * gpt2_graph( ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + //ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + model.layers[il].ln_1_b); } // attn @@ -599,7 +654,8 @@ struct ggml_cgraph * gpt2_graph( ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + //ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + model.layers[il].ln_2_b); } // fully connected @@ -654,7 +710,8 @@ struct ggml_cgraph * gpt2_graph( ggml_mul(ctx0, ggml_repeat(ctx0, model.ln_f_g, inpL), inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); + //ggml_repeat(ctx0, model.ln_f_b, inpL)); + model.ln_f_b); } // inpL = WTE * inpL @@ -703,11 +760,12 @@ bool gpt2_eval( ggml_allocr_alloc_graph(allocr, gf); // run the computation - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - static std::vector work_buffer; - work_buffer.resize(plan.work_size); - plan.work_data = work_buffer.data(); - ggml_graph_compute(gf, &plan); +#ifndef GGML_USE_CUBLAS + // FIXME: the backend may be CPU even if CUDA is enabled + // if (model.backend.id == GGML_BACKEND_ID_CPU) + ggml_backend_cpu_set_n_threads(model.backend, n_threads); +#endif + ggml_backend_graph_compute(model.backend, gf); //if (n_past%100 == 0) { // ggml_graph_print (&gf); @@ -718,11 +776,11 @@ bool gpt2_eval( struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + //ggml_backend_tensor_get(inpL, embd_w.data(), 0, sizeof(float)*n_vocab*N); // return result just for the last token embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + ggml_backend_tensor_get(inpL, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab); return true; } @@ -759,7 +817,7 @@ int main(int argc, char ** argv) { { const int64_t t_start_us = ggml_time_us(); - if (!gpt2_model_load(params.model, model, vocab)) { + if (!gpt2_model_load(params.model, model, vocab, params.n_gpu_layers)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); return 1; } @@ -770,7 +828,7 @@ int main(int argc, char ** argv) { } // keep this buffer alive while evaluating the model - std::vector compute_buffer; + ggml_backend_buffer_t buf_compute; struct ggml_allocr * allocr = NULL; // allocate the compute buffer @@ -787,8 +845,8 @@ int main(int argc, char ** argv) { // recreate the allocator with the required memory ggml_allocr_free(allocr); - compute_buffer.resize(mem_size); - allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size); + allocr = ggml_allocr_new_from_buffer(buf_compute); fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); } @@ -888,5 +946,10 @@ int main(int argc, char ** argv) { ggml_free(model.ctx); + ggml_backend_buffer_free(model.buffer_w); + ggml_backend_buffer_free(model.buffer_kv); + ggml_backend_buffer_free(buf_compute); + ggml_backend_free(model.backend); + return 0; } diff --git a/include/ggml/ggml-alloc.h b/include/ggml/ggml-alloc.h index 0c224f174..c87139491 100644 --- a/include/ggml/ggml-alloc.h +++ b/include/ggml/ggml-alloc.h @@ -6,9 
+6,11 @@ extern "C" { #endif +struct ggml_backend_buffer; GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); +GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer); // tell the allocator to parse nodes following the order described in the list // you should call this if your graph are optimized to execute out-of-order @@ -20,6 +22,10 @@ GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor); GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc); +GGML_API size_t ggml_allocr_alloc_graph_n( + struct ggml_allocr * alloc, + struct ggml_cgraph ** graphs, int n_graphs, + struct ggml_tensor *** inputs, struct ggml_tensor *** outputs); #ifdef __cplusplus diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h new file mode 100644 index 000000000..17e5a38a7 --- /dev/null +++ b/include/ggml/ggml-backend.h @@ -0,0 +1,151 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + typedef struct ggml_backend_s * ggml_backend_t; + + // backend buffer + struct ggml_backend_buffer; + typedef struct ggml_backend_buffer * ggml_backend_buffer_t; + typedef void * ggml_buffer_context_t; + + struct ggml_backend_buffer_interface { + void (*free_buffer) (ggml_backend_buffer_t buffer); + size_t (*get_alignment) (ggml_backend_buffer_t buffer); + void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer + size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback + void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback + void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback + + }; + + struct ggml_backend_buffer { + struct ggml_backend_buffer_interface interface; + ggml_backend_t backend; + ggml_buffer_context_t context; + size_t size; + }; + + // backend buffer functions + GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(struct ggml_backend_buffer_interface interface, ggml_backend_t backend, ggml_buffer_context_t context, size_t size); + GGML_API void ggml_backend_buffer_free(ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + + // backend + typedef void * ggml_backend_context_t; + typedef void * ggml_graph_plan_t; + + struct ggml_backend_interface { + const char * (*get_name)(ggml_backend_t backend); + + void (*free)(ggml_backend_t backend); + + // buffer allocation + ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size); + + // tensor data access + // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize + void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * 
data, size_t offset, size_t size); + void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + void (*synchronize) (ggml_backend_t backend); + + // (optional) copy tensor between different backends, allow for single-copy tranfers + void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + + // compute graph with a plan + ggml_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + void (*graph_plan_free) (ggml_backend_t backend, ggml_graph_plan_t plan); + void (*graph_plan_compute)(ggml_backend_t backend, ggml_graph_plan_t plan); + // compute graph without a plan + void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); + + // check if the backend supports an operation + bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + }; + + struct ggml_backend_s { + struct ggml_backend_interface interface; + ggml_backend_context_t context; + }; + + // backend helper functions + static inline ggml_backend_t get_backend(const struct ggml_tensor * tensor) { return tensor->buffer->backend; } + + static inline const char * ggml_backend_name(ggml_backend_t backend) { return backend->interface.get_name(backend); } + static inline void ggml_backend_free(ggml_backend_t backend) { backend->interface.free(backend); } + static inline ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { return backend->interface.alloc_buffer(backend, size); } + static inline void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { get_backend(tensor)->interface.set_tensor_async(get_backend(tensor), tensor, data, offset, size); } + static inline void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { get_backend(tensor)->interface.get_tensor_async(get_backend(tensor), tensor, data, offset, size); } + static inline void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { get_backend(tensor)->interface.set_tensor_async(get_backend(tensor), tensor, data, offset, size); get_backend(tensor)->interface.synchronize(get_backend(tensor)); } + static inline void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { get_backend(tensor)->interface.get_tensor_async(get_backend(tensor), tensor, data, offset, size); get_backend(tensor)->interface.synchronize(get_backend(tensor)); } + static inline void ggml_backend_synchronize(ggml_backend_t backend) { backend->interface.synchronize(backend); } + static inline ggml_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->interface.graph_plan_create(backend, cgraph); } + static inline void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_free(backend, plan); } + static inline void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_compute(backend, plan); } + static inline void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { backend->interface.graph_compute(backend, cgraph); } + static inline bool ggml_backend_supports_op(ggml_backend_t backend, const 
struct ggml_tensor * op) { return backend->interface.supports_op(backend, op); } + + // tensor copy between different backends + GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); + + // CPU backend + GGML_API ggml_backend_t ggml_backend_cpu_init(void); + GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); + GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + + /////////////////////////// + +#if 0 + // graph splitting + #define GGML_MAX_SPLITS 200 + #define GGML_MAX_SPLIT_INPUTS 4 + + struct ggml_graph_split { + char name[GGML_MAX_NAME]; + struct ggml_context * ctx; + struct ggml_tensor * src_inputs[GGML_MAX_SPLIT_INPUTS + 1]; + struct ggml_tensor * dst_inputs[GGML_MAX_SPLIT_INPUTS + 1]; + struct ggml_cgraph * graph; + }; + + // TODO: this shouldn't be fixed size, allocate from ggml_context + struct ggml_graph_splits { + int n_splits; + struct ggml_graph_split splits[GGML_MAX_SPLITS]; + }; + + // TODO: allocate in ggml_context + GGML_API struct ggml_graph_splits ggml_graph_split_init(void); + + // this won't be needed once we can allocate graphs from a ggml_context + GGML_API void ggml_graph_splits_free(struct ggml_graph_splits * splits); + + // add a split to the graph - single and multiple inputs versions + GGML_API void ggml_graph_splits_add(struct ggml_graph_splits * splits, struct ggml_tensor ** input, struct ggml_context * ctx, const char * fmt, ...); + GGML_API void ggml_graph_splits_add_n(struct ggml_graph_splits * splits, struct ggml_tensor *** inputs, struct ggml_context * ctx, const char * fmt, ...); + + // build graphs for all splits + GGML_API void ggml_graph_splits_build_forward(struct ggml_graph_splits * splits, struct ggml_tensor * output); + + // compute + GGML_API void ggml_graph_splits_compute(struct ggml_graph_splits * splits); + + // graph tensor allocator + GGML_API void ggml_graph_allocate_tensors(struct ggml_cgraph * graph, struct ggml_context * ctx); + GGML_API void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits); + + // automatically split a graph into multiple graphs based on the location of the tensors + GGML_API struct ggml_graph_splits ggml_graph_split(struct ggml_cgraph * graph, struct ggml_context * ctx); +#endif + +#ifdef __cplusplus +} +#endif diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index a9d4e33d9..db7cad0dc 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -401,14 +401,10 @@ extern "C" { GGML_OP_CLAMP, GGML_OP_CONV_1D, GGML_OP_CONV_2D, - GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, - GGML_OP_CONV_1D_STAGE_0, // internal - GGML_OP_CONV_1D_STAGE_1, // internal - GGML_OP_UPSCALE, // nearest interpolate GGML_OP_FLASH_ATTN, @@ -481,6 +477,7 @@ extern "C" { struct ggml_tensor { enum ggml_type type; enum ggml_backend backend; + struct ggml_backend_buffer * buffer; int n_dims; int64_t ne[GGML_MAX_DIMS]; // number of elements @@ -514,7 +511,7 @@ extern "C" { void * extra; // extra things e.g. 
for ggml-cuda.cu - char padding[4]; + char padding[12]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -1390,14 +1387,6 @@ extern "C" { int s, int d); - GGML_API struct ggml_tensor * ggml_conv_transpose_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0); - GGML_API struct ggml_tensor * ggml_conv_2d( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c857659ff..95f91e331 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -212,6 +212,9 @@ if (GGML_CUBLAS) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() + if (CMAKE_BUILD_TYPE MATCHES Debug) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") + endif() else() message(WARNING "cuBLAS not found") endif() @@ -249,8 +252,10 @@ endif() add_library(${TARGET} ggml.c ggml-alloc.c + ggml-backend.c ../include/ggml/ggml.h ../include/ggml/ggml-alloc.h + ../include/ggml/ggml-backend.h ${GGML_CUDA_SOURCES} ${GGML_OPENCL_SOURCES} ${GGML_METAL_SOURCES} diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index 805759db7..afb4e10cf 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -1,4 +1,5 @@ #include "ggml-alloc.h" +#include "ggml-backend.h" #include "ggml.h" #include #include @@ -6,25 +7,6 @@ #include #include -#ifdef __has_include - #if __has_include() - #include - #if defined(_POSIX_MAPPED_FILES) - #include - #include - #endif - #endif -#endif - -#if defined(_WIN32) - #define WIN32_LEAN_AND_MEAN - #ifndef NOMINMAX - #define NOMINMAX - #endif - #include - #include -#endif - #define UNUSED(x) (void)(x) #define MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -80,8 +62,9 @@ struct free_block { #define MAX_FREE_BLOCKS 256 struct ggml_allocr { + ggml_backend_buffer_t buffer; + bool buffer_owned; void * data; - size_t size; size_t alignment; int n_free_blocks; struct free_block free_blocks[MAX_FREE_BLOCKS]; @@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens } #endif -static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { - return ggml_nbytes(tensor); - - UNUSED(alloc); -} - // check if a tensor is allocated by this buffer static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) { - void * ptr = tensor->data; - return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size; + return tensor->buffer == alloc->buffer; } static bool ggml_is_view(struct ggml_tensor * t) { @@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) { } void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { -#ifdef GGML_ALLOCATOR_DEBUG GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated -#endif - size_t size = ggml_allocr_get_alloc_size(alloc, tensor); + + size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); @@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) tensor->data = addr; AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data); + tensor->buffer = alloc->buffer; + ggml_backend_buffer_init_tensor(alloc->buffer, tensor); #ifdef GGML_ALLOCATOR_DEBUG 
add_allocated_tensor(alloc, tensor); @@ -208,24 +185,27 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) // this is a very naive implementation, but for our case the number of free blocks should be very small static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { - void * ptr = tensor->data; - if (ggml_allocr_is_own(alloc, tensor) == false) { // the tensor was not allocated in this buffer // this can happen because the graph allocator will try to free weights and other tensors from different buffers // the easiest way to deal with this is just to ignore it + AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, tensor->buffer, alloc->buffer); return; } - size_t size = ggml_allocr_get_alloc_size(alloc, tensor); + size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size); + ggml_backend_buffer_free_tensor(alloc->buffer, tensor); + #ifdef GGML_ALLOCATOR_DEBUG remove_allocated_tensor(alloc, tensor); #endif + void * ptr = tensor->data; + // see if we can merge with an existing block for (int i = 0; i < alloc->n_free_blocks; i++) { struct free_block * block = &alloc->free_blocks[i]; @@ -285,16 +265,27 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) { alloc->n_free_blocks = 1; size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; - alloc->free_blocks[0].size = alloc->size - align_offset; + alloc->free_blocks[0].size = alloc->buffer->size - align_offset; } struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) { + struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size); + + struct ggml_allocr * alloc = ggml_allocr_new_from_buffer(buffer); + alloc->alignment = alignment; + alloc->buffer_owned = true; + + return alloc; +} + +struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) { struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); *alloc = (struct ggml_allocr){ - /*.data = */ data, - /*.size = */ size, - /*.alignment = */ alignment, + /*.buffer = */ buffer, + /*.buffer_owned = */ false, + /*.base = */ ggml_backend_buffer_get_base(buffer), + /*.alignment = */ ggml_backend_buffer_get_alignment(buffer), /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, /*.hash_table = */ {{0}}, @@ -312,68 +303,19 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) return alloc; } -// OS specific functions to allocate and free uncommitted virtual memory -static void * alloc_vmem(size_t size) { -#if defined(_WIN32) - return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS); -#elif defined(_POSIX_MAPPED_FILES) - void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); - if (ptr == MAP_FAILED) { - return NULL; - } - return ptr; -#else - // use a fixed address for other platforms - uintptr_t base_addr = (uintptr_t)-size - 0x100; - return (void *)base_addr; -#endif -} - -static void free_vmem(void * base_addr, size_t size) { -#if 
defined(_WIN32) - VirtualFree(base_addr, 0, MEM_RELEASE); - UNUSED(size); -#elif defined(_POSIX_MAPPED_FILES) - munmap(base_addr, size); -#else - // nothing to do - UNUSED(base_addr); - UNUSED(size); -#endif -} - -// allocate uncommitted virtual memory to measure the size of the graph -static void alloc_measure_vmem(void ** base_addr, size_t * size) { - // 128GB for 64-bit, 1GB for 32-bit - *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37; - do { - *base_addr = alloc_vmem(*size); - if (*base_addr != NULL) { - AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr); - return; - } - // try again with half the size - *size /= 2; - } while (*size > 0); - - GGML_ASSERT(!"failed to allocate virtual memory for measure buffer"); -} - -static void free_measure_vmem(void * base_addr, size_t size) { - free_vmem(base_addr, size); -} - struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); - void * base_addr; - size_t size; - - alloc_measure_vmem(&base_addr, &size); + // TODO: these should be set by the backend: + // - get_alignment() + // - get_alloc_size() + // TODO: support other backends + struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr((void *)0x1000, (size_t)-0x1001); *alloc = (struct ggml_allocr){ - /*.data = */ base_addr, - /*.size = */ size, + /*.buffer = */ buffer, + /*.buffer_owned = */ true, + /*.base = */ ggml_backend_buffer_get_base(buffer), /*.alignment = */ alignment, /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, @@ -393,8 +335,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { } void ggml_allocr_free(struct ggml_allocr * alloc) { - if (alloc->measure) { - free_measure_vmem(alloc->data, alloc->size); + if (alloc->buffer_owned) { + ggml_backend_buffer_free(alloc->buffer); } free(alloc); } @@ -451,6 +393,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) if (ggml_is_view(node)) { assert(node->view_src->data != NULL); node->data = (char *)node->view_src->data + node->view_offs; + node->buffer = node->view_src->buffer; + ggml_backend_buffer_init_tensor(alloc->buffer, node); // TODO: change to init_view } else { // see if we can reuse a parent's buffer (inplace) if (ggml_op_can_inplace(node->op)) { @@ -479,12 +423,16 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); node->data = parent->data; + node->buffer = parent->buffer; + ggml_backend_buffer_init_tensor(alloc->buffer, node); // TODO: change to init_view return; } } else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); node->data = parent->data; + node->buffer = parent->buffer; + ggml_backend_buffer_init_tensor(alloc->buffer, node); // TODO: change to init_view return; } } @@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) } } -static size_t ggml_allocr_alloc_graph_tensors_n( +size_t ggml_allocr_alloc_graph_n( struct ggml_allocr * alloc, struct ggml_cgraph ** graphs, int n_graphs, struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { @@ -631,7 +579,11 @@ static size_t ggml_allocr_alloc_graph_tensors_n( } size_t ggml_allocr_alloc_graph(struct 
ggml_allocr * alloc, struct ggml_cgraph * graph) { - return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); + return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL); +} + +size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { + return alloc->max_size; } size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { diff --git a/src/ggml-backend.c b/src/ggml-backend.c new file mode 100644 index 000000000..d49c5e7a1 --- /dev/null +++ b/src/ggml-backend.c @@ -0,0 +1,506 @@ +#include "ggml-backend.h" +#include "ggml-alloc.h" +#include +#include +#include +#include +#include + +#define UNUSED(x) (void)(x) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + + +// backend buffer + +struct ggml_backend_buffer * ggml_backend_buffer_init(struct ggml_backend_buffer_interface interface, ggml_backend_t backend, ggml_buffer_context_t context, size_t size) { + struct ggml_backend_buffer * buffer = malloc(sizeof(struct ggml_backend_buffer)); + + GGML_ASSERT(interface.get_base != NULL); + + (*buffer) = (struct ggml_backend_buffer) { + /* .interface = */ interface, + /* .backend = */ backend, + /* .context = */ context, + /* .size = */ size, + }; + + return buffer; +} + +void ggml_backend_buffer_free(struct ggml_backend_buffer * buffer) { + if (buffer->interface.free_buffer != NULL) { + buffer->interface.free_buffer(buffer); + } +} + +size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { + if (buffer->interface.get_alignment) { + return buffer->interface.get_alignment(buffer); + } + return 64; +} + +void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { + return buffer->interface.get_base(buffer); +} + +size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + if (buffer->interface.get_alloc_size) { + return buffer->interface.get_alloc_size(buffer, tensor); + } + return ggml_nbytes(tensor); +} + +void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + if (buffer->interface.init_tensor) { + buffer->interface.init_tensor(buffer, tensor); + } +} + +void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + if (buffer->interface.free_tensor) { + buffer->interface.free_tensor(buffer, tensor); + } +} + +// backend copy + +static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) { + //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]); + //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]); + GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); + + // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src)); + + if (src == dst) { + return; + } + + // TODO: allow backends to support copy to/from same backend + + if (get_backend(dst)->interface.cpy_tensor_from != NULL) { + 
get_backend(dst)->interface.cpy_tensor_from(get_backend(dst)->context, src, dst); + } else if (get_backend(src)->interface.cpy_tensor_to != NULL) { + get_backend(src)->interface.cpy_tensor_to(get_backend(src)->context, src, dst); + } else { + // shouldn't be hit when copying from/to CPU + #ifndef NDEBUG + fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend)); + #endif + size_t nbytes = ggml_nbytes(src); + void * data = malloc(nbytes); + ggml_backend_tensor_get(src, data, 0, nbytes); + ggml_backend_tensor_set(dst, data, 0, nbytes); + free(data); + } +} + +// backend CPU + +struct ggml_backend_cpu_context { + int n_threads; + void * work_data; + size_t work_size; +}; + +static const char * ggml_backend_cpu_name(ggml_backend_t backend) { + return "CPU"; + + UNUSED(backend); +} + +static void ggml_backend_cpu_free(ggml_backend_t backend) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + free(cpu_ctx->work_data); + free(cpu_ctx); + free(backend); +} + +static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 + +static size_t ggml_backend_cpu_buffer_get_alignment(ggml_backend_buffer_t buffer) { + return TENSOR_ALIGNMENT; + UNUSED(buffer); +} + +static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *)buffer->context; +} + +static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); + UNUSED(buffer); +} + +static struct ggml_backend_buffer_interface cpu_backend_buffer_interface = { + /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_get_alignment, + /* .get_base = */ ggml_backend_cpu_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, + /* .free_tensor = */ NULL, +}; + +// for buffers from ptr, free is not called +static struct ggml_backend_buffer_interface cpu_backend_buffer_interface_from_ptr = { + /* .free_buffer = */ NULL, + /* .get_alignment = */ ggml_backend_cpu_buffer_get_alignment, + /* .get_base = */ ggml_backend_cpu_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, + /* .free_tensor = */ NULL, +}; + + +static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { + void * data = malloc(size + TENSOR_ALIGNMENT); + return ggml_backend_buffer_init(cpu_backend_buffer_interface, backend, data, size); +} + +static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + UNUSED(backend); +} + +static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + UNUSED(backend); +} + +static void ggml_backend_cpu_synchronize(ggml_backend_t backend) { + UNUSED(backend); +} + +static void 
ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends + ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +struct ggml_backend_cpu_plan { + struct ggml_cplan cplan; + struct ggml_cgraph cgraph; +}; + +static ggml_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_backend_cpu_plan * cpu_plan = malloc(sizeof(struct ggml_backend_cpu_plan)); + + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + cpu_plan->cgraph = *cgraph; + + if (cpu_plan->cplan.work_size > 0) { + cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + } + + return cpu_plan; +} + +static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_graph_plan_t plan) { + struct ggml_backend_cpu_plan * cpu_plan = (struct ggml_backend_cpu_plan *)plan; + + free(cpu_plan->cplan.work_data); + free(cpu_plan); + + UNUSED(backend); +} + +static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_graph_plan_t plan) { + struct ggml_backend_cpu_plan * cpu_plan = (struct ggml_backend_cpu_plan *)plan; + + ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); + + UNUSED(backend); +} + +static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + + if (cpu_ctx->work_size < cplan.work_size) { + // TODO: may be faster to free and use malloc to avoid the copy + cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size); + cpu_ctx->work_size = cplan.work_size; + } + + cplan.work_data = cpu_ctx->work_data; + + ggml_graph_compute(cgraph, &cplan); +} + +static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return true; + UNUSED(backend); + UNUSED(op); +} + +static struct ggml_backend_interface cpu_backend_interface = { + /* .get_name = */ ggml_backend_cpu_name, + /* .free = */ ggml_backend_cpu_free, + /* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer, + /* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async, + /* .synchronize = */ ggml_backend_cpu_synchronize, + /* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from, + /* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to, + /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, + /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, + /* .graph_compute = */ ggml_backend_cpu_graph_compute, + /* .supports_op = */ ggml_backend_cpu_supports_op, +}; + +ggml_backend_t ggml_backend_cpu_init(void) { + struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->work_data = NULL; + ctx->work_size = 0; + + ggml_backend_t cpu_backend = malloc(sizeof(struct 
ggml_backend_s)); + + *cpu_backend = (struct ggml_backend_s) { + /* .interface = */ cpu_backend_interface, + /* .context = */ ctx + }; + return cpu_backend; +} + +void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->n_threads = n_threads; +} + +struct ggml_backend_buffer * ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { + // TODO: NULL backend? + // TODO: no free + return ggml_backend_buffer_init(cpu_backend_buffer_interface_from_ptr, NULL, ptr, size); +} + +#if 0 +// splits + +struct ggml_graph_splits ggml_graph_split_init(void) { + struct ggml_graph_splits splits = {0}; + return splits; +} + +// TODO: this can be removed after allocating the graphs in a ggml_context +void ggml_graph_splits_free(struct ggml_graph_splits * splits) { + for (int i = 0; i < splits->n_splits; i++) { + if (splits->splits[i].graph) { + free(splits->splits[i].graph); + } + } +} + +static void ggml_graph_splits_add_n_va(struct ggml_graph_splits * splits, struct ggml_tensor *** inputs, struct ggml_context * ctx, const char * fmt, va_list args) { + GGML_ASSERT(splits->n_splits < GGML_MAX_SPLITS); + + struct ggml_graph_split * split = &splits->splits[splits->n_splits]; + + + if (splits->n_splits == 0) { + // always add the first split + int i = 0; + while (inputs[i] != NULL) { + GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS); + split->src_inputs[i] = *inputs[i]; + split->dst_inputs[i] = *inputs[i]; + i++; + } + split->src_inputs[i] = NULL; + split->dst_inputs[i] = NULL; + split->ctx = ctx; + } + // check if the split is on the same context as the previous one + else if (splits->n_splits > 0 && splits->splits[splits->n_splits - 1].ctx == ctx) { + // add to the previous split + char name[GGML_MAX_NAME - 2]; + int n = vsnprintf(name, sizeof(name), fmt, args); + char new_name[GGML_MAX_NAME]; + snprintf(new_name, sizeof(new_name), "%.*s,%s", GGML_MAX_NAME - n - 2, splits->splits[splits->n_splits - 1].name, name); + strcpy(splits->splits[splits->n_splits - 1].name, new_name); + return; + } else { + // add a new split + int i = 0; + while (inputs[i] != NULL) { + GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS); + split->src_inputs[i] = *inputs[i]; + split->dst_inputs[i] = ggml_dup_tensor(ctx, *inputs[i]); + ggml_format_name(split->dst_inputs[i], "%s (split output)", split->src_inputs[i]->name); + // TODO: maybe support different layouts in ggml_backend_cpy_tensor instead + for (int j = 0; j < GGML_MAX_DIMS; j++) { + split->dst_inputs[i]->nb[j] = split->src_inputs[i]->nb[j]; + } + ggml_set_name(split->dst_inputs[i], ggml_get_name(*inputs[i])); + *inputs[i] = split->dst_inputs[i]; + i++; + } + split->src_inputs[i] = NULL; + split->dst_inputs[i] = NULL; + split->ctx = ctx; + } + + vsnprintf(split->name, GGML_MAX_NAME, fmt, args); + split->graph = NULL; + splits->n_splits++; +} + +void ggml_graph_splits_add_n(struct ggml_graph_splits * splits, struct ggml_tensor *** input, struct ggml_context * ctx, const char * fmt, ...) { + va_list args; + va_start(args, fmt); + ggml_graph_splits_add_n_va(splits, input, ctx, fmt, args); + va_end(args); +} + +void ggml_graph_splits_add(struct ggml_graph_splits * splits, struct ggml_tensor ** input, struct ggml_context * ctx, const char * fmt, ...) 
{ + va_list args; + va_start(args, fmt); + ggml_graph_splits_add_n_va(splits, (struct ggml_tensor**[2]){ input, NULL }, ctx, fmt, args); + va_end(args); +} + +void ggml_graph_splits_build_forward(struct ggml_graph_splits * splits, struct ggml_tensor * output) { + struct ggml_tensor *last_outputs[2] = { output, NULL }; + struct ggml_tensor ** outputs; + + for (int i = 0; i < splits->n_splits; i++) { + struct ggml_graph_split * split = &splits->splits[i]; + + if (i < splits->n_splits - 1) { + outputs = splits->splits[i + 1].src_inputs; + } else { + outputs = last_outputs; + } + + // build the graph + // TODO: allocate graphs in context + split->graph = (struct ggml_cgraph *) malloc(sizeof(struct ggml_cgraph)); + memset(split->graph, 0, sizeof(struct ggml_cgraph)); + for (int j = 0; outputs[j] != NULL; j++) { + ggml_build_forward_expand(split->graph, outputs[j]); + } + + for (int j = 1; j < split->graph->n_nodes; j++) { + if (split->graph->nodes[j]->backend != split->graph->nodes[0]->backend) { + fprintf(stderr, "split %s: node %s has different backend (%s) than the first node (%s)\n", + split->name, split->graph->nodes[j]->name, + ggml_backend_name(split->graph->nodes[j]->backend_s), + ggml_backend_name(split->graph->nodes[0]->backend_s)); + } + } + for (int j = 1; j < split->graph->n_leafs; j++) { + if (split->graph->leafs[j]->backend != split->graph->leafs[0]->backend) { + fprintf(stderr, "split %s: leaf %s has different backend (%s) than the first leaf (%s)\n", + split->name, split->graph->leafs[j]->name, + ggml_backend_name(split->graph->leafs[j]->backend_s), + ggml_backend_name(split->graph->leafs[0]->backend_s)); + } + } + } +} + +void ggml_graph_splits_compute(struct ggml_graph_splits * splits) { + uint64_t copy_us = 0; + uint64_t compute_cpu_us = 0; + uint64_t compute_gpu_us = 0; + int n_nodes = 0; + for (int i = 0; i < splits->n_splits; i++) { + struct ggml_graph_split * split = &splits->splits[i]; + + //printf("computing split %i (%s) on backend %s (%i nodes)\n", i, split->name, ggml_backend_name(split->dst_inputs[0]->backend), split->graph->n_nodes); + + // copy the input tensor to the backend + uint64_t copy_start_us = ggml_time_us(); + for (int j = 0; split->src_inputs[j] != NULL; j++) { + //printf("\tcopying tensor %d (%s) (%s -> %s) (%lu bytes)\n", j, split->src_inputs[j]->name, ggml_backend_name(split->src_inputs[j]->backend), ggml_backend_name(split->dst_inputs[j]->backend), ggml_nbytes(split->src_inputs[j])); + //printf("%p %p\n", split->src_inputs[j], split->dst_inputs[j]); + ggml_backend_tensor_copy(split->src_inputs[j], split->dst_inputs[j]); + } + // ggml_backend_synchronize(split->dst_inputs[0]->backend); + copy_us += ggml_time_us() - copy_start_us; + +#if 0 + char split_filename[GGML_MAX_NAME]; + snprintf(split_filename, GGML_MAX_NAME, "split_%i.dot", i); + ggml_graph_dump_dot(split->graph, NULL, split_filename); +#endif + uint64_t start = ggml_time_us(); + ggml_backend_graph_compute(split->dst_inputs[0]->backend_s, split->graph); + //ggml_backend_synchronize(split->dst_inputs[0]->backend); + uint64_t end = ggml_time_us(); + if (strcmp(ggml_backend_name(split->dst_inputs[0]->backend_s), "CPU") == 0) { + compute_cpu_us += end - start; + } else { + compute_gpu_us += end - start; + } + + n_nodes += split->graph->n_nodes; + } + + //printf("ggml_graph_splits_compute: n_splits: %d, nodes: %d, copy: %.2fms, compute_cpu: %.2fms, compute_gpu: %.2fms\n", splits->n_splits, n_nodes, copy_us / 1000.0, compute_cpu_us / 1000.0, compute_gpu_us / 1000.0); + //exit(0); +} + +void 
ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits) { + // splits of the same backend are allocated together to ensure that dependencies from one split to the next + // are not overwritten when there is another split from a different backend between them (e.g. inpSA in llama.cpp) + bool visited[GGML_MAX_SPLITS] = {false}; + for (int i = 0; i < splits->n_splits; i++) { + if (!visited[i]) { + struct ggml_graph_split * split = &splits->splits[i]; + struct ggml_context * ctx = split->ctx; + struct ggml_cgraph * backend_graphs[GGML_MAX_SPLITS]; + struct ggml_tensor ** graph_inputs[GGML_MAX_SPLITS]; + struct ggml_tensor ** graph_outputs[GGML_MAX_SPLITS]; + int n_graphs = 0; + + for (int j = i; j < splits->n_splits; j++) { + if (splits->splits[j].ctx == ctx) { + graph_inputs[n_graphs] = splits->splits[j].dst_inputs; + graph_outputs[n_graphs] = j < splits->n_splits - 1 ? splits->splits[j + 1].src_inputs : NULL; + backend_graphs[n_graphs] = splits->splits[j].graph; + visited[j] = true; + n_graphs++; + } + } + + struct ggml_allocr * alloc = NULL; + ggml_allocr_alloc_graph_n(alloc, backend_graphs, n_graphs, graph_inputs, graph_outputs); + } + } +} +#endif diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 989c419cd..74b443e6c 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7,6 +7,8 @@ #include #include +#define GGML_CUDA_FORCE_DMMV // FIXME: ggml_cuda_op_mul_mat_vec_q produces wrong results with GPT-2 + #if defined(GGML_USE_HIPBLAS) #include #include @@ -419,6 +421,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 #define CUDA_QUANTIZE_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 +#define CUDA_GET_ROWS_BLOCK_SIZE 256 // dmmv = dequantize_mul_mat_vec #ifndef GGML_CUDA_DMMV_X @@ -1574,6 +1577,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest reinterpret_cast(y[ib].ds.y) = sum; } +template +static __global__ void k_get_rows(const void * x, const int * y, dst_t * dst, const int ncols) { + const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2; + const int row = blockDim.y*blockIdx.y + threadIdx.y; + + if (col >= ncols) { + return; + } + + const int r = y[row]; + + // copy x[r*ncols + col] to dst[row*ncols + col] + const int xi = r*ncols + col; + const int di = row*ncols + col; + + const int ib = xi/qk; // block index + const int iqs = (xi%qk)/qr; // quant index + const int iybs = di - di%qk; // y block start index + const int y_offset = qr == 1 ? 
1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(x, ib, iqs, v); + + dst[iybs + iqs + 0] = v.x; + dst[iybs + iqs + y_offset] = v.y; +} + template static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; @@ -4555,6 +4586,15 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale dst[i] = scale * x[i]; } + +template +static void get_rows_cuda(const void * x, const int * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { + const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); + const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); + const dim3 block_nums(block_num_x, nrows, 1); + k_get_rows<<>>(x, y, dst, ncols); +} + static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; add_f32<<>>(x, y, dst, kx, ky); @@ -5703,7 +5743,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); kind = cudaMemcpyDeviceToDevice; - struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; CUDA_CHECK(cudaGetDevice(&id)); src_ptr = (char *) extra->data_device[id]; @@ -5739,6 +5779,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( } } +static void ggml_cuda_op_repeat( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) { + // guaranteed to be an integer due to the check in ggml_can_repeat + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // TODO: very inefficient, implement in a kernel + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + CUDA_CHECK(cudaMemcpyAsync( + (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0, + (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01, + ne00*nb0, cudaMemcpyDeviceToDevice, stream)); + } + } + } + } + } + } + } + + (void) src1; + (void) src1_d; +} + +static void ggml_cuda_op_get_rows( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_d, const float * src1_d, float * dst_d, const 
cudaStream_t & stream) { + + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + const int ncols = src0->ne[0]; + const int nrows = ggml_nelements(src1); + + const int * src1_i32 = (const int *) src1_d; + + switch (src0->type) { + case GGML_TYPE_F16: + get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_F32: + get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q4_0: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q4_1: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q5_0: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q5_1: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q8_0: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + default: + // TODO: k-quants + GGML_ASSERT(false); + break; + } +} + inline void ggml_cuda_op_add( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { @@ -6343,7 +6484,14 @@ inline void ggml_cuda_op_scale( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - const float scale = ((float *) src1->data)[0]; + float scale; + // HACK: support for ggml backend interface + if (src1->backend == GGML_BACKEND_CPU) { + scale = ((float *) src1->data)[0]; + } else { + // TODO: pass pointer to kernel instead of copying to host + CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost)); + } scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); @@ -6362,9 +6510,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = use_src1 ? 
(ggml_tensor_extra_gpu *) src1->extra : nullptr; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; @@ -6505,9 +6653,9 @@ static void ggml_cuda_op_mul_mat( const size_t q8_1_ts = sizeof(block_q8_1); const size_t q8_1_bs = QK8_1; - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); @@ -6758,6 +6906,14 @@ static void ggml_cuda_op_mul_mat( } } +static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat); +} + +static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows); +} + static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add); } @@ -6812,13 +6968,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); @@ -6843,13 +6999,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; const int64_t 
row_stride_x = nb01 / sizeof(half); @@ -6870,11 +7026,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } - if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_p021(src0, src1, dst); } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - }else if (src0->type == GGML_TYPE_F32) { + } else if (src0->type == GGML_TYPE_F32) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { @@ -6935,8 +7091,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; @@ -6992,7 +7148,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { const size_t nb1 = tensor->nb[1]; ggml_backend backend = tensor->backend; - struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; + ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); for (int64_t id = 0; id < g_device_count; ++id) { @@ -7046,7 +7202,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); } - CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); extra->data_device[id] = buf; @@ -7085,17 +7240,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) { delete extra; } -static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; +static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; static size_t g_temp_tensor_extra_index = 0; -static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { +static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { if (g_temp_tensor_extras == nullptr) { g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; } size_t alloc_index = g_temp_tensor_extra_index; g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES; - struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; + ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; memset(extra, 0, sizeof(*extra)); return extra; @@ -7123,7 +7278,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra return; } - struct ggml_tensor_extra_gpu * extra; + ggml_tensor_extra_gpu * extra; const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || tensor->op == GGML_OP_VIEW || @@ -7132,7 +7287,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra CUDA_CHECK(ggml_cuda_set_device(g_main_device)); if 
(inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t offset = 0; if (tensor->op == GGML_OP_VIEW) { @@ -7141,7 +7296,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src0_ddc + offset; } else if (tensor->op == GGML_OP_CPY) { - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; void * src1_ddv = src1_extra->data_device[g_main_device]; extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src1_ddv; @@ -7183,13 +7338,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); } - struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); + ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || tensor->op == GGML_OP_VIEW; if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t view_offset = 0; if (tensor->op == GGML_OP_VIEW) { @@ -7207,7 +7362,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(ggml_is_contiguous(tensor)); - struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice)); } @@ -7264,58 +7419,47 @@ void ggml_cuda_free_scratch() { g_scratch_buffer = nullptr; } -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ +bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { ggml_cuda_func_t func; const bool any_on_device = tensor->backend == GGML_BACKEND_GPU || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) { + return false; + } + switch (tensor->op) { + case GGML_OP_REPEAT: + func = ggml_cuda_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_cuda_get_rows; + break; case GGML_OP_DUP: - if (!any_on_device) { - return false; - } func = ggml_cuda_dup; break; case GGML_OP_ADD: - if (!any_on_device) { - return false; - } func = ggml_cuda_add; break; case GGML_OP_MUL: - if (!any_on_device) { - return false; - } func = ggml_cuda_mul; break; case GGML_OP_UNARY: switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_GELU: - if (!any_on_device) { - return 
false; - } func = ggml_cuda_gelu; break; case GGML_UNARY_OP_SILU: - if (!any_on_device) { - return false; - } func = ggml_cuda_silu; break; default: return false; } break; case GGML_OP_NORM: - if (!any_on_device) { - return false; - } func = ggml_cuda_norm; break; case GGML_OP_RMS_NORM: - if (!any_on_device) { - return false; - } func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: @@ -7325,54 +7469,30 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_mul_mat; break; case GGML_OP_SCALE: - if (!any_on_device) { - return false; - } func = ggml_cuda_scale; break; case GGML_OP_CPY: - if (!any_on_device) { - return false; - } func = ggml_cuda_cpy; break; case GGML_OP_CONT: - if (!any_on_device) { - return false; - } func = ggml_cuda_dup; break; case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: - if (!any_on_device) { - return false; - } func = ggml_cuda_nop; break; case GGML_OP_DIAG_MASK_INF: - if (!any_on_device) { - return false; - } func = ggml_cuda_diag_mask_inf; break; case GGML_OP_SOFT_MAX: - if (!any_on_device) { - return false; - } func = ggml_cuda_soft_max; break; case GGML_OP_ROPE: - if (!any_on_device) { - return false; - } func = ggml_cuda_rope; break; case GGML_OP_ALIBI: - if (!any_on_device) { - return false; - } func = ggml_cuda_alibi; break; default: @@ -7400,3 +7520,240 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); snprintf(description, description_size, "%s", prop.name); } + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +#define UNUSED(x) (void)(x) + +struct ggml_backend_cuda_context { +}; + +static const char * ggml_backend_cuda_name(ggml_backend_t backend) { + return GGML_CUDA_NAME; + + UNUSED(backend); +} + +static void ggml_backend_cuda_free(ggml_backend_t backend) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + delete cuda_ctx; + delete backend; +} + +struct ggml_cuda_buffer_context { + void * device; + + ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; + size_t temp_tensor_extra_index = 0; + + ~ggml_cuda_buffer_context() { + delete[] temp_tensor_extras; + } + + ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (temp_tensor_extras == nullptr) { + temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; + } + + size_t alloc_index = temp_tensor_extra_index; + temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES; + ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; + } +}; + +static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; + CUDA_CHECK(cudaFree(ctx->device)); + delete ctx; +} + +static size_t ggml_backend_cuda_buffer_get_alignment(ggml_backend_buffer_t buffer) { + return 128; + UNUSED(buffer); +} + +static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; + return ctx->device; +} + +static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + int64_t row_low = 0; + int64_t row_high = ggml_nrows(tensor); + int64_t nrows_split = row_high - row_low; + + return ggml_nbytes_split(tensor, nrows_split); + + UNUSED(buffer); 
+} + +static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; + ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra(); + + extra->data_device[g_main_device] = tensor->data; + + tensor->backend = GGML_BACKEND_GPU; + tensor->extra = extra; + + // initialize padding to 0 to avoid possible NaN values + size_t original_size = ggml_nbytes(tensor); + size_t size = ggml_backend_cuda_buffer_get_alloc_size(buffer, tensor); + + if (size > original_size && tensor->view_src == nullptr) { + CUDA_CHECK(cudaMemset((char *) tensor->data + original_size, 0, size - original_size)); + } + + UNUSED(buffer); +} + +static struct ggml_backend_buffer_interface cuda_backend_buffer_interface = { + /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, + /* .get_alignment = */ ggml_backend_cuda_buffer_get_alignment, + /* .get_base = */ ggml_backend_cuda_buffer_get_base, + /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size, + /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, + /* .free_tensor = */ NULL, +}; + +static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) { + ggml_cuda_buffer_context * ctx = new ggml_cuda_buffer_context; + CUDA_CHECK(cudaMalloc(&ctx->device, size)); + return ggml_backend_buffer_init(cuda_backend_buffer_interface, backend, ctx, size); +} + +static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + //GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0])); + + UNUSED(backend); +} + +static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); + + UNUSED(backend); +} + +static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { + CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + + UNUSED(backend); +} + +static ggml_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { + GGML_ASSERT(!"not implemented"); + + return nullptr; + + UNUSED(backend); + UNUSED(cgraph); +} + +static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_graph_plan_t plan) { + GGML_ASSERT(!"not implemented"); + + UNUSED(backend); + UNUSED(plan); +} + +static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_graph_plan_t plan) { + GGML_ASSERT(!"not implemented"); + + UNUSED(backend); + UNUSED(plan); +} + +#include +static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_compute_params params = {}; + params.type = GGML_TASK_COMPUTE; + params.ith = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + // views of allocated tensors don't call init_tensor, handle them 
here + // TODO: handle in ggml-alloc + if (node->extra == nullptr) { + GGML_ASSERT(node->view_src != nullptr); + GGML_ASSERT(node->view_src->backend == GGML_BACKEND_GPU); + ggml_backend_cuda_buffer_init_tensor(node->buffer, node); + } + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j] != nullptr && node->src[j]->extra == nullptr) { + GGML_ASSERT(node->src[j]->view_src != nullptr); + GGML_ASSERT(node->src[j]->view_src->backend == GGML_BACKEND_GPU); + ggml_backend_cuda_buffer_init_tensor(node->src[j]->buffer, node->src[j]); + } + } + + bool ok = ggml_cuda_compute_forward(¶ms, node); + if (!ok) { + fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + } + GGML_ASSERT(ok); + +#if 0 + if (node->type == GGML_TYPE_F32) { + cudaDeviceSynchronize(); + std::vector tmp(ggml_nelements(node), 0.0f); + cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost); + printf("\n%s (%s) (%s %s): ", node->name, ggml_op_name(node->op), + ggml_type_name(node->src[0]->type), + node->src[1] ? ggml_type_name(node->src[1]->type) : "none"); + double sum = 0.0; + double sq_sum = 0.0; + for (int i = 0; i < ggml_nelements(node); i++) { + //printf("%f ", tmp[i]); + sum += tmp[i]; + sq_sum += tmp[i]*tmp[i]; + } + //printf("\n"); + printf("sum: %f, ", sum); + printf("sq_sum: %f\n", sq_sum); + } +#endif + } + + UNUSED(backend); +} + +static ggml_backend_interface cuda_backend_interface = { + /* .get_name = */ ggml_backend_cuda_name, + /* .free = */ ggml_backend_cuda_free, + /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer, + /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, + /* .synchronize = */ ggml_backend_cuda_synchronize, + /* .cpy_tensor_from = */ nullptr, + /* .cpy_tensor_to = */ nullptr, + /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, + /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, + /* .graph_compute = */ ggml_backend_cuda_graph_compute, + /* .supports_op = */ nullptr, +}; + +ggml_backend_t ggml_backend_cuda_init() { + ggml_init_cublas(); // TODO: remove from ggml.c + + ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context; + + ggml_backend_t cuda_backend = new ggml_backend_s; + *cuda_backend = (ggml_backend_s){ + /* .interface = */ cuda_backend_interface, + /* .context = */ ctx + }; + return cuda_backend; +} diff --git a/src/ggml-cuda.h b/src/ggml-cuda.h index fda704b66..81ee9a2e9 100644 --- a/src/ggml-cuda.h +++ b/src/ggml-cuda.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" #ifdef GGML_USE_HIPBLAS #define GGML_CUDA_NAME "ROCm" @@ -42,6 +43,10 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s GGML_API int ggml_cuda_get_device_count(void); GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size); +// backend API +GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use + + #ifdef __cplusplus } #endif diff --git a/src/ggml.c b/src/ggml.c index b72069087..b1d11ba5c 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4081,16 +4081,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ALIBI", "CLAMP", "CONV_1D", - "CONV_TRANSPOSE_1D", "CONV_2D", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", - "CONV_1D_STAGE_0", - "CONV_1D_STAGE_1", - "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", @@ -4116,7 
+4112,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -4167,16 +4163,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "alibi(x)", "clamp(x)", "conv_1d(x)", - "conv_transpose_1d(x)", "conv_2d(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", - "conv_1d_stage_0(x)", - "conv_1d_stage_1(x)", - "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", @@ -4202,7 +4194,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4231,10 +4223,7 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_DIAG_MASK_INF ] = true; p[GGML_OP_DIAG_MASK_ZERO ] = true; p[GGML_OP_CONV_1D ] = true; - p[GGML_OP_CONV_1D_STAGE_0 ] = true; - p[GGML_OP_CONV_1D_STAGE_1 ] = true; p[GGML_OP_CONV_2D ] = true; - p[GGML_OP_CONV_TRANSPOSE_1D ] = true; p[GGML_OP_CONV_TRANSPOSE_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -4951,6 +4940,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( *result = (struct ggml_tensor) { /*.type =*/ type, /*.backend =*/ GGML_BACKEND_CPU, + /*.buffer =*/ NULL, /*.n_dims =*/ n_dims, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, @@ -4983,6 +4973,11 @@ static struct ggml_tensor * ggml_new_tensor_impl( result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; } + if (view_src != NULL) { + result->backend = view_src->backend; + result->buffer = view_src->buffer; + } + ctx->n_objects++; return result; @@ -5797,7 +5792,7 @@ static struct ggml_tensor * ggml_mul_impl( bool inplace) { // TODO: support less-strict constraint // GGML_ASSERT(ggml_can_repeat(b, a)); - GGML_ASSERT(ggml_can_repeat_rows(b, a)); + //GGML_ASSERT(ggml_can_repeat_rows(b, a)); bool is_node = false; @@ -7514,17 +7509,14 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -// im2col: [N, IC, IL] => [N, OL, IC*K] -// a: [OC,IC, K] -// b: [N, IC, IL] -// result: [N, OL, IC*K] -static struct ggml_tensor * ggml_conv_1d_stage_0( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0) { +GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + GGML_ASSERT(ggml_is_matrix(b)); GGML_ASSERT(a->ne[1] == b->ne[1]); bool is_node = false; @@ -7533,54 +7525,16 @@ static struct ggml_tensor * ggml_conv_1d_stage_0( is_node = true; } - const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); - const int64_t ne[4] = { - a->ne[1] * a->ne[0], - OL, - b->ne[2], - 1, + ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), + a->ne[2], 1, 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { s0, p0, d0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CONV_1D_STAGE_0; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; -} - -// ggml_conv_1d_stage_1 - -// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] -// a: [OC, IC, K] -// b: [N, OL, IC * K] -// result: [N, OC, OL] -static struct ggml_tensor * ggml_conv_1d_stage_1( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { - b->ne[1], - a->ne[2], - b->ne[2], - 1, - }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - - result->op = GGML_OP_CONV_1D_STAGE_1; + result->op = GGML_OP_CONV_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7588,53 +7542,6 @@ static struct ggml_tensor * ggml_conv_1d_stage_1( return result; } -// ggml_conv_1d - -GGML_API struct ggml_tensor * ggml_conv_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0) { - struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0); - result = ggml_conv_1d_stage_1(ctx, a, result); - return result; -} - -// GGML_API struct ggml_tensor * ggml_conv_1d( -// struct ggml_context * ctx, -// struct ggml_tensor * a, -// struct ggml_tensor * b, -// int s0, -// int p0, -// int d0) { -// GGML_ASSERT(ggml_is_matrix(b)); -// GGML_ASSERT(a->ne[1] == b->ne[1]); -// bool is_node = false; - -// if (a->grad || b->grad) { -// GGML_ASSERT(false); // TODO: implement backward -// is_node = true; -// } - -// const int64_t ne[4] = { -// ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), -// a->ne[2], 1, 1, -// }; -// struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - -// int32_t params[] = { s0, p0, d0 }; -// ggml_set_op_params(result, params, sizeof(params)); - -// result->op = GGML_OP_CONV_1D; -// result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; -// result->src[0] = a; -// result->src[1] = b; - -// return result; -// } - // ggml_conv_1d_ph struct ggml_tensor* ggml_conv_1d_ph( @@ -7646,50 +7553,6 @@ struct ggml_tensor* ggml_conv_1d_ph( return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); } -// ggml_conv_transpose_1d - -static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { - return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; -} - -GGML_API struct ggml_tensor * ggml_conv_transpose_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[2] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); - - GGML_ASSERT(p0 == 0); - GGML_ASSERT(d0 == 1); - - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { - ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), - a->ne[1], b->ne[2], 1, - }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - - int32_t params[] = { s0, p0, d0 }; - ggml_set_op_params(result, params, sizeof(params)); - - result->op = GGML_OP_CONV_TRANSPOSE_1D; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; -} - // ggml_conv_2d struct ggml_tensor * ggml_conv_2d( @@ -13829,7 +13692,7 @@ static void ggml_compute_forward_rope_back( // ggml_compute_forward_conv_1d -static void ggml_compute_forward_conv_1d_f16_f32( +static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -13847,33 +13710,42 @@ static void ggml_compute_forward_conv_1d_f16_f32( const int nth = params->nth; const int nk = ne00; + const int nh = nk/2; - // size of the convolution row - the kernel size unrolled across all input channels - const int ew0 = nk*ne01; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + const int ew0 = ggml_up32(ne01); + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + // prepare kernel data (src0) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - ggml_fp16_t * dst_data = wdata; + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ew0 + i01] = src[i00]; + } + } + } + } - for (int64_t i0 = 0; i0 < ne0; i0++) { - for (int64_t ik = 0; ik < nk; ik++) { - const int idx0 = i0*s0 + ik*d0 - p0; + // prepare source data (src1) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; - if(!(idx0 < 0 || idx0 >= ne10)) { - dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]); - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + ggml_fp16_t * dst_data = wdata; + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); } } } @@ -13886,7 +13758,7 @@ static void ggml_compute_forward_conv_1d_f16_f32( } // total rows in dst - const int nr = ne2; + const int nr = ne02; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -13895,22 +13767,23 @@ static void ggml_compute_forward_conv_1d_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - - for (int i2 = 0; i2 < ne2; i2++) { - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); - - for (int i0 = 0; i0 < ne0; i0++) { - ggml_vec_dot_f16(ew0, dst_data + i0, - (ggml_fp16_t *) ((char *) src0->data + i1*nb02), - (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0); + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + for (int64_t i0 = 0; i0 < ne10; ++i0) { + dst_data[i0] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f16(ew0, &v, + (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 
+ nh + k)*ew0); + + dst_data[i0] += v; } } } } -static void ggml_compute_forward_conv_1d_f32( +static void ggml_compute_forward_conv_1d_s1_ph_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -13928,32 +13801,42 @@ static void ggml_compute_forward_conv_1d_f32( const int nth = params->nth; const int nk = ne00; + const int nh = nk/2; - const int ew0 = nk*ne01; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + const int ew0 = ggml_up32(ne01); + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - float * const wdata = (float *) params->wdata + 0; + // prepare kernel data (src0) + { + float * const wdata = (float *) params->wdata + 0; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - float * dst_data = wdata; + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + float * dst_data = wdata + i02*ew0*ne00; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ew0 + i01] = src[i00]; + } + } + } + } - for (int64_t i0 = 0; i0 < ne0; i0++) { - for (int64_t ik = 0; ik < nk; ik++) { - const int idx0 = i0*s0 + ik*d0 - p0; + // prepare source data (src1) + { + float * const wdata = (float *) params->wdata + ne02*ew0*ne00; - if(!(idx0 < 0 || idx0 >= ne10)) { - dst_data[i0*ew0 + i11*nk + ik] = src[idx0]; - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[(i10 + nh)*ew0 + i11] = src[i10]; } } } @@ -13975,225 +13858,35 @@ static void ggml_compute_forward_conv_1d_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - float * const wdata = (float *) params->wdata + 0; - - for (int i2 = 0; i2 < ne2; i2++) { - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); - - for (int i0 = 0; i0 < ne0; i0++) { - ggml_vec_dot_f32(ew0, dst_data + i0, - (float *) ((char *) src0->data + i1*nb02), - (float *) wdata + i2*nb2 + i0*ew0); - } - } - } -} - -static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, - ggml_fp16_t * A, - ggml_fp16_t * B, - float * C, - const int ith, const int nth) { - // does not seem to make a difference - int64_t m0, m1, n0, n1; - // patches per thread - if (m > n) { - n0 = 0; - n1 = n; - - // total patches in dst - const int np = m; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - m0 = dp*ith; - m1 = MIN(m0 + dp, np); - } else { - m0 = 0; - m1 = m; - - // total patches in dst - const int np = n; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - n0 = dp*ith; - n1 = MIN(n0 + dp, np); - } - - // block-tiling attempt - int64_t blck_n = 16; - int64_t blck_m = 16; - - // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB - // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K); - // if (blck_size > 0) { - // blck_0 = 4; - // blck_1 = blck_size / blck_0; - // if (blck_1 
< 0) { - // blck_1 = 1; - // } - // // blck_0 = (int64_t)sqrt(blck_size); - // // blck_1 = blck_0; - // } - // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1); - - for (int j = n0; j < n1; j+=blck_n) { - for (int i = m0; i < m1; i+=blck_m) { - // printf("i j k => %d %d %d\n", i, j, K); - for (int ii = i; ii < i + blck_m && ii < m1; ii++) { - for (int jj = j; jj < j + blck_n && jj < n1; jj++) { - ggml_vec_dot_f16(k, - C + ii*n + jj, - A + ii * k, - B + jj * k); - } - } - } - } -} - -// src0: kernel [OC, IC, K] -// src1: signal [N, IC, IL] -// dst: result [N, OL, IC*K] -static void ggml_compute_forward_conv_1d_stage_0_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - GGML_TENSOR_BINARY_OP_LOCALS; - - const int64_t N = ne12; - const int64_t IC = ne11; - const int64_t IL = ne10; - - const int64_t K = ne00; - - const int64_t OL = ne1; - - const int ith = params->ith; - const int nth = params->nth; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == GGML_TASK_INIT) { - memset(dst->data, 0, ggml_nbytes(dst)); - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // im2col: [N, IC, IL] => [N, OL, IC*K] - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; - - for (int64_t in = 0; in < N; in++) { - for (int64_t iol = 0; iol < OL; iol++) { - for (int64_t iic = ith; iic < IC; iic+=nth) { - - // micro kernel - ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K] - const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL] - - for (int64_t ik = 0; ik < K; ik++) { - const int64_t iil = iol*s0 + ik*d0 - p0; - - if (!(iil < 0 || iil >= IL)) { - dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]); - } - } - } + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + for (int64_t i0 = 0; i0 < ne10; ++i0) { + dst_data[i0] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f32(ew0, &v, + (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0] += v; } } } } -// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] -// src0: [OC, IC, K] -// src1: [N, OL, IC * K] -// result: [N, OC, OL] -static void ggml_compute_forward_conv_1d_stage_1_f16( +static void ggml_compute_forward_conv_1d_s1_ph( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - GGML_TENSOR_BINARY_OP_LOCALS; - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb0 == sizeof(float)); - - const int N = ne12; - const int OL = ne11; - - const int OC = ne02; - const int IC = 
ne01; - const int K = ne00; - - const int ith = params->ith; - const int nth = params->nth; - - int64_t m = OC; - int64_t n = OL; - int64_t k = IC * K; - - // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] - for (int i = 0; i < N; i++) { - ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] - ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] - float * C = (float *)dst->data + i * m * n; // [m, n] - - gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); - } -} - -static void ggml_compute_forward_conv_1d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch(src0->type) { + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_1d_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst); } break; default: { @@ -14202,43 +13895,7 @@ static void ggml_compute_forward_conv_1d( } } -static void ggml_compute_forward_conv_1d_stage_0( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch(src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -static void ggml_compute_forward_conv_1d_stage_1( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch(src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_conv_transpose_1d - -static void ggml_compute_forward_conv_transpose_1d_f16_f32( +static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14255,38 +13912,43 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00*ne01*ne02; + const int nk = ne00; + const int nh = nk/2; + + const int ew0 = ggml_up32(ne01); + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + // prepare kernel data (src0) { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; + ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ne02 + i02] = src[i00]; + dst_data[i00*ew0 + i01] = src[i00]; } } } } - // permute source data (src1) from (L x Cin) to (Cin x L) + // prepare source data (src1) { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; - ggml_fp16_t * dst_data = wdata; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; for (int64_t i11 = 
0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); + ggml_fp16_t * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]); + dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); } } } @@ -14298,10 +13960,8 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( return; } - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - // total rows in dst - const int nr = ne1; + const int nr = ne02; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -14310,26 +13970,23 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - ggml_fp16_t * const wdata_src = wdata + nk; - for (int i1 = ir0; i1 < ir1; i1++) { float * dst_data = (float *)((char *) dst->data + i1*nb1); - ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i10*ne11; - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - ggml_vec_dot_f16(ne02, &v, - (ggml_fp16_t *) wdata_src + i1n, - (ggml_fp16_t *) wdata_kernel + i00*ne02); - dst_data[i10*s0 + i00] += v; + for (int64_t i0 = 0; i0 < ne10; i0 += 2) { + dst_data[i0/2] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f16(ew0, &v, + (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0/2] += v; } } } } -static void ggml_compute_forward_conv_transpose_1d_f32( +static void ggml_compute_forward_conv_1d_s2_ph_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14346,24 +14003,29 @@ static void ggml_compute_forward_conv_transpose_1d_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00*ne01*ne02; + const int nk = ne00; + const int nh = nk/2; + + const int ew0 = ggml_up32(ne01); + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + // prepare kernel data (src0) { float * const wdata = (float *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i01*ne00*ne02; + float * dst_data = wdata + i02*ew0*ne00; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00]; + dst_data[i00*ew0 + i01] = src[i00]; } } } @@ -14371,13 +14033,13 @@ static void ggml_compute_forward_conv_transpose_1d_f32( // prepare source data (src1) { - float * const wdata = (float *) params->wdata + nk; - float * dst_data = wdata; + float * const wdata = (float *) params->wdata + ne02*ew0*ne00; for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = src[i10]; + dst_data[(i10 + nh)*ew0 + i11] = src[i10]; } } } @@ -14389,10 +14051,8 @@ static void ggml_compute_forward_conv_transpose_1d_f32( return; } - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - // total 
rows in dst - const int nr = ne1; + const int nr = ne02; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -14401,26 +14061,23 @@ static void ggml_compute_forward_conv_transpose_1d_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - float * const wdata = (float *) params->wdata + 0; - float * const wdata_src = wdata + nk; - for (int i1 = ir0; i1 < ir1; i1++) { float * dst_data = (float *)((char *) dst->data + i1*nb1); - float * wdata_kernel = wdata + i1*ne02*ne00; - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i10*ne11; - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - ggml_vec_dot_f32(ne02, &v, - wdata_src + i1n, - wdata_kernel + i00*ne02); - dst_data[i10*s0 + i00] += v; + for (int64_t i0 = 0; i0 < ne10; i0 += 2) { + dst_data[i0/2] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f32(ew0, &v, + (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0/2] += v; } } } } -static void ggml_compute_forward_conv_transpose_1d( +static void ggml_compute_forward_conv_1d_s2_ph( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14428,11 +14085,11 @@ static void ggml_compute_forward_conv_transpose_1d( switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst); } break; default: { @@ -14441,6 +14098,27 @@ static void ggml_compute_forward_conv_transpose_1d( } } +// ggml_compute_forward_conv_1d + +static void ggml_compute_forward_conv_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + GGML_ASSERT(d0 == 1); // dilation not supported + GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported + if (s0 == 1) { + ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst); + } else if (s0 == 2) { + ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); + } else { + GGML_ASSERT(false); // only stride 1 and 2 supported + } +} + // ggml_compute_forward_conv_2d static void ggml_compute_forward_conv_2d_f16_f32( @@ -14483,22 +14161,20 @@ static void ggml_compute_forward_conv_2d_f16_f32( { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i13 = 0; i13 < ne13; i13++) { - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); - ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0); - - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 = 0; ik0 < nk0; ik0++) { - const int idx0 = i0*s0 + ik0*d0 - p0; - const int idx1 = i1*s1 + ik1*d1 - p1; - - if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); - } + for (int i12 = 0; i12 < ne12; i12++) { + const float * const src = (float *)((char *) src1->data + i12*nb12); + ggml_fp16_t * dst_data = wdata; + + for (int i1 = 
0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + for (int ik1 = 0; ik1 < nk1; ik1++) { + for (int ik0 = 0; ik0 < nk0; ik0++) { + const int idx0 = i0*s0 + ik0*d0 - p0; + const int idx1 = i1*s1 + ik1*d1 - p1; + + if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { + dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = + GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); } } } @@ -16781,18 +16457,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); } break; - case GGML_OP_CONV_1D_STAGE_0: - { - ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor); - } break; - case GGML_OP_CONV_1D_STAGE_1: - { - ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor); - } break; - case GGML_OP_CONV_TRANSPOSE_1D: - { - ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); - } break; case GGML_OP_CONV_2D: { ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -17718,22 +17382,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_1D_STAGE_0: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_CONV_1D_STAGE_1: - { - GGML_ASSERT(false); // TODO: not implemented - } break; case GGML_OP_CONV_2D: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_TRANSPOSE_1D: - { - GGML_ASSERT(false); // TODO: not implemented - } break; case GGML_OP_CONV_TRANSPOSE_2D: { GGML_ASSERT(false); // TODO: not implemented @@ -18575,68 +18227,21 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { GGML_ASSERT(node->src[1]->ne[2] == 1); GGML_ASSERT(node->src[1]->ne[3] == 1); - const int64_t ne00 = node->src[0]->ne[0]; - const int64_t ne01 = node->src[0]->ne[1]; - const int64_t ne02 = node->src[0]->ne[2]; - - const int64_t ne10 = node->src[1]->ne[0]; - const int64_t ne11 = node->src[1]->ne[1]; - - const int64_t ne0 = node->ne[0]; - const int64_t ne1 = node->ne[1]; - const int64_t nk = ne00; - const int64_t ew0 = nk * ne01; - - UNUSED(ne02); - UNUSED(ne10); - UNUSED(ne11); - size_t cur = 0; + const int nk = node->src[0]->ne[0]; if (node->src[0]->type == GGML_TYPE_F16 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] + ); } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(float)*(ne0*ne1*ew0); - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CONV_1D_STAGE_0: - { - n_tasks = n_threads; - } break; - case GGML_OP_CONV_1D_STAGE_1: - { - n_tasks = n_threads; - } break; - case GGML_OP_CONV_TRANSPOSE_1D: - { - n_tasks = n_threads; - - GGML_ASSERT(node->src[0]->ne[3] == 1); - GGML_ASSERT(node->src[1]->ne[2] == 1); - GGML_ASSERT(node->src[1]->ne[3] == 1); - - const int64_t ne00 = node->src[0]->ne[0]; // K - const int64_t ne01 = node->src[0]->ne[1]; // Cout - const int64_t ne02 = node->src[0]->ne[2]; // Cin - - const int64_t ne10 = node->src[1]->ne[0]; // L - const int64_t ne11 = node->src[1]->ne[1]; // Cin - - size_t cur = 0; - if (node->src[0]->type == GGML_TYPE_F16 && - node->src[1]->type == 
GGML_TYPE_F32) { - cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02; - cur += sizeof(ggml_fp16_t)*ne10*ne11; - } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { - cur += sizeof(float)*ne00*ne01*ne02; - cur += sizeof(float)*ne10*ne11; + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] + ); } else { GGML_ASSERT(false); } @@ -19796,6 +19401,9 @@ static enum ggml_opt_result ggml_opt_adam( // run the optimizer for (int t = 0; t < params.adam.n_iter; ++t) { + if (cancel) { + break; + } opt->iter = iter0 + t + 1; GGML_PRINT_DEBUG ("=== iter %d ===\n", t); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a1cedf0f8..804689fb7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -176,11 +176,11 @@ endif() # # test-grad0 -set(TEST_TARGET test-grad0) -add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml) -add_test(NAME ${TEST_TARGET} COMMAND $) -set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +#set(TEST_TARGET test-grad0) +#add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +#target_link_libraries(${TEST_TARGET} PRIVATE ggml) +#add_test(NAME ${TEST_TARGET} COMMAND $) +#set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") # # test-opt @@ -350,8 +350,8 @@ set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_ # # test-xpos -set(TEST_TARGET test-xpos) -add_executable(${TEST_TARGET} ${TEST_TARGET}.c) -target_link_libraries(${TEST_TARGET} PRIVATE ggml) -add_test(NAME ${TEST_TARGET} COMMAND $) -set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +#set(TEST_TARGET test-xpos) +#add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +#target_link_libraries(${TEST_TARGET} PRIVATE ggml) +#add_test(NAME ${TEST_TARGET} COMMAND $) +#set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") From c05714fbd0dd6213c07546001aa07df1c3b56f13 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 3 Oct 2023 15:15:25 +0200 Subject: [PATCH 02/23] move get_alignment from buffer to backend --- examples/gpt-2/main.cpp | 6 ++++-- include/ggml/ggml-backend.h | 3 ++- src/ggml-backend.c | 29 ++++++++++++----------------- src/ggml-cuda.cu | 12 ++++++------ 4 files changed, 24 insertions(+), 26 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 184eb8e9a..e9d20b522 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -833,7 +833,9 @@ int main(int argc, char ** argv) { struct ggml_allocr * allocr = NULL; // allocate the compute buffer { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + // alignment required by the backend + size_t align = ggml_backend_get_alignment(model.backend); + allocr = ggml_allocr_new_measure(align); // create the worst case graph for memory usage estimation int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); @@ -841,7 +843,7 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf); // recreate the allocator with the required memory ggml_allocr_free(allocr); diff --git a/include/ggml/ggml-backend.h 
b/include/ggml/ggml-backend.h index 17e5a38a7..c71f50225 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -14,7 +14,6 @@ extern "C" { struct ggml_backend_buffer_interface { void (*free_buffer) (ggml_backend_buffer_t buffer); - size_t (*get_alignment) (ggml_backend_buffer_t buffer); void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback @@ -49,6 +48,7 @@ extern "C" { // buffer allocation ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size); + size_t (*get_alignment)(ggml_backend_t backend); // tensor data access // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize @@ -82,6 +82,7 @@ extern "C" { static inline const char * ggml_backend_name(ggml_backend_t backend) { return backend->interface.get_name(backend); } static inline void ggml_backend_free(ggml_backend_t backend) { backend->interface.free(backend); } static inline ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { return backend->interface.alloc_buffer(backend, size); } + static inline size_t ggml_backend_get_alignment(ggml_backend_t backend) { return backend->interface.get_alignment(backend); } static inline void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { get_backend(tensor)->interface.set_tensor_async(get_backend(tensor), tensor, data, offset, size); } static inline void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { get_backend(tensor)->interface.get_tensor_async(get_backend(tensor), tensor, data, offset, size); } static inline void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { get_backend(tensor)->interface.set_tensor_async(get_backend(tensor), tensor, data, offset, size); get_backend(tensor)->interface.synchronize(get_backend(tensor)); } diff --git a/src/ggml-backend.c b/src/ggml-backend.c index d49c5e7a1..da0d9c639 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -34,10 +34,7 @@ void ggml_backend_buffer_free(struct ggml_backend_buffer * buffer) { } size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { - if (buffer->interface.get_alignment) { - return buffer->interface.get_alignment(buffer); - } - return 64; + return ggml_backend_get_alignment(buffer->backend); } void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { @@ -131,13 +128,6 @@ static void ggml_backend_cpu_free(ggml_backend_t backend) { free(backend); } -static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 - -static size_t ggml_backend_cpu_buffer_get_alignment(ggml_backend_buffer_t buffer) { - return TENSOR_ALIGNMENT; - UNUSED(buffer); -} - static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { return (void *)buffer->context; } @@ -149,29 +139,33 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { static struct ggml_backend_buffer_interface cpu_backend_buffer_interface = { /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_get_alignment, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .get_alloc_size = */ NULL, // 
defaults to ggml_nbytes - /* .init_tensor = */ NULL, - /* .free_tensor = */ NULL, + /* .init_tensor = */ NULL, // no initialization required + /* .free_tensor = */ NULL, // no cleanup required }; // for buffers from ptr, free is not called static struct ggml_backend_buffer_interface cpu_backend_buffer_interface_from_ptr = { - /* .free_buffer = */ NULL, - /* .get_alignment = */ ggml_backend_cpu_buffer_get_alignment, + /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .init_tensor = */ NULL, /* .free_tensor = */ NULL, }; +static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { - void * data = malloc(size + TENSOR_ALIGNMENT); + void * data = malloc(size + TENSOR_ALIGNMENT); // malloc may return an address that is not aligned return ggml_backend_buffer_init(cpu_backend_buffer_interface, backend, data, size); } +static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) { + return TENSOR_ALIGNMENT; + UNUSED(backend); +} + static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); @@ -270,6 +264,7 @@ static struct ggml_backend_interface cpu_backend_interface = { /* .get_name = */ ggml_backend_cpu_name, /* .free = */ ggml_backend_cpu_free, /* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_get_alignment, /* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async, /* .synchronize = */ ggml_backend_cpu_synchronize, diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 74b443e6c..9d596e15b 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7572,11 +7572,6 @@ static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { delete ctx; } -static size_t ggml_backend_cuda_buffer_get_alignment(ggml_backend_buffer_t buffer) { - return 128; - UNUSED(buffer); -} - static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; return ctx->device; @@ -7614,7 +7609,6 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g static struct ggml_backend_buffer_interface cuda_backend_buffer_interface = { /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, - /* .get_alignment = */ ggml_backend_cuda_buffer_get_alignment, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, @@ -7627,6 +7621,11 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe return ggml_backend_buffer_init(cuda_backend_buffer_interface, backend, ctx, size); } +static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) { + return 128; + UNUSED(backend); +} + static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(tensor->data != NULL && 
"tensor not allocated"); @@ -7733,6 +7732,7 @@ static ggml_backend_interface cuda_backend_interface = { /* .get_name = */ ggml_backend_cuda_name, /* .free = */ ggml_backend_cuda_free, /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer, + /* .get_alignment = */ ggml_backend_cuda_get_alignment, /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, /* .synchronize = */ ggml_backend_cuda_synchronize, From da82697cfd03cb58b78c089a3c5d74094592e679 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 3 Oct 2023 16:03:21 +0200 Subject: [PATCH 03/23] ggml-cuda : fix ggml_cuda_op_mul_mat_vec_q --- src/ggml-cuda.cu | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 9d596e15b..163708c1e 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7,8 +7,6 @@ #include #include -#define GGML_CUDA_FORCE_DMMV // FIXME: ggml_cuda_op_mul_mat_vec_q produces wrong results with GPT-2 - #if defined(GGML_USE_HIPBLAS) #include #include @@ -6733,7 +6731,8 @@ static void ggml_cuda_op_mul_mat( if (convert_src1_to_q8_1) { src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); - if (split && src1_on_device && src1_is_contiguous) { + // FIXME: why split only? src1 never gets quantized, breaks ggml-backend/GPT-2 + if (/*split &&*/ src1_on_device && src1_is_contiguous) { quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } @@ -7582,7 +7581,16 @@ static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buff int64_t row_high = ggml_nrows(tensor); int64_t nrows_split = row_high - row_low; - return ggml_nbytes_split(tensor, nrows_split); + size_t size = ggml_nbytes_split(tensor, nrows_split); + + int64_t ne0 = tensor->ne[0]; + + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) + * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + } + + return size; UNUSED(buffer); } @@ -7601,7 +7609,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g size_t size = ggml_backend_cuda_buffer_get_alloc_size(buffer, tensor); if (size > original_size && tensor->view_src == nullptr) { - CUDA_CHECK(cudaMemset((char *) tensor->data + original_size, 0, size - original_size)); + CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, size - original_size, g_cudaStreams[g_main_device][0])); } UNUSED(buffer); @@ -7629,7 +7637,7 @@ static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) { static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - //GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0])); @@ -7675,7 +7683,6 @@ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_gr UNUSED(plan); } -#include static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_compute_params params = {}; params.type = GGML_TASK_COMPUTE; @@ -7708,17 +7715,19 @@ static void 
ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph cudaDeviceSynchronize(); std::vector tmp(ggml_nelements(node), 0.0f); cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost); - printf("\n%s (%s) (%s %s): ", node->name, ggml_op_name(node->op), + printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op), ggml_type_name(node->src[0]->type), - node->src[1] ? ggml_type_name(node->src[1]->type) : "none"); + node->src[1] ? ggml_type_name(node->src[1]->type) : "none", + node->src[0]->name, + node->src[1] ? node->src[1]->name : "none"); double sum = 0.0; double sq_sum = 0.0; for (int i = 0; i < ggml_nelements(node); i++) { - //printf("%f ", tmp[i]); + printf("%f ", tmp[i]); sum += tmp[i]; sq_sum += tmp[i]*tmp[i]; } - //printf("\n"); + printf("\n"); printf("sum: %f, ", sum); printf("sq_sum: %f\n", sq_sum); } From 3cf87a3e7f969147f2b4a19bce1075b2c719ee9a Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 3 Oct 2023 16:25:08 +0200 Subject: [PATCH 04/23] gpt-2 : better check for CPU backend when settings n_threads --- examples/gpt-2/main.cpp | 8 +++----- src/ggml-cuda.cu | 9 +++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index e9d20b522..87cdf9065 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -760,11 +760,9 @@ bool gpt2_eval( ggml_allocr_alloc_graph(allocr, gf); // run the computation -#ifndef GGML_USE_CUBLAS - // FIXME: the backend may be CPU even if CUDA is enabled - // if (model.backend.id == GGML_BACKEND_ID_CPU) - ggml_backend_cpu_set_n_threads(model.backend, n_threads); -#endif + if (strcmp(ggml_backend_name(model.backend), "CPU") == 0) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } ggml_backend_graph_compute(model.backend, gf); //if (n_past%100 == 0) { diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 163708c1e..87c44da8f 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -62,6 +62,7 @@ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaMemcpyKind hipMemcpyKind #define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags @@ -1576,7 +1577,7 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest } template -static __global__ void k_get_rows(const void * x, const int * y, dst_t * dst, const int ncols) { +static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) { const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2; const int row = blockDim.y*blockIdx.y + threadIdx.y; @@ -4586,7 +4587,7 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale template -static void get_rows_cuda(const void * x, const int * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { +static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); const dim3 block_nums(block_num_x, nrows, 1); @@ -5810,7 +5811,7 @@ static void ggml_cuda_op_repeat( GGML_ASSERT(nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); - // TODO: very inefficient, implement in a kernel + // TODO: very inefficient, implement in 
a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors for (int i3 = 0; i3 < nr3; i3++) { for (int k3 = 0; k3 < ne03; k3++) { for (int i2 = 0; i2 < nr2; i2++) { @@ -5847,7 +5848,7 @@ static void ggml_cuda_op_get_rows( const int ncols = src0->ne[0]; const int nrows = ggml_nelements(src1); - const int * src1_i32 = (const int *) src1_d; + const int32_t * src1_i32 = (const int32_t *) src1_d; switch (src0->type) { case GGML_TYPE_F16: From 319b4bc8563a9bbf983ae5f6487e78db1e0d8ca2 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 3 Oct 2023 22:11:06 +0200 Subject: [PATCH 05/23] .gitignore : add .clangd --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d7e11716a..a66ac17df 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ compile_commands.json CMakeSettings.json .vs/ .vscode/ +.clangd .exrc .cache @@ -32,4 +33,4 @@ zig-cache/ *.sw? -__pycache__/ \ No newline at end of file +__pycache__/ From b527d48cacdbeab7aed9412773d935eb6ad55c68 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 4 Oct 2023 15:17:29 +0200 Subject: [PATCH 06/23] merge master --- examples/CMakeLists.txt | 6 +- include/ggml/ggml.h | 12 + src/ggml-alloc.c | 4 - src/ggml.c | 793 ++++++++++++++++++++++++++++++---------- 4 files changed, 608 insertions(+), 207 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index c0201c131..e3404fb8b 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -19,11 +19,11 @@ target_link_libraries(common-ggml PRIVATE ggml) target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) add_subdirectory(gpt-2) -#add_subdirectory(gpt-j) +add_subdirectory(gpt-j) add_subdirectory(whisper) add_subdirectory(mnist) -#add_subdirectory(gpt-neox) -#add_subdirectory(dolly-v2) +add_subdirectory(gpt-neox) +add_subdirectory(dolly-v2) add_subdirectory(replit) add_subdirectory(mpt) add_subdirectory(starcoder) diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index db7cad0dc..a26b9119b 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -401,10 +401,14 @@ extern "C" { GGML_OP_CLAMP, GGML_OP_CONV_1D, GGML_OP_CONV_2D, + GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, + GGML_OP_CONV_1D_STAGE_0, // internal + GGML_OP_CONV_1D_STAGE_1, // internal + GGML_OP_UPSCALE, // nearest interpolate GGML_OP_FLASH_ATTN, @@ -1387,6 +1391,14 @@ extern "C" { int s, int d); + GGML_API struct ggml_tensor * ggml_conv_transpose_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0); + GGML_API struct ggml_tensor * ggml_conv_2d( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index afb4e10cf..44cb97481 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -585,7 +585,3 @@ size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { return alloc->max_size; } - -size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { - return alloc->max_size; -} diff --git a/src/ggml.c b/src/ggml.c index b1d11ba5c..aabe2e4df 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4081,12 +4081,16 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ALIBI", "CLAMP", "CONV_1D", + "CONV_TRANSPOSE_1D", "CONV_2D", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", + "CONV_1D_STAGE_0", + "CONV_1D_STAGE_1", + "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", @@ -4112,7 +4116,7 @@ static const char * 
GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); +static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -4163,12 +4167,16 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "alibi(x)", "clamp(x)", "conv_1d(x)", + "conv_transpose_1d(x)", "conv_2d(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", + "conv_1d_stage_0(x)", + "conv_1d_stage_1(x)", + "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", @@ -4194,7 +4202,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); +static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4223,7 +4231,10 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_DIAG_MASK_INF ] = true; p[GGML_OP_DIAG_MASK_ZERO ] = true; p[GGML_OP_CONV_1D ] = true; + p[GGML_OP_CONV_1D_STAGE_0 ] = true; + p[GGML_OP_CONV_1D_STAGE_1 ] = true; p[GGML_OP_CONV_2D ] = true; + p[GGML_OP_CONV_TRANSPOSE_1D ] = true; p[GGML_OP_CONV_TRANSPOSE_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -4939,8 +4950,8 @@ static struct ggml_tensor * ggml_new_tensor_impl( *result = (struct ggml_tensor) { /*.type =*/ type, - /*.backend =*/ GGML_BACKEND_CPU, - /*.buffer =*/ NULL, + /*.backend =*/ view_src ? view_src->backend : GGML_BACKEND_CPU, + /*.buffer =*/ view_src ? view_src->buffer : NULL, /*.n_dims =*/ n_dims, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, @@ -4973,11 +4984,6 @@ static struct ggml_tensor * ggml_new_tensor_impl( result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; } - if (view_src != NULL) { - result->backend = view_src->backend; - result->buffer = view_src->buffer; - } - ctx->n_objects++; return result; @@ -5792,7 +5798,7 @@ static struct ggml_tensor * ggml_mul_impl( bool inplace) { // TODO: support less-strict constraint // GGML_ASSERT(ggml_can_repeat(b, a)); - //GGML_ASSERT(ggml_can_repeat_rows(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); bool is_node = false; @@ -7509,14 +7515,17 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -GGML_API struct ggml_tensor * ggml_conv_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0) { - GGML_ASSERT(ggml_is_matrix(b)); +// im2col: [N, IC, IL] => [N, OL, IC*K] +// a: [OC,IC, K] +// b: [N, IC, IL] +// result: [N, OL, IC*K] +static struct ggml_tensor * ggml_conv_1d_stage_0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { GGML_ASSERT(a->ne[1] == b->ne[1]); bool is_node = false; @@ -7525,16 +7534,54 @@ GGML_API struct ggml_tensor * ggml_conv_1d( is_node = true; } + const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const int64_t ne[4] = { - ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), - a->ne[2], 1, 1, + a->ne[1] * a->ne[0], + OL, + b->ne[2], + 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); int32_t params[] = { s0, p0, d0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CONV_1D; + result->op = GGML_OP_CONV_1D_STAGE_0; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_conv_1d_stage_1 + +// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] +// a: [OC, IC, K] +// b: [N, OL, IC * K] +// result: [N, OC, OL] +static struct ggml_tensor * ggml_conv_1d_stage_1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + b->ne[1], + a->ne[2], + b->ne[2], + 1, + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_CONV_1D_STAGE_1; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7542,6 +7589,53 @@ GGML_API struct ggml_tensor * ggml_conv_1d( return result; } +// ggml_conv_1d + +GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0); + result = ggml_conv_1d_stage_1(ctx, a, result); + return result; +} + +// GGML_API struct ggml_tensor * ggml_conv_1d( +// struct ggml_context * ctx, +// struct ggml_tensor * a, +// struct ggml_tensor * b, +// int s0, +// int p0, +// int d0) { +// GGML_ASSERT(ggml_is_matrix(b)); +// GGML_ASSERT(a->ne[1] == b->ne[1]); +// bool is_node = false; + +// if (a->grad || b->grad) { +// GGML_ASSERT(false); // TODO: implement backward +// is_node = true; +// } + +// const int64_t ne[4] = { +// ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), +// a->ne[2], 1, 1, +// }; +// struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + +// int32_t params[] = { s0, p0, d0 }; +// ggml_set_op_params(result, params, sizeof(params)); + +// result->op = GGML_OP_CONV_1D; +// result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; +// result->src[0] = a; +// result->src[1] = b; + +// return result; +// } + // ggml_conv_1d_ph struct ggml_tensor* ggml_conv_1d_ph( @@ -7553,6 +7647,50 @@ struct ggml_tensor* ggml_conv_1d_ph( return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); } +// ggml_conv_transpose_1d + +static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; +} + +GGML_API struct ggml_tensor * ggml_conv_transpose_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + GGML_ASSERT(ggml_is_matrix(b)); + GGML_ASSERT(a->ne[2] == b->ne[1]); + GGML_ASSERT(a->ne[3] == 1); + + GGML_ASSERT(p0 == 0); + GGML_ASSERT(d0 == 1); + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), + a->ne[1], b->ne[2], 1, + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + int32_t params[] = { s0, p0, d0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_TRANSPOSE_1D; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + // ggml_conv_2d struct ggml_tensor * ggml_conv_2d( @@ -13692,7 +13830,7 @@ static void ggml_compute_forward_rope_back( // ggml_compute_forward_conv_1d -static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( +static void ggml_compute_forward_conv_1d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -13710,42 +13848,33 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( const int nth = params->nth; const int nk = ne00; - const int nh = nk/2; - const int ew0 = ggml_up32(ne01); + // size of the convolution row - the kernel size unrolled across all input channels + const int ew0 = nk*ne01; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; - } - } - } - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + ggml_fp16_t * dst_data = wdata; - // prepare source data (src1) - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + for (int64_t i0 = 0; i0 < ne0; i0++) { + for (int64_t ik = 0; ik < nk; ik++) { + const int idx0 = i0*s0 + ik*d0 - p0; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - ggml_fp16_t * dst_data = wdata; - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); + if(!(idx0 < 0 || idx0 >= ne10)) { + dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]); + } } } } @@ -13758,7 +13887,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( } // total rows in dst - const int nr = ne02; + const int nr = ne2; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -13767,23 +13896,22 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; ++i0) { - dst_data[i0] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - ggml_vec_dot_f16(ew0, &v, - (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0] += v; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + + for (int i0 = 0; i0 < ne0; i0++) { + ggml_vec_dot_f16(ew0, dst_data + i0, + (ggml_fp16_t *) ((char *) src0->data 
+ i1*nb02), + (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0); } } } } -static void ggml_compute_forward_conv_1d_s1_ph_f32( +static void ggml_compute_forward_conv_1d_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -13801,42 +13929,32 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32( const int nth = params->nth; const int nk = ne00; - const int nh = nk/2; - const int ew0 = ggml_up32(ne01); + const int ew0 = nk*ne01; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) - { - float * const wdata = (float *) params->wdata + 0; + float * const wdata = (float *) params->wdata + 0; - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i02*ew0*ne00; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; - } - } - } - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; - // prepare source data (src1) - { - float * const wdata = (float *) params->wdata + ne02*ew0*ne00; + for (int64_t i0 = 0; i0 < ne0; i0++) { + for (int64_t ik = 0; ik < nk; ik++) { + const int idx0 = i0*s0 + ik*d0 - p0; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - float * dst_data = wdata; - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = src[i10]; + if(!(idx0 < 0 || idx0 >= ne10)) { + dst_data[i0*ew0 + i11*nk + ik] = src[idx0]; + } } } } @@ -13858,35 +13976,225 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; ++i0) { - dst_data[i0] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - ggml_vec_dot_f32(ew0, &v, - (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0] += v; + float * const wdata = (float *) params->wdata + 0; + + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + + for (int i0 = 0; i0 < ne0; i0++) { + ggml_vec_dot_f32(ew0, dst_data + i0, + (float *) ((char *) src0->data + i1*nb02), + (float *) wdata + i2*nb2 + i0*ew0); } } } } -static void ggml_compute_forward_conv_1d_s1_ph( +static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, + ggml_fp16_t * A, + ggml_fp16_t * B, + float * C, + const int ith, const int nth) { + // does not seem to make a difference + int64_t m0, m1, n0, n1; + // patches per thread + if (m > n) { + n0 = 0; + n1 = n; + + // total patches in dst + const int np = m; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + m0 = dp*ith; + m1 = MIN(m0 + dp, np); + } else { + m0 = 0; + m1 = m; + + // total patches in dst + const 
int np = n; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + n0 = dp*ith; + n1 = MIN(n0 + dp, np); + } + + // block-tiling attempt + int64_t blck_n = 16; + int64_t blck_m = 16; + + // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB + // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K); + // if (blck_size > 0) { + // blck_0 = 4; + // blck_1 = blck_size / blck_0; + // if (blck_1 < 0) { + // blck_1 = 1; + // } + // // blck_0 = (int64_t)sqrt(blck_size); + // // blck_1 = blck_0; + // } + // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1); + + for (int j = n0; j < n1; j+=blck_n) { + for (int i = m0; i < m1; i+=blck_m) { + // printf("i j k => %d %d %d\n", i, j, K); + for (int ii = i; ii < i + blck_m && ii < m1; ii++) { + for (int jj = j; jj < j + blck_n && jj < n1; jj++) { + ggml_vec_dot_f16(k, + C + ii*n + jj, + A + ii * k, + B + jj * k); + } + } + } + } +} + +// src0: kernel [OC, IC, K] +// src1: signal [N, IC, IL] +// dst: result [N, OL, IC*K] +static void ggml_compute_forward_conv_1d_stage_0_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - switch (src0->type) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = ne12; + const int64_t IC = ne11; + const int64_t IL = ne10; + + const int64_t K = ne00; + + const int64_t OL = ne1; + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IL] => [N, OL, IC*K] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t iol = 0; iol < OL; iol++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { + + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K] + const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL] + + for (int64_t ik = 0; ik < K; ik++) { + const int64_t iil = iol*s0 + ik*d0 - p0; + + if (!(iil < 0 || iil >= IL)) { + dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]); + } + } + } + } + } + } +} + +// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] +// src0: [OC, IC, K] +// src1: [N, OL, IC * K] +// result: [N, OC, OL] +static void ggml_compute_forward_conv_1d_stage_1_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne12; + const int 
OL = ne11; + + const int OC = ne02; + const int IC = ne01; + const int K = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OL; + int64_t k = IC * K; + + // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m, n] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + +static void ggml_compute_forward_conv_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_f32(params, src0, src1, dst); } break; default: { @@ -13895,7 +14203,43 @@ static void ggml_compute_forward_conv_1d_s1_ph( } } -static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( +static void ggml_compute_forward_conv_1d_stage_0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_1d_stage_1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_conv_transpose_1d + +static void ggml_compute_forward_conv_transpose_1d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -13912,43 +14256,38 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00; - const int nh = nk/2; - - const int ew0 = ggml_up32(ne01); + const int nk = ne00*ne01*ne02; - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) + // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; + dst_data[i00*ne02 + i02] = src[i00]; } } } } - // prepare source data (src1) + // permute source data (src1) from (L x Cin) to (Cin x L) { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; + ggml_fp16_t * 
dst_data = wdata; for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); - ggml_fp16_t * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); + dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]); } } } @@ -13960,8 +14299,10 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( return; } + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + // total rows in dst - const int nr = ne02; + const int nr = ne1; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -13970,23 +14311,26 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata_src = wdata + nk; + for (int i1 = ir0; i1 < ir1; i1++) { float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; i0 += 2) { - dst_data[i0/2] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - ggml_vec_dot_f16(ew0, &v, - (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0/2] += v; + ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f16(ne02, &v, + (ggml_fp16_t *) wdata_src + i1n, + (ggml_fp16_t *) wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; } } } } -static void ggml_compute_forward_conv_1d_s2_ph_f32( +static void ggml_compute_forward_conv_transpose_1d_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14003,29 +14347,24 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00; - const int nh = nk/2; - - const int ew0 = ggml_up32(ne01); + const int nk = ne00*ne01*ne02; - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) + // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) { float * const wdata = (float *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i02*ew0*ne00; + float * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; + dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00]; } } } @@ -14033,13 +14372,13 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( // prepare source data (src1) { - float * const wdata = (float *) params->wdata + ne02*ew0*ne00; + float * const wdata = (float *) params->wdata + nk; + float * dst_data = wdata; for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); - float * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = src[i10]; + dst_data[i10*ne11 + i11] = src[i10]; } } } @@ -14051,8 +14390,10 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( return; } + const int32_t s0 = ((const 
int32_t*)(dst->op_params))[0]; + // total rows in dst - const int nr = ne02; + const int nr = ne1; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -14061,23 +14402,26 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); + float * const wdata = (float *) params->wdata + 0; + float * const wdata_src = wdata + nk; + for (int i1 = ir0; i1 < ir1; i1++) { float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; i0 += 2) { - dst_data[i0/2] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - ggml_vec_dot_f32(ew0, &v, - (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0/2] += v; + float * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f32(ne02, &v, + wdata_src + i1n, + wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; } } } } -static void ggml_compute_forward_conv_1d_s2_ph( +static void ggml_compute_forward_conv_transpose_1d( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14085,11 +14429,11 @@ static void ggml_compute_forward_conv_1d_s2_ph( switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst); + ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); } break; default: { @@ -14098,27 +14442,6 @@ static void ggml_compute_forward_conv_1d_s2_ph( } } -// ggml_compute_forward_conv_1d - -static void ggml_compute_forward_conv_1d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - GGML_ASSERT(d0 == 1); // dilation not supported - GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported - if (s0 == 1) { - ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst); - } else if (s0 == 2) { - ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); - } else { - GGML_ASSERT(false); // only stride 1 and 2 supported - } -} - // ggml_compute_forward_conv_2d static void ggml_compute_forward_conv_2d_f16_f32( @@ -14161,20 +14484,22 @@ static void ggml_compute_forward_conv_2d_f16_f32( { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i12*nb12); - ggml_fp16_t * dst_data = wdata; - - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 = 0; ik0 < nk0; ik0++) { - const int idx0 = i0*s0 + ik0*d0 - p0; - const int idx1 = i1*s1 + ik1*d1 - p1; - - if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + for (int i13 = 0; i13 < ne13; i13++) { + for (int i12 = 0; i12 < ne12; i12++) { + const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); + ggml_fp16_t * dst_data = wdata + 
i13*(ne1*ne0*ew0); + + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + for (int ik1 = 0; ik1 < nk1; ik1++) { + for (int ik0 = 0; ik0 < nk0; ik0++) { + const int idx0 = i0*s0 + ik0*d0 - p0; + const int idx1 = i1*s1 + ik1*d1 - p1; + + if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { + dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = + GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + } } } } @@ -16457,6 +16782,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); } break; + case GGML_OP_CONV_1D_STAGE_0: + { + ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_1D_STAGE_1: + { + ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); + } break; case GGML_OP_CONV_2D: { ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -17382,10 +17719,22 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CONV_1D_STAGE_0: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D_STAGE_1: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_CONV_2D: { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_CONV_TRANSPOSE_2D: { GGML_ASSERT(false); // TODO: not implemented @@ -18227,21 +18576,68 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { GGML_ASSERT(node->src[1]->ne[2] == 1); GGML_ASSERT(node->src[1]->ne[3] == 1); + const int64_t ne00 = node->src[0]->ne[0]; + const int64_t ne01 = node->src[0]->ne[1]; + const int64_t ne02 = node->src[0]->ne[2]; + + const int64_t ne10 = node->src[1]->ne[0]; + const int64_t ne11 = node->src[1]->ne[1]; + + const int64_t ne0 = node->ne[0]; + const int64_t ne1 = node->ne[1]; + const int64_t nk = ne00; + const int64_t ew0 = nk * ne01; + + UNUSED(ne02); + UNUSED(ne10); + UNUSED(ne11); + size_t cur = 0; - const int nk = node->src[0]->ne[0]; if (node->src[0]->type == GGML_TYPE_F16 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + - ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] - ); + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + - ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] - ); + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*(ne0*ne1*ew0); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_1D_STAGE_0: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_1D_STAGE_1: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); + + const int64_t ne00 = node->src[0]->ne[0]; // K + const int64_t ne01 = node->src[0]->ne[1]; // Cout + const int64_t 
ne02 = node->src[0]->ne[2]; // Cin + + const int64_t ne10 = node->src[1]->ne[0]; // L + const int64_t ne11 = node->src[1]->ne[1]; // Cin + + size_t cur = 0; + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02; + cur += sizeof(ggml_fp16_t)*ne10*ne11; + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur += sizeof(float)*ne00*ne01*ne02; + cur += sizeof(float)*ne10*ne11; } else { GGML_ASSERT(false); } @@ -19401,9 +19797,6 @@ static enum ggml_opt_result ggml_opt_adam( // run the optimizer for (int t = 0; t < params.adam.n_iter; ++t) { - if (cancel) { - break; - } opt->iter = iter0 + t + 1; GGML_PRINT_DEBUG ("=== iter %d ===\n", t); From abf2669e765cca1162e2b5fc3a7427da9e7e0d51 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 4 Oct 2023 15:23:09 +0200 Subject: [PATCH 07/23] restore tests/CMakeLists.txt --- tests/CMakeLists.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 804689fb7..a1cedf0f8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -176,11 +176,11 @@ endif() # # test-grad0 -#set(TEST_TARGET test-grad0) -#add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) -#target_link_libraries(${TEST_TARGET} PRIVATE ggml) -#add_test(NAME ${TEST_TARGET} COMMAND $) -#set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +set(TEST_TARGET test-grad0) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") # # test-opt @@ -350,8 +350,8 @@ set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_ # # test-xpos -#set(TEST_TARGET test-xpos) -#add_executable(${TEST_TARGET} ${TEST_TARGET}.c) -#target_link_libraries(${TEST_TARGET} PRIVATE ggml) -#add_test(NAME ${TEST_TARGET} COMMAND $) -#set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +set(TEST_TARGET test-xpos) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") From a1fd06c7c243b8350e0951f8fcc1716a9010bcf4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Oct 2023 15:50:06 +0300 Subject: [PATCH 08/23] ggml-backend : code style suggestions (#551) * ggml-backend : code style suggestions * ggml-backend : move ggml_backend and ggml_backend_buffer in the source file * ggml-backend : move structs back to header + rename type * ggml-backend : remove obsolete comment * fix leak in ggml_backend_buffer_free * ggml-backend : re-introduce typedefs as a declaration of intent --------- Co-authored-by: slaren --- .gitignore | 1 + examples/gpt-2/main.cpp | 4 ++ include/ggml/ggml-alloc.h | 12 ++-- include/ggml/ggml-backend.h | 118 +++++++++++++++++++++------------ include/ggml/ggml.h | 7 +- src/ggml-alloc.c | 4 +- src/ggml-backend.c | 128 ++++++++++++++++++++++++++++-------- src/ggml-cuda.cu | 41 ++++++------ 8 files changed, 216 insertions(+), 99 deletions(-) diff --git a/.gitignore b/.gitignore index a66ac17df..35c37674d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ build-sanitize-thread/ build-cov/ build-ci-debug/ build-ci-release/ 
+build-cublas/ out/ tmp/ models/ diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 87cdf9065..a046b19ea 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -75,9 +75,12 @@ struct gpt2_model { // struct ggml_context * ctx; + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer_w; ggml_backend_buffer_t buffer_kv; + std::map tensors; }; @@ -333,6 +336,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & // allocate buffer and tensors model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256); + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); ggml_allocr_alloc(alloc, model.memory_k); ggml_allocr_alloc(alloc, model.memory_v); diff --git a/include/ggml/ggml-alloc.h b/include/ggml/ggml-alloc.h index c87139491..e38758878 100644 --- a/include/ggml/ggml-alloc.h +++ b/include/ggml/ggml-alloc.h @@ -16,18 +16,18 @@ GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_bu // you should call this if your graph are optimized to execute out-of-order GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n); -GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); -GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); -GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); -GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor); +GGML_API void ggml_allocr_free (struct ggml_allocr * alloc); +GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc); +GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc); +GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor); GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); -GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc); +GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc); + GGML_API size_t ggml_allocr_alloc_graph_n( struct ggml_allocr * alloc, struct ggml_cgraph ** graphs, int n_graphs, struct ggml_tensor *** inputs, struct ggml_tensor *** outputs); - #ifdef __cplusplus } #endif diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index c71f50225..96a1ab201 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -5,55 +5,74 @@ #ifdef __cplusplus extern "C" { #endif - typedef struct ggml_backend_s * ggml_backend_t; - - // backend buffer + struct ggml_backend; struct ggml_backend_buffer; + + // type-erased backend-specific types / wrappers + typedef void * ggml_backend_context_t; + typedef void * ggml_backend_graph_plan_t; + typedef void * ggml_backend_buffer_context_t; + + // avoid accessing internals of these types + typedef struct ggml_backend * ggml_backend_t; typedef struct ggml_backend_buffer * ggml_backend_buffer_t; - typedef void * ggml_buffer_context_t; - struct ggml_backend_buffer_interface { + // + // backend buffer + // + + struct ggml_backend_buffer_i { void (*free_buffer) (ggml_backend_buffer_t buffer); void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback - }; + // TODO: hide behind API struct ggml_backend_buffer { - struct 
ggml_backend_buffer_interface interface; - ggml_backend_t backend; - ggml_buffer_context_t context; + struct ggml_backend_buffer_i interface; + + ggml_backend_t backend; + ggml_backend_buffer_context_t context; + size_t size; }; // backend buffer functions - GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(struct ggml_backend_buffer_interface interface, ggml_backend_t backend, ggml_buffer_context_t context, size_t size); - GGML_API void ggml_backend_buffer_free(ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer); - GGML_API void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer); + GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( + struct ggml_backend * backend, + struct ggml_backend_buffer_i interface, + ggml_backend_buffer_context_t context, + size_t size); + + GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_free_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + // // backend - typedef void * ggml_backend_context_t; - typedef void * ggml_graph_plan_t; + // - struct ggml_backend_interface { + struct ggml_backend_i { const char * (*get_name)(ggml_backend_t backend); void (*free)(ggml_backend_t backend); // buffer allocation ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size); - size_t (*get_alignment)(ggml_backend_t backend); + + // get buffer alignment + size_t (*get_alignment)(ggml_backend_t backend); // tensor data access // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize - void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); void (*synchronize) (ggml_backend_t backend); // (optional) copy tensor between different backends, allow for single-copy tranfers @@ -61,9 +80,10 @@ extern "C" { void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // compute graph with a plan - ggml_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); - void (*graph_plan_free) (ggml_backend_t backend, ggml_graph_plan_t plan); - void (*graph_plan_compute)(ggml_backend_t backend, ggml_graph_plan_t plan); + ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + void (*graph_plan_free) (ggml_backend_t backend, 
ggml_backend_graph_plan_t plan); + void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + // compute graph without a plan void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); @@ -71,35 +91,49 @@ extern "C" { bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); }; - struct ggml_backend_s { - struct ggml_backend_interface interface; + // TODO: hide behind API + struct ggml_backend { + struct ggml_backend_i interface; + ggml_backend_context_t context; }; // backend helper functions - static inline ggml_backend_t get_backend(const struct ggml_tensor * tensor) { return tensor->buffer->backend; } - - static inline const char * ggml_backend_name(ggml_backend_t backend) { return backend->interface.get_name(backend); } - static inline void ggml_backend_free(ggml_backend_t backend) { backend->interface.free(backend); } - static inline ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { return backend->interface.alloc_buffer(backend, size); } - static inline size_t ggml_backend_get_alignment(ggml_backend_t backend) { return backend->interface.get_alignment(backend); } - static inline void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { get_backend(tensor)->interface.set_tensor_async(get_backend(tensor), tensor, data, offset, size); } - static inline void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { get_backend(tensor)->interface.get_tensor_async(get_backend(tensor), tensor, data, offset, size); } - static inline void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { get_backend(tensor)->interface.set_tensor_async(get_backend(tensor), tensor, data, offset, size); get_backend(tensor)->interface.synchronize(get_backend(tensor)); } - static inline void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { get_backend(tensor)->interface.get_tensor_async(get_backend(tensor), tensor, data, offset, size); get_backend(tensor)->interface.synchronize(get_backend(tensor)); } - static inline void ggml_backend_synchronize(ggml_backend_t backend) { backend->interface.synchronize(backend); } - static inline ggml_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->interface.graph_plan_create(backend, cgraph); } - static inline void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_free(backend, plan); } - static inline void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_compute(backend, plan); } - static inline void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { backend->interface.graph_compute(backend, cgraph); } - static inline bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { return backend->interface.supports_op(backend, op); } + GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor); + + GGML_API const char * ggml_backend_name(ggml_backend_t backend); + GGML_API void ggml_backend_free(ggml_backend_t backend); + + GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size); + + GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend); + + GGML_API void 
ggml_backend_tensor_set_async( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + + GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + + GGML_API void ggml_backend_synchronize(ggml_backend_t backend); + + GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph); + + GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); + // // CPU backend + // + GGML_API ggml_backend_t ggml_backend_cpu_init(void); + GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); + GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); /////////////////////////// diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index a26b9119b..5e7f39dc4 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -326,7 +326,7 @@ extern "C" { GGML_TYPE_COUNT, }; - enum ggml_backend { + enum ggml_backend_type { GGML_BACKEND_CPU = 0, GGML_BACKEND_GPU = 10, GGML_BACKEND_GPU_SPLIT = 20, @@ -479,8 +479,9 @@ extern "C" { // n-dimensional tensor struct ggml_tensor { - enum ggml_type type; - enum ggml_backend backend; + enum ggml_type type; + enum ggml_backend_type backend; + struct ggml_backend_buffer * buffer; int n_dims; diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index 44cb97481..3f53c4c82 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -62,7 +62,7 @@ struct free_block { #define MAX_FREE_BLOCKS 256 struct ggml_allocr { - ggml_backend_buffer_t buffer; + struct ggml_backend_buffer * buffer; bool buffer_owned; void * data; size_t alignment; @@ -265,7 +265,7 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) { alloc->n_free_blocks = 1; size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; - alloc->free_blocks[0].size = alloc->buffer->size - align_offset; + alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; } struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) { diff --git a/src/ggml-backend.c b/src/ggml-backend.c index da0d9c639..9e5dc8c9a 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -1,19 +1,24 @@ #include "ggml-backend.h" #include "ggml-alloc.h" + #include #include #include #include #include -#define UNUSED(x) (void)(x) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define UNUSED GGML_UNUSED +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) // backend buffer -struct ggml_backend_buffer * ggml_backend_buffer_init(struct ggml_backend_buffer_interface interface, ggml_backend_t backend, ggml_buffer_context_t context, size_t size) { - struct ggml_backend_buffer * buffer = malloc(sizeof(struct ggml_backend_buffer)); +ggml_backend_buffer_t ggml_backend_buffer_init( + struct ggml_backend * backend, + struct ggml_backend_buffer_i interface, + ggml_backend_buffer_context_t context, + size_t size) { + ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); GGML_ASSERT(interface.get_base != NULL); @@ -27,10 +32,11 @@ struct ggml_backend_buffer * ggml_backend_buffer_init(struct ggml_backend_buffer return buffer; } -void ggml_backend_buffer_free(struct ggml_backend_buffer * buffer) { +void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer->interface.free_buffer != NULL) { buffer->interface.free_buffer(buffer); } + free(buffer); } size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { @@ -41,6 +47,10 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { return buffer->interface.get_base(buffer); } +size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { + return buffer->size; +} + size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { if (buffer->interface.get_alloc_size) { return buffer->interface.get_alloc_size(buffer, tensor); @@ -60,6 +70,70 @@ void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_t } } +// backend + +ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) { + return tensor->buffer->backend; +} + +const char * ggml_backend_name(ggml_backend_t backend) { + return backend->interface.get_name(backend); +} + +void ggml_backend_free(ggml_backend_t backend) { + backend->interface.free(backend); +} + +ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { + return backend->interface.alloc_buffer(backend, size); +} + +size_t ggml_backend_get_alignment(ggml_backend_t backend) { + return backend->interface.get_alignment(backend); +} + +void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->interface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); +} + +void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->interface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); +} + +void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->interface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); + ggml_get_backend(tensor)->interface.synchronize(ggml_get_backend(tensor)); +} + +void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->interface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); + ggml_get_backend(tensor)->interface.synchronize(ggml_get_backend(tensor)); +} + +void ggml_backend_synchronize(ggml_backend_t backend) { + backend->interface.synchronize(backend); +} + +ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + return backend->interface.graph_plan_create(backend, cgraph); +} + +void ggml_backend_graph_plan_free(ggml_backend_t backend, 
ggml_backend_graph_plan_t plan) { + backend->interface.graph_plan_free(backend, plan); +} + +void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + backend->interface.graph_plan_compute(backend, plan); +} + +void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + backend->interface.graph_compute(backend, cgraph); +} + +bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return backend->interface.supports_op(backend, op); +} + // backend copy static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { @@ -90,10 +164,10 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst // TODO: allow backends to support copy to/from same backend - if (get_backend(dst)->interface.cpy_tensor_from != NULL) { - get_backend(dst)->interface.cpy_tensor_from(get_backend(dst)->context, src, dst); - } else if (get_backend(src)->interface.cpy_tensor_to != NULL) { - get_backend(src)->interface.cpy_tensor_to(get_backend(src)->context, src, dst); + if (ggml_get_backend(dst)->interface.cpy_tensor_from != NULL) { + ggml_get_backend(dst)->interface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst); + } else if (ggml_get_backend(src)->interface.cpy_tensor_to != NULL) { + ggml_get_backend(src)->interface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst); } else { // shouldn't be hit when copying from/to CPU #ifndef NDEBUG @@ -137,7 +211,7 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { UNUSED(buffer); } -static struct ggml_backend_buffer_interface cpu_backend_buffer_interface = { +static struct ggml_backend_buffer_i cpu_backend_buffer_i = { /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -146,7 +220,7 @@ static struct ggml_backend_buffer_interface cpu_backend_buffer_interface = { }; // for buffers from ptr, free is not called -static struct ggml_backend_buffer_interface cpu_backend_buffer_interface_from_ptr = { +static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -156,9 +230,10 @@ static struct ggml_backend_buffer_interface cpu_backend_buffer_interface_from_pt static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 -static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { +static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { void * data = malloc(size + TENSOR_ALIGNMENT); // malloc may return an address that is not aligned - return ggml_backend_buffer_init(cpu_backend_buffer_interface, backend, data, size); + // TODO: maybe use GGML_ALIGNED_MALLOC? 
+ return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size); } static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) { @@ -201,15 +276,15 @@ static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_t UNUSED(backend); } -struct ggml_backend_cpu_plan { +struct ggml_backend_plan_cpu { struct ggml_cplan cplan; struct ggml_cgraph cgraph; }; -static ggml_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_backend_cpu_plan * cpu_plan = malloc(sizeof(struct ggml_backend_cpu_plan)); + struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); cpu_plan->cgraph = *cgraph; @@ -221,8 +296,8 @@ static ggml_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backe return cpu_plan; } -static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_graph_plan_t plan) { - struct ggml_backend_cpu_plan * cpu_plan = (struct ggml_backend_cpu_plan *)plan; +static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; free(cpu_plan->cplan.work_data); free(cpu_plan); @@ -230,8 +305,8 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_graph_ UNUSED(backend); } -static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_graph_plan_t plan) { - struct ggml_backend_cpu_plan * cpu_plan = (struct ggml_backend_cpu_plan *)plan; +static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); @@ -260,7 +335,7 @@ static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct gg UNUSED(op); } -static struct ggml_backend_interface cpu_backend_interface = { +static struct ggml_backend_i cpu_backend_i = { /* .get_name = */ ggml_backend_cpu_name, /* .free = */ ggml_backend_cpu_free, /* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer, @@ -279,14 +354,15 @@ static struct ggml_backend_interface cpu_backend_interface = { ggml_backend_t ggml_backend_cpu_init(void) { struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->work_data = NULL; ctx->work_size = 0; - ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend_s)); + ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); - *cpu_backend = (struct ggml_backend_s) { - /* .interface = */ cpu_backend_interface, + *cpu_backend = (struct ggml_backend) { + /* .interface = */ cpu_backend_i, /* .context = */ ctx }; return cpu_backend; @@ -297,10 +373,10 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } -struct ggml_backend_buffer * ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { +ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { // TODO: NULL backend? 
// TODO: no free - return ggml_backend_buffer_init(cpu_backend_buffer_interface_from_ptr, NULL, ptr, size); + return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size); } #if 0 diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 87c44da8f..d65e2143e 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7147,7 +7147,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { const size_t nb1 = tensor->nb[1]; - ggml_backend backend = tensor->backend; + ggml_backend_type backend = tensor->backend; ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); @@ -7525,9 +7525,9 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des // backend interface -#define UNUSED(x) (void)(x) +#define UNUSED GGML_UNUSED -struct ggml_backend_cuda_context { +struct ggml_backend_context_cuda { }; static const char * ggml_backend_cuda_name(ggml_backend_t backend) { @@ -7537,18 +7537,18 @@ static const char * ggml_backend_cuda_name(ggml_backend_t backend) { } static void ggml_backend_cuda_free(ggml_backend_t backend) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; delete cuda_ctx; delete backend; } -struct ggml_cuda_buffer_context { +struct ggml_backend_buffer_context_cuda { void * device; ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; size_t temp_tensor_extra_index = 0; - ~ggml_cuda_buffer_context() { + ~ggml_backend_buffer_context_cuda() { delete[] temp_tensor_extras; } @@ -7567,13 +7567,13 @@ struct ggml_cuda_buffer_context { }; static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; CUDA_CHECK(cudaFree(ctx->device)); delete ctx; } static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; return ctx->device; } @@ -7597,7 +7597,7 @@ static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buff } static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = tensor->data; @@ -7616,7 +7616,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g UNUSED(buffer); } -static struct ggml_backend_buffer_interface cuda_backend_buffer_interface = { +static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size, @@ -7625,9 +7625,9 @@ static struct ggml_backend_buffer_interface cuda_backend_buffer_interface = { }; static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) { - ggml_cuda_buffer_context * ctx = new ggml_cuda_buffer_context; + ggml_backend_buffer_context_cuda * ctx = new 
ggml_backend_buffer_context_cuda; CUDA_CHECK(cudaMalloc(&ctx->device, size)); - return ggml_backend_buffer_init(cuda_backend_buffer_interface, backend, ctx, size); + return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size); } static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) { @@ -7661,7 +7661,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static ggml_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { GGML_ASSERT(!"not implemented"); return nullptr; @@ -7670,14 +7670,14 @@ static ggml_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t back UNUSED(cgraph); } -static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_graph_plan_t plan) { +static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(!"not implemented"); UNUSED(backend); UNUSED(plan); } -static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_graph_plan_t plan) { +static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(!"not implemented"); UNUSED(backend); @@ -7738,7 +7738,7 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph UNUSED(backend); } -static ggml_backend_interface cuda_backend_interface = { +static ggml_backend_i cuda_backend_i = { /* .get_name = */ ggml_backend_cuda_name, /* .free = */ ggml_backend_cuda_free, /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer, @@ -7758,12 +7758,13 @@ static ggml_backend_interface cuda_backend_interface = { ggml_backend_t ggml_backend_cuda_init() { ggml_init_cublas(); // TODO: remove from ggml.c - ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context; + ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda; - ggml_backend_t cuda_backend = new ggml_backend_s; - *cuda_backend = (ggml_backend_s){ - /* .interface = */ cuda_backend_interface, + ggml_backend_t cuda_backend = new ggml_backend; + *cuda_backend = (ggml_backend){ + /* .interface = */ cuda_backend_i, /* .context = */ ctx }; + return cuda_backend; } From c4cd2d74a2cbf87010f7f7d708885663f05ce1fb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Oct 2023 15:53:52 +0300 Subject: [PATCH 09/23] gpt-2 : add comments about KV allocation --- examples/gpt-2/main.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index a046b19ea..4b8f20321 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -334,13 +334,21 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - // allocate buffer and tensors + // create a backend buffer (can be in host or device memory) model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256); - ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); - ggml_allocr_alloc(alloc, model.memory_k); - ggml_allocr_alloc(alloc, model.memory_v); - ggml_allocr_free(alloc); + // allocate the tensors into the backend buffer + // TODO: better API for this + { + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); + + // this updates the pointers in the tensors to point to the 
correct location in the buffer + // this is necessary since the ggml_context is .no_alloc == true + ggml_allocr_alloc(alloc, model.memory_k); + ggml_allocr_alloc(alloc, model.memory_v); + + ggml_allocr_free(alloc); + } } // load weights From d8b3efc794c6776be9c981de3e62cbe56f1376db Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 5 Oct 2023 14:56:44 +0200 Subject: [PATCH 10/23] add ggml_backend_is_cpu --- examples/gpt-2/main.cpp | 18 +++++++++++------- include/ggml/ggml-backend.h | 2 ++ src/ggml-backend.c | 4 ++++ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 4b8f20321..e7bab0ba1 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -415,13 +415,17 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & return false; } - // read into a temporary buffer first, then copy to the tensor - // TODO: read directly into the tensor if the backend is CPU - read_buf.resize(ggml_nbytes(tensor)); - fin.read(read_buf.data(), ggml_nbytes(tensor)); - ggml_allocr_alloc(alloc, tensor); - ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); + + if (ggml_backend_is_cpu(model.backend)) { + // for the CPU backend, we can read directly into the tensor + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(ggml_nbytes(tensor)); + fin.read(read_buf.data(), ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); + } // GPT-2 models share the WTE tensor as the LM head if (name == "model/wte" && has_lm_head == false) { @@ -772,7 +776,7 @@ bool gpt2_eval( ggml_allocr_alloc_graph(allocr, gf); // run the computation - if (strcmp(ggml_backend_name(model.backend), "CPU") == 0) { + if (ggml_backend_is_cpu(model.backend)) { ggml_backend_cpu_set_n_threads(model.backend, n_threads); } ggml_backend_graph_compute(model.backend, gf); diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 96a1ab201..606ea5e4d 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -132,6 +132,8 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); + GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); + GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); diff --git a/src/ggml-backend.c b/src/ggml-backend.c index 9e5dc8c9a..8e3628a2c 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -368,6 +368,10 @@ ggml_backend_t ggml_backend_cpu_init(void) { return cpu_backend; } +bool ggml_backend_is_cpu(ggml_backend_t backend) { + return backend->interface.get_name == ggml_backend_cpu_name; +} + void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; ctx->n_threads = n_threads; From 3dbc43a403fa799aec4fc6049be079d98aa0a0af Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 5 Oct 2023 15:07:42 +0200 Subject: [PATCH 11/23] add backend check to ggml_backend_cpu_set_n_threads --- src/ggml-backend.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ggml-backend.c b/src/ggml-backend.c index 8e3628a2c..f9c58aeef 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -373,13 +373,14 @@ bool ggml_backend_is_cpu(ggml_backend_t backend) { } void 
ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; ctx->n_threads = n_threads; } ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { // TODO: NULL backend? - // TODO: no free return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size); } From b74ffd5397f7905b4b38c99d06b6c845f773a06b Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 5 Oct 2023 16:13:24 +0200 Subject: [PATCH 12/23] backend cpu: fix buffer alignment --- src/ggml-backend.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ggml-backend.c b/src/ggml-backend.c index f9c58aeef..c5bc03280 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -231,8 +231,9 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { - void * data = malloc(size + TENSOR_ALIGNMENT); // malloc may return an address that is not aligned - // TODO: maybe use GGML_ALIGNED_MALLOC? + size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned + void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? + return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size); } From b4ec9787e256b320cf8892b77e358d40877ace98 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 01:38:45 +0200 Subject: [PATCH 13/23] fix CUDA_ARCHITECTURES for mmq --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 95f91e331..bcfb4b23b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -306,7 +306,7 @@ endif() if (GGML_CUDA_SOURCES) message(STATUS "GGML CUDA sources found, configuring CUDA architecture") - set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "52;61") + set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "52;61;70") set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") if (NOT MSVC) target_link_libraries(ggml PUBLIC stdc++) From 25ce18aecb32a4a5078839f77b70799ba323c7c8 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 01:46:46 +0200 Subject: [PATCH 14/23] ggml-alloc : better handle view initialization --- src/ggml-alloc.c | 35 ++++++++++++++++++++++++----------- src/ggml.c | 4 ++-- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index 3f53c4c82..bb027e2f9 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -379,7 +379,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) { case GGML_OP_ROPE: case GGML_OP_RMS_NORM: case GGML_OP_SOFT_MAX: - case GGML_OP_CONT: return true; default: @@ -387,14 +386,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) { } } +static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) { + assert(view->view_src != NULL && view->view_src->data != NULL); + view->backend = view->view_src->backend; + view->buffer = view->view_src->buffer; + view->data = (char *)view->view_src->data + view->view_offs; + + // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend + // due to the ggml_tensor_extra_gpu ring buffer overwriting the with the KV cache extras + assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == 
alloc->buffer->backend); + ggml_backend_buffer_init_tensor(alloc->buffer, view); +} + static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) { struct hash_node * ht = alloc->hash_table; if (node->data == NULL) { if (ggml_is_view(node)) { - assert(node->view_src->data != NULL); - node->data = (char *)node->view_src->data + node->view_offs; - node->buffer = node->view_src->buffer; - ggml_backend_buffer_init_tensor(alloc->buffer, node); // TODO: change to init_view + init_view(alloc, node); } else { // see if we can reuse a parent's buffer (inplace) if (ggml_op_can_inplace(node->op)) { @@ -422,17 +430,15 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); - node->data = parent->data; - node->buffer = parent->buffer; - ggml_backend_buffer_init_tensor(alloc->buffer, node); // TODO: change to init_view + node->view_src = parent; + init_view(alloc, node); return; } } else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); - node->data = parent->data; - node->buffer = parent->buffer; - ggml_backend_buffer_init_tensor(alloc->buffer, node); // TODO: change to init_view + node->view_src = parent; + init_view(alloc, node); return; } } @@ -461,6 +467,10 @@ size_t ggml_allocr_alloc_graph_n( if (ggml_is_view(node)) { struct ggml_tensor * view_src = node->view_src; hash_get(ht, view_src)->n_views += 1; + if (node->buffer == NULL && node->data != NULL) { + // view of a pre-allocated tensor, didn't call init_view() yet + init_view(alloc, node); + } } for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -469,6 +479,9 @@ size_t ggml_allocr_alloc_graph_n( break; } hash_get(ht, parent)->n_children += 1; + if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { + init_view(alloc, parent); + } } } } diff --git a/src/ggml.c b/src/ggml.c index aabe2e4df..b606d7cc3 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4950,8 +4950,8 @@ static struct ggml_tensor * ggml_new_tensor_impl( *result = (struct ggml_tensor) { /*.type =*/ type, - /*.backend =*/ view_src ? view_src->backend : GGML_BACKEND_CPU, - /*.buffer =*/ view_src ? 
view_src->buffer : NULL, + /*.backend =*/ GGML_BACKEND_CPU, + /*.buffer =*/ NULL, /*.n_dims =*/ n_dims, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, From b42e19c4de695fe11f225e0d826a5cbd533d23e1 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 01:47:59 +0200 Subject: [PATCH 15/23] ggml-cuda : fix padding clearing --- src/ggml-alloc.c | 2 +- src/ggml-cuda.cu | 47 ++++++++++++++++++++++++++++------------------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index bb027e2f9..a749c810d 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -393,7 +393,7 @@ static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) { view->data = (char *)view->view_src->data + view->view_offs; // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend - // due to the ggml_tensor_extra_gpu ring buffer overwriting the with the KV cache extras + // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == alloc->buffer->backend); ggml_backend_buffer_init_tensor(alloc->buffer, view); } diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index d65e2143e..f21cb1a1c 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7586,9 +7586,11 @@ static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buff int64_t ne0 = tensor->ne[0]; - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) - * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + if (ggml_is_quantized(tensor->type)) { + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) + * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + } } return size; @@ -7598,6 +7600,14 @@ static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buff static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + if (tensor->view_src != NULL && tensor->view_offs == 0) { + assert(tensor->view_src->buffer->backend == buffer->backend); + tensor->backend = tensor->view_src->backend; + tensor->extra = tensor->view_src->extra; + return; + } + ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = tensor->data; @@ -7605,12 +7615,18 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g tensor->backend = GGML_BACKEND_GPU; tensor->extra = extra; - // initialize padding to 0 to avoid possible NaN values - size_t original_size = ggml_nbytes(tensor); - size_t size = ggml_backend_cuda_buffer_get_alloc_size(buffer, tensor); + if (ggml_is_quantized(tensor->type)) { + // initialize padding to 0 to avoid possible NaN values + int64_t row_low = 0; + int64_t row_high = ggml_nrows(tensor); + int64_t nrows_split = row_high - row_low; + + size_t original_size = ggml_nbytes_split(tensor, nrows_split); + size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor); - if (size > original_size && tensor->view_src == nullptr) { - CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, size - original_size, g_cudaStreams[g_main_device][0])); + if (padded_size > original_size && tensor->view_src == nullptr) { + CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, 
g_cudaStreams[g_main_device][0])); + } } UNUSED(buffer); @@ -7690,18 +7706,11 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - // views of allocated tensors don't call init_tensor, handle them here - // TODO: handle in ggml-alloc - if (node->extra == nullptr) { - GGML_ASSERT(node->view_src != nullptr); - GGML_ASSERT(node->view_src->backend == GGML_BACKEND_GPU); - ggml_backend_cuda_buffer_init_tensor(node->buffer, node); - } + + assert(node->backend == GGML_BACKEND_GPU); for (int j = 0; j < GGML_MAX_SRC; j++) { - if (node->src[j] != nullptr && node->src[j]->extra == nullptr) { - GGML_ASSERT(node->src[j]->view_src != nullptr); - GGML_ASSERT(node->src[j]->view_src->backend == GGML_BACKEND_GPU); - ggml_backend_cuda_buffer_init_tensor(node->src[j]->buffer, node->src[j]); + if (node->src[j] != nullptr) { + assert(node->src[j]->backend == GGML_BACKEND_GPU); } } From 94b05299b117142817f1f6cca1abea8e1133663b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 10:51:58 +0300 Subject: [PATCH 16/23] ggml-backend : metal (#552) * ggml-backend : metal (WIP) * ggml-backend : metal (adapt CPU backend) * ggml-backend : working metal * ggml-backend : clean-up metal implementation * ggml-backend : add ggml_backend_is_metal() --- examples/gpt-2/CMakeLists.txt | 5 ++ examples/gpt-2/main.cpp | 55 +++++++++----- include/ggml/ggml-backend.h | 7 +- src/CMakeLists.txt | 2 +- src/ggml-backend.c | 16 +++- src/ggml-cuda.h | 1 - src/ggml-metal.h | 17 ++++- src/ggml-metal.m | 135 +++++++++++++++++++++++++++++++++- 8 files changed, 210 insertions(+), 28 deletions(-) diff --git a/examples/gpt-2/CMakeLists.txt b/examples/gpt-2/CMakeLists.txt index 2307a7dd9..6ddada061 100644 --- a/examples/gpt-2/CMakeLists.txt +++ b/examples/gpt-2/CMakeLists.txt @@ -18,6 +18,11 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) if (GGML_CUBLAS) add_compile_definitions(GGML_USE_CUBLAS) endif() + if (GGML_CLBLAST) add_compile_definitions(GGML_USE_CLBLAST) endif() + +if (GGML_METAL) + add_compile_definitions(GGML_USE_METAL) +endif() diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index e7bab0ba1..25725a1d1 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -6,6 +6,10 @@ #include "ggml-cuda.h" #endif +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + #include "common.h" #include "common-ggml.h" @@ -22,6 +26,13 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} + // default hparams (GPT-2 117M) struct gpt2_hparams { int32_t n_vocab = 50257; @@ -234,6 +245,17 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & } #endif +#ifdef GGML_USE_METAL + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using Metal backend\n", __func__); + ggml_metal_log_set_callback(ggml_log_callback_default, nullptr); + model.backend = ggml_backend_metal_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } + } +#endif + if (!model.backend) { // fallback to CPU backend fprintf(stderr, "%s: using CPU backend\n", __func__); @@ -521,9 +543,8 @@ struct ggml_cgraph * gpt2_graph( // [ 768, N] cur = ggml_add(ctx0, ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), - cur), - 
//ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + cur, + model.layers[il].ln_1_g), model.layers[il].ln_1_b); } @@ -541,8 +562,8 @@ struct ggml_cgraph * gpt2_graph( cur); cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), - cur); + cur, + model.layers[il].c_attn_attn_b); } // self-attention @@ -649,8 +670,8 @@ struct ggml_cgraph * gpt2_graph( cur); cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), - cur); + cur, + model.layers[il].c_attn_proj_b); } // add the input @@ -668,9 +689,8 @@ struct ggml_cgraph * gpt2_graph( // [ 768, N] cur = ggml_add(ctx0, ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), - cur), - //ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + cur, + model.layers[il].ln_2_g), model.layers[il].ln_2_b); } @@ -687,8 +707,8 @@ struct ggml_cgraph * gpt2_graph( cur); cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), - cur); + cur, + model.layers[il].c_mlp_fc_b); // GELU activation // [3072, N] @@ -707,8 +727,8 @@ struct ggml_cgraph * gpt2_graph( cur); cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), - cur); + cur, + model.layers[il].c_mlp_proj_b); } // input for next layer @@ -724,9 +744,8 @@ struct ggml_cgraph * gpt2_graph( // [ 768, N] inpL = ggml_add(ctx0, ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), - inpL), - //ggml_repeat(ctx0, model.ln_f_b, inpL)); + inpL, + model.ln_f_g), model.ln_f_b); } @@ -778,6 +797,8 @@ bool gpt2_eval( // run the computation if (ggml_backend_is_cpu(model.backend)) { ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } else if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_threads(model.backend, n_threads); } ggml_backend_graph_compute(model.backend, gf); diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 606ea5e4d..36457e991 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -132,14 +132,17 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); - GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); - GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); /////////////////////////// + // TODO: we should probably do something better here + GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API bool ggml_backend_is_cuda (ggml_backend_t backend); + GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); + #if 0 // graph splitting #define GGML_MAX_SPLITS 200 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bcfb4b23b..b225597ed 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -229,7 +229,7 @@ if (GGML_METAL) set(GGML_METAL_SOURCES ggml-metal.m ggml-metal.h) add_compile_definitions(GGML_USE_METAL) - add_compile_definitions(GGML_METAL_NDEBUG) + #add_compile_definitions(GGML_METAL_NDEBUG) # get full path to the file #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") diff --git a/src/ggml-backend.c b/src/ggml-backend.c index c5bc03280..187a149c4 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -369,10 +369,6 @@ ggml_backend_t ggml_backend_cpu_init(void) { return cpu_backend; } -bool ggml_backend_is_cpu(ggml_backend_t backend) { - return backend->interface.get_name == ggml_backend_cpu_name; -} - void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { 
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); @@ -385,6 +381,18 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size); } +bool ggml_backend_is_cpu(ggml_backend_t backend) { + return backend->interface.get_name == ggml_backend_cpu_name; +} + +bool ggml_backend_is_cuda(ggml_backend_t backend) { + return strcmp(ggml_backend_name(backend), "CUDA") == 0; +} + +bool ggml_backend_is_metal(ggml_backend_t backend) { + return strcmp(ggml_backend_name(backend), "Metal") == 0; +} + #if 0 // splits diff --git a/src/ggml-cuda.h b/src/ggml-cuda.h index 81ee9a2e9..57adc9cf3 100644 --- a/src/ggml-cuda.h +++ b/src/ggml-cuda.h @@ -46,7 +46,6 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description, // backend API GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use - #ifdef __cplusplus } #endif diff --git a/src/ggml-metal.h b/src/ggml-metal.h index 790cf0bf7..bc6773a6e 100644 --- a/src/ggml-metal.h +++ b/src/ggml-metal.h @@ -20,6 +20,7 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" #include #include @@ -35,10 +36,15 @@ struct ggml_cgraph; extern "C" { #endif -void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); +// +// internal API +// temporary exposed to user-code +// struct ggml_metal_context; +void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); + // number of command buffers to use struct ggml_metal_context * ggml_metal_init(int n_cb); void ggml_metal_free(struct ggml_metal_context * ctx); @@ -83,6 +89,15 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); // creates gf->n_threads command buffers in parallel void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); +// +// backend API +// user-code should use only these functions +// + +GGML_API ggml_backend_t ggml_backend_metal_init(void); + +GGML_API void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads); + #ifdef __cplusplus } #endif diff --git a/src/ggml-metal.m b/src/ggml-metal.m index 866fed434..a06b738a1 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -151,8 +151,6 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ } } - - struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: allocating\n", __func__); @@ -1371,3 +1369,136 @@ void ggml_metal_graph_compute( } } + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +static const char * ggml_backend_metal_name(ggml_backend_t backend) { + return "Metal"; + + UNUSED(backend); +} + +static void ggml_backend_metal_free(ggml_backend_t backend) { + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + ggml_metal_free(ctx); + free(backend); +} + +static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *)buffer->context; +} + +static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); + UNUSED(buffer); +} + +static struct ggml_backend_buffer_i metal_backend_buffer_i = { + /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, + /* .get_base = */ ggml_backend_metal_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, // no initialization required + /* .free_tensor = */ NULL, // no cleanup required +}; + 
+static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) { + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + + void * data = ggml_metal_host_malloc(size); + + // TODO: set proper name of the buffers + ggml_metal_add_buffer(ctx, "backend", data, size, 0); + + return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size); +} + +static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) { + return 32; + UNUSED(backend); +} + +static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + UNUSED(backend); +} + +static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + UNUSED(backend); +} + +static void ggml_backend_metal_synchronize(ggml_backend_t backend) { + UNUSED(backend); +} + +static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + + ggml_metal_graph_compute(metal_ctx, cgraph); +} + +static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return true; + UNUSED(backend); + UNUSED(op); +} + +static struct ggml_backend_i metal_backend_i = { + /* .get_name = */ ggml_backend_metal_name, + /* .free = */ ggml_backend_metal_free, + /* .alloc_buffer = */ ggml_backend_metal_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_get_alignment, + /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, + /* .synchronize = */ ggml_backend_metal_synchronize, + /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from, + /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to, + /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_metal_graph_compute, + /* .supports_op = */ ggml_backend_metal_supports_op, +}; + +ggml_backend_t ggml_backend_metal_init(void) { + struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + + ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); + + ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); + + *metal_backend = (struct ggml_backend) { + /* .interface = */ metal_backend_i, + /* .context = */ ctx, + }; + + return metal_backend; +} + +void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads) { + 
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + + ggml_metal_set_n_cb(ctx, n_threads); +} From ce797df6c3e03f23e6ba6222ca54f2111c12f93e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 10:57:23 +0300 Subject: [PATCH 17/23] gpt-2 : take advantage of Metal unified memory --- examples/gpt-2/main.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 25725a1d1..0b379a960 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -439,8 +439,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & ggml_allocr_alloc(alloc, tensor); - if (ggml_backend_is_cpu(model.backend)) { - // for the CPU backend, we can read directly into the tensor + if (ggml_backend_is_cpu (model.backend) || + ggml_backend_is_metal(model.backend)) { + // for the CPU and Metal backend, we can read directly into the tensor fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); } else { // read into a temporary buffer first, then copy to device memory From e8bc940ac6a2698fa14cd9ea8c7f463f8936b62d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 10:59:01 +0300 Subject: [PATCH 18/23] gpt-2 : remove TODO + update comment --- examples/gpt-2/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 0b379a960..f86fbf9f3 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -360,12 +360,12 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256); // allocate the tensors into the backend buffer - // TODO: better API for this { ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); // this updates the pointers in the tensors to point to the correct location in the buffer // this is necessary since the ggml_context is .no_alloc == true + // note that the buffer can actually be a device buffer, depending on the backend ggml_allocr_alloc(alloc, model.memory_k); ggml_allocr_alloc(alloc, model.memory_v); From 5ca14cedb4bbe692710a0e507bc59b0949bed11d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 11:05:36 +0300 Subject: [PATCH 19/23] ggml-backend : fix ggml_backend_is_xxx() interface --- examples/gpt-2/main.cpp | 6 +++++- include/ggml/ggml-backend.h | 7 ++----- src/ggml-backend.c | 16 ++++------------ src/ggml-metal.h | 2 ++ src/ggml-metal.m | 4 ++++ 5 files changed, 17 insertions(+), 18 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index f86fbf9f3..7a2d5374d 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -798,9 +798,13 @@ bool gpt2_eval( // run the computation if (ggml_backend_is_cpu(model.backend)) { ggml_backend_cpu_set_n_threads(model.backend, n_threads); - } else if (ggml_backend_is_metal(model.backend)) { + } +#ifdef GGML_USE_METAL + // TODO: not great - what should we do? 
+ if (ggml_backend_is_metal(model.backend)) { ggml_backend_metal_set_n_threads(model.backend, n_threads); } +#endif ggml_backend_graph_compute(model.backend, gf); //if (n_past%100 == 0) { diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 36457e991..22d324e9d 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -132,17 +132,14 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); + GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); /////////////////////////// - // TODO: we should probably do something better here - GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); - GGML_API bool ggml_backend_is_cuda (ggml_backend_t backend); - GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); - #if 0 // graph splitting #define GGML_MAX_SPLITS 200 diff --git a/src/ggml-backend.c b/src/ggml-backend.c index 187a149c4..c5bc03280 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -369,6 +369,10 @@ ggml_backend_t ggml_backend_cpu_init(void) { return cpu_backend; } +bool ggml_backend_is_cpu(ggml_backend_t backend) { + return backend->interface.get_name == ggml_backend_cpu_name; +} + void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); @@ -381,18 +385,6 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size); } -bool ggml_backend_is_cpu(ggml_backend_t backend) { - return backend->interface.get_name == ggml_backend_cpu_name; -} - -bool ggml_backend_is_cuda(ggml_backend_t backend) { - return strcmp(ggml_backend_name(backend), "CUDA") == 0; -} - -bool ggml_backend_is_metal(ggml_backend_t backend) { - return strcmp(ggml_backend_name(backend), "Metal") == 0; -} - #if 0 // splits diff --git a/src/ggml-metal.h b/src/ggml-metal.h index bc6773a6e..cb5646587 100644 --- a/src/ggml-metal.h +++ b/src/ggml-metal.h @@ -96,6 +96,8 @@ void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgrap GGML_API ggml_backend_t ggml_backend_metal_init(void); +GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); + GGML_API void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads); #ifdef __cplusplus diff --git a/src/ggml-metal.m b/src/ggml-metal.m index a06b738a1..055d137f8 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -1497,6 +1497,10 @@ ggml_backend_t ggml_backend_metal_init(void) { return metal_backend; } +bool ggml_backend_is_metal(ggml_backend_t backend) { + return backend->interface.get_name == ggml_backend_metal_name; +} + void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads) { struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; From b22916c3a3f4d0006e85ca4ee145456966fe553b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 11:33:32 +0300 Subject: [PATCH 20/23] gpt-2 : fix build --- examples/gpt-2/main.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 7a2d5374d..61b53296b 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -439,8 +439,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & ggml_allocr_alloc(alloc, 
tensor); - if (ggml_backend_is_cpu (model.backend) || - ggml_backend_is_metal(model.backend)) { + if (ggml_backend_is_cpu (model.backend) +#ifdef GGML_USE_METAL + || ggml_backend_is_metal(model.backend) +#endif + ) { // for the CPU and Metal backend, we can read directly into the tensor fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); } else { @@ -800,7 +803,6 @@ bool gpt2_eval( ggml_backend_cpu_set_n_threads(model.backend, n_threads); } #ifdef GGML_USE_METAL - // TODO: not great - what should we do? if (ggml_backend_is_metal(model.backend)) { ggml_backend_metal_set_n_threads(model.backend, n_threads); } From 01710cc037206eb1c66347b3df3888487d97551d Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 14:46:20 +0200 Subject: [PATCH 21/23] ggml-cuda : cleanup, fix case for src1 not contiguous --- src/ggml-cuda.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index f21cb1a1c..c8c36c573 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -6732,8 +6732,7 @@ static void ggml_cuda_op_mul_mat( if (convert_src1_to_q8_1) { src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); - // FIXME: why split only? src1 never gets quantized, breaks ggml-backend/GPT-2 - if (/*split &&*/ src1_on_device && src1_is_contiguous) { + if (src1_on_device && src1_is_contiguous) { quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } @@ -6815,7 +6814,7 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) { + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } From 9cb2626053cd532021342f2fc1ddd8de42535718 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 18:24:56 +0200 Subject: [PATCH 22/23] remove commented code --- include/ggml/ggml-backend.h | 49 +-------- src/ggml-alloc.c | 8 +- src/ggml-backend.c | 202 +----------------------------------- 3 files changed, 6 insertions(+), 253 deletions(-) diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 22d324e9d..9e0567c6b 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -132,56 +132,11 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); - GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); - GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); - - /////////////////////////// - -#if 0 - // graph splitting - #define GGML_MAX_SPLITS 200 - #define GGML_MAX_SPLIT_INPUTS 4 - - struct ggml_graph_split { - char name[GGML_MAX_NAME]; - struct ggml_context * ctx; - struct ggml_tensor * src_inputs[GGML_MAX_SPLIT_INPUTS + 1]; - struct ggml_tensor * dst_inputs[GGML_MAX_SPLIT_INPUTS + 1]; - struct ggml_cgraph * graph; - }; - - // TODO: this shouldn't be fixed size, allocate from ggml_context - struct ggml_graph_splits { - int n_splits; - struct ggml_graph_split splits[GGML_MAX_SPLITS]; - }; - - // TODO: allocate in ggml_context - GGML_API struct ggml_graph_splits ggml_graph_split_init(void); - - // this won't be needed once we can allocate graphs from a ggml_context - GGML_API 
void ggml_graph_splits_free(struct ggml_graph_splits * splits); - - // add a split to the graph - single and multiple inputs versions - GGML_API void ggml_graph_splits_add(struct ggml_graph_splits * splits, struct ggml_tensor ** input, struct ggml_context * ctx, const char * fmt, ...); - GGML_API void ggml_graph_splits_add_n(struct ggml_graph_splits * splits, struct ggml_tensor *** inputs, struct ggml_context * ctx, const char * fmt, ...); - - // build graphs for all splits - GGML_API void ggml_graph_splits_build_forward(struct ggml_graph_splits * splits, struct ggml_tensor * output); - - // compute - GGML_API void ggml_graph_splits_compute(struct ggml_graph_splits * splits); - - // graph tensor allocator - GGML_API void ggml_graph_allocate_tensors(struct ggml_cgraph * graph, struct ggml_context * ctx); - GGML_API void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits); - - // automatically split a graph into multiple graphs based on the location of the tensors - GGML_API struct ggml_graph_splits ggml_graph_split(struct ggml_cgraph * graph, struct ggml_context * ctx); -#endif + GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size); #ifdef __cplusplus } diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index a749c810d..e1b4377d6 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -269,7 +269,7 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) { } struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) { - struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size); + struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size); struct ggml_allocr * alloc = ggml_allocr_new_from_buffer(buffer); alloc->alignment = alignment; @@ -306,11 +306,7 @@ struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * bu struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); - // TODO: these should be set by the backend: - // - get_alignment() - // - get_alloc_size() - // TODO: support other backends - struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr((void *)0x1000, (size_t)-0x1001); + struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, (void *)0x1000, (size_t)-0x1001); *alloc = (struct ggml_allocr){ /*.buffer = */ buffer, diff --git a/src/ggml-backend.c b/src/ggml-backend.c index c5bc03280..f9e53a8a0 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -380,204 +380,6 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } -ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { - // TODO: NULL backend? 
- return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size); +ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) { + return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size); } - -#if 0 -// splits - -struct ggml_graph_splits ggml_graph_split_init(void) { - struct ggml_graph_splits splits = {0}; - return splits; -} - -// TODO: this can be removed after allocating the graphs in a ggml_context -void ggml_graph_splits_free(struct ggml_graph_splits * splits) { - for (int i = 0; i < splits->n_splits; i++) { - if (splits->splits[i].graph) { - free(splits->splits[i].graph); - } - } -} - -static void ggml_graph_splits_add_n_va(struct ggml_graph_splits * splits, struct ggml_tensor *** inputs, struct ggml_context * ctx, const char * fmt, va_list args) { - GGML_ASSERT(splits->n_splits < GGML_MAX_SPLITS); - - struct ggml_graph_split * split = &splits->splits[splits->n_splits]; - - - if (splits->n_splits == 0) { - // always add the first split - int i = 0; - while (inputs[i] != NULL) { - GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS); - split->src_inputs[i] = *inputs[i]; - split->dst_inputs[i] = *inputs[i]; - i++; - } - split->src_inputs[i] = NULL; - split->dst_inputs[i] = NULL; - split->ctx = ctx; - } - // check if the split is on the same context as the previous one - else if (splits->n_splits > 0 && splits->splits[splits->n_splits - 1].ctx == ctx) { - // add to the previous split - char name[GGML_MAX_NAME - 2]; - int n = vsnprintf(name, sizeof(name), fmt, args); - char new_name[GGML_MAX_NAME]; - snprintf(new_name, sizeof(new_name), "%.*s,%s", GGML_MAX_NAME - n - 2, splits->splits[splits->n_splits - 1].name, name); - strcpy(splits->splits[splits->n_splits - 1].name, new_name); - return; - } else { - // add a new split - int i = 0; - while (inputs[i] != NULL) { - GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS); - split->src_inputs[i] = *inputs[i]; - split->dst_inputs[i] = ggml_dup_tensor(ctx, *inputs[i]); - ggml_format_name(split->dst_inputs[i], "%s (split output)", split->src_inputs[i]->name); - // TODO: maybe support different layouts in ggml_backend_cpy_tensor instead - for (int j = 0; j < GGML_MAX_DIMS; j++) { - split->dst_inputs[i]->nb[j] = split->src_inputs[i]->nb[j]; - } - ggml_set_name(split->dst_inputs[i], ggml_get_name(*inputs[i])); - *inputs[i] = split->dst_inputs[i]; - i++; - } - split->src_inputs[i] = NULL; - split->dst_inputs[i] = NULL; - split->ctx = ctx; - } - - vsnprintf(split->name, GGML_MAX_NAME, fmt, args); - split->graph = NULL; - splits->n_splits++; -} - -void ggml_graph_splits_add_n(struct ggml_graph_splits * splits, struct ggml_tensor *** input, struct ggml_context * ctx, const char * fmt, ...) { - va_list args; - va_start(args, fmt); - ggml_graph_splits_add_n_va(splits, input, ctx, fmt, args); - va_end(args); -} - -void ggml_graph_splits_add(struct ggml_graph_splits * splits, struct ggml_tensor ** input, struct ggml_context * ctx, const char * fmt, ...) 
{ - va_list args; - va_start(args, fmt); - ggml_graph_splits_add_n_va(splits, (struct ggml_tensor**[2]){ input, NULL }, ctx, fmt, args); - va_end(args); -} - -void ggml_graph_splits_build_forward(struct ggml_graph_splits * splits, struct ggml_tensor * output) { - struct ggml_tensor *last_outputs[2] = { output, NULL }; - struct ggml_tensor ** outputs; - - for (int i = 0; i < splits->n_splits; i++) { - struct ggml_graph_split * split = &splits->splits[i]; - - if (i < splits->n_splits - 1) { - outputs = splits->splits[i + 1].src_inputs; - } else { - outputs = last_outputs; - } - - // build the graph - // TODO: allocate graphs in context - split->graph = (struct ggml_cgraph *) malloc(sizeof(struct ggml_cgraph)); - memset(split->graph, 0, sizeof(struct ggml_cgraph)); - for (int j = 0; outputs[j] != NULL; j++) { - ggml_build_forward_expand(split->graph, outputs[j]); - } - - for (int j = 1; j < split->graph->n_nodes; j++) { - if (split->graph->nodes[j]->backend != split->graph->nodes[0]->backend) { - fprintf(stderr, "split %s: node %s has different backend (%s) than the first node (%s)\n", - split->name, split->graph->nodes[j]->name, - ggml_backend_name(split->graph->nodes[j]->backend_s), - ggml_backend_name(split->graph->nodes[0]->backend_s)); - } - } - for (int j = 1; j < split->graph->n_leafs; j++) { - if (split->graph->leafs[j]->backend != split->graph->leafs[0]->backend) { - fprintf(stderr, "split %s: leaf %s has different backend (%s) than the first leaf (%s)\n", - split->name, split->graph->leafs[j]->name, - ggml_backend_name(split->graph->leafs[j]->backend_s), - ggml_backend_name(split->graph->leafs[0]->backend_s)); - } - } - } -} - -void ggml_graph_splits_compute(struct ggml_graph_splits * splits) { - uint64_t copy_us = 0; - uint64_t compute_cpu_us = 0; - uint64_t compute_gpu_us = 0; - int n_nodes = 0; - for (int i = 0; i < splits->n_splits; i++) { - struct ggml_graph_split * split = &splits->splits[i]; - - //printf("computing split %i (%s) on backend %s (%i nodes)\n", i, split->name, ggml_backend_name(split->dst_inputs[0]->backend), split->graph->n_nodes); - - // copy the input tensor to the backend - uint64_t copy_start_us = ggml_time_us(); - for (int j = 0; split->src_inputs[j] != NULL; j++) { - //printf("\tcopying tensor %d (%s) (%s -> %s) (%lu bytes)\n", j, split->src_inputs[j]->name, ggml_backend_name(split->src_inputs[j]->backend), ggml_backend_name(split->dst_inputs[j]->backend), ggml_nbytes(split->src_inputs[j])); - //printf("%p %p\n", split->src_inputs[j], split->dst_inputs[j]); - ggml_backend_tensor_copy(split->src_inputs[j], split->dst_inputs[j]); - } - // ggml_backend_synchronize(split->dst_inputs[0]->backend); - copy_us += ggml_time_us() - copy_start_us; - -#if 0 - char split_filename[GGML_MAX_NAME]; - snprintf(split_filename, GGML_MAX_NAME, "split_%i.dot", i); - ggml_graph_dump_dot(split->graph, NULL, split_filename); -#endif - uint64_t start = ggml_time_us(); - ggml_backend_graph_compute(split->dst_inputs[0]->backend_s, split->graph); - //ggml_backend_synchronize(split->dst_inputs[0]->backend); - uint64_t end = ggml_time_us(); - if (strcmp(ggml_backend_name(split->dst_inputs[0]->backend_s), "CPU") == 0) { - compute_cpu_us += end - start; - } else { - compute_gpu_us += end - start; - } - - n_nodes += split->graph->n_nodes; - } - - //printf("ggml_graph_splits_compute: n_splits: %d, nodes: %d, copy: %.2fms, compute_cpu: %.2fms, compute_gpu: %.2fms\n", splits->n_splits, n_nodes, copy_us / 1000.0, compute_cpu_us / 1000.0, compute_gpu_us / 1000.0); - //exit(0); -} - -void 
ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits) { - // splits of the same backend are allocated together to ensure that dependencies from one split to the next - // are not overwritten when there is another split from a different backend between them (e.g. inpSA in llama.cpp) - bool visited[GGML_MAX_SPLITS] = {false}; - for (int i = 0; i < splits->n_splits; i++) { - if (!visited[i]) { - struct ggml_graph_split * split = &splits->splits[i]; - struct ggml_context * ctx = split->ctx; - struct ggml_cgraph * backend_graphs[GGML_MAX_SPLITS]; - struct ggml_tensor ** graph_inputs[GGML_MAX_SPLITS]; - struct ggml_tensor ** graph_outputs[GGML_MAX_SPLITS]; - int n_graphs = 0; - - for (int j = i; j < splits->n_splits; j++) { - if (splits->splits[j].ctx == ctx) { - graph_inputs[n_graphs] = splits->splits[j].dst_inputs; - graph_outputs[n_graphs] = j < splits->n_splits - 1 ? splits->splits[j + 1].src_inputs : NULL; - backend_graphs[n_graphs] = splits->splits[j].graph; - visited[j] = true; - n_graphs++; - } - } - - struct ggml_allocr * alloc = NULL; - ggml_allocr_alloc_graph_n(alloc, backend_graphs, n_graphs, graph_inputs, graph_outputs); - } - } -} -#endif From 1ad7c5ee442b797a2ce1992772991ecc216b54b9 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 18:46:29 +0200 Subject: [PATCH 23/23] rename ggml_backend_metal_set_n_threads to n_cb --- examples/gpt-2/main.cpp | 2 +- src/ggml-metal.h | 2 +- src/ggml-metal.m | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 61b53296b..0acb3a1b1 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -804,7 +804,7 @@ bool gpt2_eval( } #ifdef GGML_USE_METAL if (ggml_backend_is_metal(model.backend)) { - ggml_backend_metal_set_n_threads(model.backend, n_threads); + ggml_backend_metal_set_n_cb(model.backend, n_threads); } #endif ggml_backend_graph_compute(model.backend, gf); diff --git a/src/ggml-metal.h b/src/ggml-metal.h index cb5646587..096b844e3 100644 --- a/src/ggml-metal.h +++ b/src/ggml-metal.h @@ -98,7 +98,7 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); -GGML_API void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads); +GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); #ifdef __cplusplus } diff --git a/src/ggml-metal.m b/src/ggml-metal.m index 055d137f8..e56436394 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -1501,8 +1501,8 @@ bool ggml_backend_is_metal(ggml_backend_t backend) { return backend->interface.get_name == ggml_backend_metal_name; } -void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads) { +void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; - ggml_metal_set_n_cb(ctx, n_threads); + ggml_metal_set_n_cb(ctx, n_cb); }
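For reference, here is a minimal sketch (not part of the patch series itself) of the per-backend configuration pattern that the gpt-2 example ends up with after PATCH 19 and PATCH 23: the caller checks the backend type and then sets either the CPU thread count or the Metal command-buffer count before calling ggml_backend_graph_compute(). The helper name set_backend_threads is hypothetical; the ggml calls and headers are the ones introduced in the patches above.

#include "ggml/ggml.h"
#include "ggml/ggml-backend.h"

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

// hypothetical helper: configure the backend-specific knob before graph compute
static void set_backend_threads(ggml_backend_t backend, int n_threads) {
    if (ggml_backend_is_cpu(backend)) {
        // the CPU backend exposes a thread count
        ggml_backend_cpu_set_n_threads(backend, n_threads);
    }
#ifdef GGML_USE_METAL
    if (ggml_backend_is_metal(backend)) {
        // the Metal backend exposes a number of command buffers (n_cb) instead
        ggml_backend_metal_set_n_cb(backend, n_threads);
    }
#endif
    // followed by: ggml_backend_graph_compute(backend, gf);
}

Note that after PATCH 19 the ggml_backend_is_xxx() checks are implemented inside each backend (by comparing the interface's get_name pointer) and declared in that backend's own header rather than in ggml-backend.h, which is why the Metal branch above has to remain guarded by GGML_USE_METAL in application code.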