From 883f0bc2d3255f42ee81bc11b60ba9f0781b05d1 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 2 Oct 2023 16:07:22 +0200 Subject: [PATCH 01/23] ggml backends interface v1 --- examples/CMakeLists.txt | 6 +- examples/gpt-2/CMakeLists.txt | 10 + examples/gpt-2/main.cpp | 155 +++++-- include/ggml/ggml-alloc.h | 6 + include/ggml/ggml-backend.h | 151 +++++++ include/ggml/ggml.h | 15 +- src/CMakeLists.txt | 5 + src/ggml-alloc.c | 150 +++---- src/ggml-backend.c | 506 ++++++++++++++++++++++ src/ggml-cuda.cu | 507 ++++++++++++++++++---- src/ggml-cuda.h | 5 + src/ggml.c | 790 +++++++++------------------------- tests/CMakeLists.txt | 20 +- 13 files changed, 1489 insertions(+), 837 deletions(-) create mode 100644 include/ggml/ggml-backend.h create mode 100644 src/ggml-backend.c diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e3404fb8b..c0201c131 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -19,11 +19,11 @@ target_link_libraries(common-ggml PRIVATE ggml) target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) add_subdirectory(gpt-2) -add_subdirectory(gpt-j) +#add_subdirectory(gpt-j) add_subdirectory(whisper) add_subdirectory(mnist) -add_subdirectory(gpt-neox) -add_subdirectory(dolly-v2) +#add_subdirectory(gpt-neox) +#add_subdirectory(dolly-v2) add_subdirectory(replit) add_subdirectory(mpt) add_subdirectory(starcoder) diff --git a/examples/gpt-2/CMakeLists.txt b/examples/gpt-2/CMakeLists.txt index 1d9bcdd8a..2307a7dd9 100644 --- a/examples/gpt-2/CMakeLists.txt +++ b/examples/gpt-2/CMakeLists.txt @@ -11,3 +11,13 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) set(TEST_TARGET gpt-2-quantize) add_executable(${TEST_TARGET} quantize.cpp) target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +# +# For GPU offloading + +if (GGML_CUBLAS) + add_compile_definitions(GGML_USE_CUBLAS) +endif() +if (GGML_CLBLAST) + add_compile_definitions(GGML_USE_CLBLAST) +endif() diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 81859ca5c..184eb8e9a 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -1,5 +1,10 @@ #include "ggml/ggml.h" #include "ggml/ggml-alloc.h" +#include "ggml/ggml-backend.h" + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif #include "common.h" #include "common-ggml.h" @@ -70,11 +75,14 @@ struct gpt2_model { // struct ggml_context * ctx; + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer_w; + ggml_backend_buffer_t buffer_kv; std::map tensors; }; // load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) { +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, int n_gpu_layers) { printf("%s: loading model from '%s'\n", __func__, fname.c_str()); auto fin = std::ifstream(fname, std::ios::binary); @@ -155,7 +163,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & auto & ctx = model.ctx; - size_t ctx_size = 0; + size_t buffer_size = 0; { const auto & hparams = model.hparams; @@ -165,46 +173,44 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += 
n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + buffer_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + buffer_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + buffer_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + buffer_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + buffer_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - ctx_size += (6 + 12*n_layer)*512; // object overhead + buffer_size += (6 + 12*n_layer)*128; // alignment overhead - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: backend buffer size = %6.2f MB\n", __func__, buffer_size/(1024.0*1024.0)); } // create the ggml context { + size_t n_tensors = 2 + 6 + 12*model.hparams.n_layer; struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, + /*.mem_size =*/ ggml_tensor_overhead() * n_tensors, /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, + /*.no_alloc =*/ true, }; model.ctx = ggml_init(params); @@ -214,6 +220,31 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & } } + // initialize the backend +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(); + if (!model.backend) { + fprintf(stderr, "%s: 
ggml_backend_cuda_init() failed\n", __func__); + } + } +#endif + + if (!model.backend) { + // fallback to CPU backend + fprintf(stderr, "%s: using CPU backend\n", __func__); + model.backend = ggml_backend_cpu_init(); + } + + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cpu_init() failed\n", __func__); + return false; + } + + // allocate weights buffer + model.buffer_w = ggml_backend_alloc_buffer(model.backend, buffer_size); + // prepare memory for the weights { const auto & hparams = model.hparams; @@ -299,14 +330,25 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + + // allocate buffer and tensors + model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256); + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); + ggml_allocr_alloc(alloc, model.memory_k); + ggml_allocr_alloc(alloc, model.memory_v); + ggml_allocr_free(alloc); } // load weights { + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_w); + size_t total_size = 0; bool has_lm_head = false; + std::vector read_buf; + while (true) { int32_t n_dims; int32_t length; @@ -336,6 +378,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & } auto tensor = model.tensors[name]; + ggml_set_name(tensor, name.c_str()); if (ggml_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); return false; @@ -360,11 +403,19 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & return false; } - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + // read into a temporary buffer first, then copy to the tensor + // TODO: read directly into the tensor if the backend is CPU + read_buf.resize(ggml_nbytes(tensor)); + fin.read(read_buf.data(), ggml_nbytes(tensor)); + + ggml_allocr_alloc(alloc, tensor); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); // GPT-2 models share the WTE tensor as the LM head if (name == "model/wte" && has_lm_head == false) { - memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + //ggml_allocr_alloc(alloc, model.lm_head); + //ggml_backend_tensor_copy(tensor, model.lm_head); + model.lm_head = tensor; } if (name == "model/lm_head") { @@ -374,6 +425,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & total_size += ggml_nbytes(tensor); } + ggml_allocr_free(alloc); printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); } @@ -416,21 +468,23 @@ struct ggml_cgraph * gpt2_graph( // avoid writing to tensors if we are only measuring the memory usage if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd)); } struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); ggml_allocr_alloc(allocr, position); if (!ggml_allocr_is_measure(allocr)) { for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i; + int32_t v = n_past + i; + ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v)); } } struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_allocr_alloc(allocr, KQ_scale); if (!ggml_allocr_is_measure(allocr)) { - ggml_set_f32(KQ_scale, 
1.0f/sqrtf(float(n_embd)/n_head)); + float s = 1.0f/sqrtf(float(n_embd)/n_head); + ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s)); } // wte + wpe @@ -453,7 +507,8 @@ struct ggml_cgraph * gpt2_graph( ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + //ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + model.layers[il].ln_1_b); } // attn @@ -599,7 +654,8 @@ struct ggml_cgraph * gpt2_graph( ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + //ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + model.layers[il].ln_2_b); } // fully connected @@ -654,7 +710,8 @@ struct ggml_cgraph * gpt2_graph( ggml_mul(ctx0, ggml_repeat(ctx0, model.ln_f_g, inpL), inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); + //ggml_repeat(ctx0, model.ln_f_b, inpL)); + model.ln_f_b); } // inpL = WTE * inpL @@ -703,11 +760,12 @@ bool gpt2_eval( ggml_allocr_alloc_graph(allocr, gf); // run the computation - struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); - static std::vector work_buffer; - work_buffer.resize(plan.work_size); - plan.work_data = work_buffer.data(); - ggml_graph_compute(gf, &plan); +#ifndef GGML_USE_CUBLAS + // FIXME: the backend may be CPU even if CUDA is enabled + // if (model.backend.id == GGML_BACKEND_ID_CPU) + ggml_backend_cpu_set_n_threads(model.backend, n_threads); +#endif + ggml_backend_graph_compute(model.backend, gf); //if (n_past%100 == 0) { // ggml_graph_print (&gf); @@ -718,11 +776,11 @@ bool gpt2_eval( struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + //ggml_backend_tensor_get(inpL, embd_w.data(), 0, sizeof(float)*n_vocab*N); // return result just for the last token embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + ggml_backend_tensor_get(inpL, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab); return true; } @@ -759,7 +817,7 @@ int main(int argc, char ** argv) { { const int64_t t_start_us = ggml_time_us(); - if (!gpt2_model_load(params.model, model, vocab)) { + if (!gpt2_model_load(params.model, model, vocab, params.n_gpu_layers)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); return 1; } @@ -770,7 +828,7 @@ int main(int argc, char ** argv) { } // keep this buffer alive while evaluating the model - std::vector compute_buffer; + ggml_backend_buffer_t buf_compute; struct ggml_allocr * allocr = NULL; // allocate the compute buffer @@ -787,8 +845,8 @@ int main(int argc, char ** argv) { // recreate the allocator with the required memory ggml_allocr_free(allocr); - compute_buffer.resize(mem_size); - allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size); + allocr = ggml_allocr_new_from_buffer(buf_compute); fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); } @@ -888,5 +946,10 @@ int main(int argc, char ** argv) { ggml_free(model.ctx); + ggml_backend_buffer_free(model.buffer_w); + ggml_backend_buffer_free(model.buffer_kv); + ggml_backend_buffer_free(buf_compute); + ggml_backend_free(model.backend); + return 0; } diff --git a/include/ggml/ggml-alloc.h b/include/ggml/ggml-alloc.h index 0c224f174..c87139491 100644 --- a/include/ggml/ggml-alloc.h +++ b/include/ggml/ggml-alloc.h @@ -6,9 
+6,11 @@ extern "C" { #endif +struct ggml_backend_buffer; GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); +GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer); // tell the allocator to parse nodes following the order described in the list // you should call this if your graph are optimized to execute out-of-order @@ -20,6 +22,10 @@ GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor); GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc); +GGML_API size_t ggml_allocr_alloc_graph_n( + struct ggml_allocr * alloc, + struct ggml_cgraph ** graphs, int n_graphs, + struct ggml_tensor *** inputs, struct ggml_tensor *** outputs); #ifdef __cplusplus diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h new file mode 100644 index 000000000..17e5a38a7 --- /dev/null +++ b/include/ggml/ggml-backend.h @@ -0,0 +1,151 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + typedef struct ggml_backend_s * ggml_backend_t; + + // backend buffer + struct ggml_backend_buffer; + typedef struct ggml_backend_buffer * ggml_backend_buffer_t; + typedef void * ggml_buffer_context_t; + + struct ggml_backend_buffer_interface { + void (*free_buffer) (ggml_backend_buffer_t buffer); + size_t (*get_alignment) (ggml_backend_buffer_t buffer); + void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer + size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback + void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback + void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback + + }; + + struct ggml_backend_buffer { + struct ggml_backend_buffer_interface interface; + ggml_backend_t backend; + ggml_buffer_context_t context; + size_t size; + }; + + // backend buffer functions + GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(struct ggml_backend_buffer_interface interface, ggml_backend_t backend, ggml_buffer_context_t context, size_t size); + GGML_API void ggml_backend_buffer_free(ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + + // backend + typedef void * ggml_backend_context_t; + typedef void * ggml_graph_plan_t; + + struct ggml_backend_interface { + const char * (*get_name)(ggml_backend_t backend); + + void (*free)(ggml_backend_t backend); + + // buffer allocation + ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size); + + // tensor data access + // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize + void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * 
data, size_t offset, size_t size); + void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + void (*synchronize) (ggml_backend_t backend); + + // (optional) copy tensor between different backends, allow for single-copy tranfers + void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + + // compute graph with a plan + ggml_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + void (*graph_plan_free) (ggml_backend_t backend, ggml_graph_plan_t plan); + void (*graph_plan_compute)(ggml_backend_t backend, ggml_graph_plan_t plan); + // compute graph without a plan + void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); + + // check if the backend supports an operation + bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + }; + + struct ggml_backend_s { + struct ggml_backend_interface interface; + ggml_backend_context_t context; + }; + + // backend helper functions + static inline ggml_backend_t get_backend(const struct ggml_tensor * tensor) { return tensor->buffer->backend; } + + static inline const char * ggml_backend_name(ggml_backend_t backend) { return backend->interface.get_name(backend); } + static inline void ggml_backend_free(ggml_backend_t backend) { backend->interface.free(backend); } + static inline ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { return backend->interface.alloc_buffer(backend, size); } + static inline void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { get_backend(tensor)->interface.set_tensor_async(get_backend(tensor), tensor, data, offset, size); } + static inline void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { get_backend(tensor)->interface.get_tensor_async(get_backend(tensor), tensor, data, offset, size); } + static inline void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { get_backend(tensor)->interface.set_tensor_async(get_backend(tensor), tensor, data, offset, size); get_backend(tensor)->interface.synchronize(get_backend(tensor)); } + static inline void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { get_backend(tensor)->interface.get_tensor_async(get_backend(tensor), tensor, data, offset, size); get_backend(tensor)->interface.synchronize(get_backend(tensor)); } + static inline void ggml_backend_synchronize(ggml_backend_t backend) { backend->interface.synchronize(backend); } + static inline ggml_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->interface.graph_plan_create(backend, cgraph); } + static inline void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_free(backend, plan); } + static inline void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_compute(backend, plan); } + static inline void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { backend->interface.graph_compute(backend, cgraph); } + static inline bool ggml_backend_supports_op(ggml_backend_t backend, const 
struct ggml_tensor * op) { return backend->interface.supports_op(backend, op); } + + // tensor copy between different backends + GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); + + // CPU backend + GGML_API ggml_backend_t ggml_backend_cpu_init(void); + GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); + GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + + /////////////////////////// + +#if 0 + // graph splitting + #define GGML_MAX_SPLITS 200 + #define GGML_MAX_SPLIT_INPUTS 4 + + struct ggml_graph_split { + char name[GGML_MAX_NAME]; + struct ggml_context * ctx; + struct ggml_tensor * src_inputs[GGML_MAX_SPLIT_INPUTS + 1]; + struct ggml_tensor * dst_inputs[GGML_MAX_SPLIT_INPUTS + 1]; + struct ggml_cgraph * graph; + }; + + // TODO: this shouldn't be fixed size, allocate from ggml_context + struct ggml_graph_splits { + int n_splits; + struct ggml_graph_split splits[GGML_MAX_SPLITS]; + }; + + // TODO: allocate in ggml_context + GGML_API struct ggml_graph_splits ggml_graph_split_init(void); + + // this won't be needed once we can allocate graphs from a ggml_context + GGML_API void ggml_graph_splits_free(struct ggml_graph_splits * splits); + + // add a split to the graph - single and multiple inputs versions + GGML_API void ggml_graph_splits_add(struct ggml_graph_splits * splits, struct ggml_tensor ** input, struct ggml_context * ctx, const char * fmt, ...); + GGML_API void ggml_graph_splits_add_n(struct ggml_graph_splits * splits, struct ggml_tensor *** inputs, struct ggml_context * ctx, const char * fmt, ...); + + // build graphs for all splits + GGML_API void ggml_graph_splits_build_forward(struct ggml_graph_splits * splits, struct ggml_tensor * output); + + // compute + GGML_API void ggml_graph_splits_compute(struct ggml_graph_splits * splits); + + // graph tensor allocator + GGML_API void ggml_graph_allocate_tensors(struct ggml_cgraph * graph, struct ggml_context * ctx); + GGML_API void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits); + + // automatically split a graph into multiple graphs based on the location of the tensors + GGML_API struct ggml_graph_splits ggml_graph_split(struct ggml_cgraph * graph, struct ggml_context * ctx); +#endif + +#ifdef __cplusplus +} +#endif diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index a9d4e33d9..db7cad0dc 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -401,14 +401,10 @@ extern "C" { GGML_OP_CLAMP, GGML_OP_CONV_1D, GGML_OP_CONV_2D, - GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, - GGML_OP_CONV_1D_STAGE_0, // internal - GGML_OP_CONV_1D_STAGE_1, // internal - GGML_OP_UPSCALE, // nearest interpolate GGML_OP_FLASH_ATTN, @@ -481,6 +477,7 @@ extern "C" { struct ggml_tensor { enum ggml_type type; enum ggml_backend backend; + struct ggml_backend_buffer * buffer; int n_dims; int64_t ne[GGML_MAX_DIMS]; // number of elements @@ -514,7 +511,7 @@ extern "C" { void * extra; // extra things e.g. 
for ggml-cuda.cu - char padding[4]; + char padding[12]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -1390,14 +1387,6 @@ extern "C" { int s, int d); - GGML_API struct ggml_tensor * ggml_conv_transpose_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0); - GGML_API struct ggml_tensor * ggml_conv_2d( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c857659ff..95f91e331 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -212,6 +212,9 @@ if (GGML_CUBLAS) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() + if (CMAKE_BUILD_TYPE MATCHES Debug) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") + endif() else() message(WARNING "cuBLAS not found") endif() @@ -249,8 +252,10 @@ endif() add_library(${TARGET} ggml.c ggml-alloc.c + ggml-backend.c ../include/ggml/ggml.h ../include/ggml/ggml-alloc.h + ../include/ggml/ggml-backend.h ${GGML_CUDA_SOURCES} ${GGML_OPENCL_SOURCES} ${GGML_METAL_SOURCES} diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index 805759db7..afb4e10cf 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -1,4 +1,5 @@ #include "ggml-alloc.h" +#include "ggml-backend.h" #include "ggml.h" #include #include @@ -6,25 +7,6 @@ #include #include -#ifdef __has_include - #if __has_include() - #include - #if defined(_POSIX_MAPPED_FILES) - #include - #include - #endif - #endif -#endif - -#if defined(_WIN32) - #define WIN32_LEAN_AND_MEAN - #ifndef NOMINMAX - #define NOMINMAX - #endif - #include - #include -#endif - #define UNUSED(x) (void)(x) #define MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -80,8 +62,9 @@ struct free_block { #define MAX_FREE_BLOCKS 256 struct ggml_allocr { + ggml_backend_buffer_t buffer; + bool buffer_owned; void * data; - size_t size; size_t alignment; int n_free_blocks; struct free_block free_blocks[MAX_FREE_BLOCKS]; @@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens } #endif -static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { - return ggml_nbytes(tensor); - - UNUSED(alloc); -} - // check if a tensor is allocated by this buffer static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) { - void * ptr = tensor->data; - return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size; + return tensor->buffer == alloc->buffer; } static bool ggml_is_view(struct ggml_tensor * t) { @@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) { } void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { -#ifdef GGML_ALLOCATOR_DEBUG GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated -#endif - size_t size = ggml_allocr_get_alloc_size(alloc, tensor); + + size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); @@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) tensor->data = addr; AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data); + tensor->buffer = alloc->buffer; + ggml_backend_buffer_init_tensor(alloc->buffer, tensor); #ifdef GGML_ALLOCATOR_DEBUG 
add_allocated_tensor(alloc, tensor); @@ -208,24 +185,27 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) // this is a very naive implementation, but for our case the number of free blocks should be very small static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { - void * ptr = tensor->data; - if (ggml_allocr_is_own(alloc, tensor) == false) { // the tensor was not allocated in this buffer // this can happen because the graph allocator will try to free weights and other tensors from different buffers // the easiest way to deal with this is just to ignore it + AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, tensor->buffer, alloc->buffer); return; } - size_t size = ggml_allocr_get_alloc_size(alloc, tensor); + size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size); + ggml_backend_buffer_free_tensor(alloc->buffer, tensor); + #ifdef GGML_ALLOCATOR_DEBUG remove_allocated_tensor(alloc, tensor); #endif + void * ptr = tensor->data; + // see if we can merge with an existing block for (int i = 0; i < alloc->n_free_blocks; i++) { struct free_block * block = &alloc->free_blocks[i]; @@ -285,16 +265,27 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) { alloc->n_free_blocks = 1; size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; - alloc->free_blocks[0].size = alloc->size - align_offset; + alloc->free_blocks[0].size = alloc->buffer->size - align_offset; } struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) { + struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size); + + struct ggml_allocr * alloc = ggml_allocr_new_from_buffer(buffer); + alloc->alignment = alignment; + alloc->buffer_owned = true; + + return alloc; +} + +struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) { struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); *alloc = (struct ggml_allocr){ - /*.data = */ data, - /*.size = */ size, - /*.alignment = */ alignment, + /*.buffer = */ buffer, + /*.buffer_owned = */ false, + /*.base = */ ggml_backend_buffer_get_base(buffer), + /*.alignment = */ ggml_backend_buffer_get_alignment(buffer), /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, /*.hash_table = */ {{0}}, @@ -312,68 +303,19 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) return alloc; } -// OS specific functions to allocate and free uncommitted virtual memory -static void * alloc_vmem(size_t size) { -#if defined(_WIN32) - return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS); -#elif defined(_POSIX_MAPPED_FILES) - void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); - if (ptr == MAP_FAILED) { - return NULL; - } - return ptr; -#else - // use a fixed address for other platforms - uintptr_t base_addr = (uintptr_t)-size - 0x100; - return (void *)base_addr; -#endif -} - -static void free_vmem(void * base_addr, size_t size) { -#if 
defined(_WIN32) - VirtualFree(base_addr, 0, MEM_RELEASE); - UNUSED(size); -#elif defined(_POSIX_MAPPED_FILES) - munmap(base_addr, size); -#else - // nothing to do - UNUSED(base_addr); - UNUSED(size); -#endif -} - -// allocate uncommitted virtual memory to measure the size of the graph -static void alloc_measure_vmem(void ** base_addr, size_t * size) { - // 128GB for 64-bit, 1GB for 32-bit - *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37; - do { - *base_addr = alloc_vmem(*size); - if (*base_addr != NULL) { - AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr); - return; - } - // try again with half the size - *size /= 2; - } while (*size > 0); - - GGML_ASSERT(!"failed to allocate virtual memory for measure buffer"); -} - -static void free_measure_vmem(void * base_addr, size_t size) { - free_vmem(base_addr, size); -} - struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); - void * base_addr; - size_t size; - - alloc_measure_vmem(&base_addr, &size); + // TODO: these should be set by the backend: + // - get_alignment() + // - get_alloc_size() + // TODO: support other backends + struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr((void *)0x1000, (size_t)-0x1001); *alloc = (struct ggml_allocr){ - /*.data = */ base_addr, - /*.size = */ size, + /*.buffer = */ buffer, + /*.buffer_owned = */ true, + /*.base = */ ggml_backend_buffer_get_base(buffer), /*.alignment = */ alignment, /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, @@ -393,8 +335,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { } void ggml_allocr_free(struct ggml_allocr * alloc) { - if (alloc->measure) { - free_measure_vmem(alloc->data, alloc->size); + if (alloc->buffer_owned) { + ggml_backend_buffer_free(alloc->buffer); } free(alloc); } @@ -451,6 +393,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) if (ggml_is_view(node)) { assert(node->view_src->data != NULL); node->data = (char *)node->view_src->data + node->view_offs; + node->buffer = node->view_src->buffer; + ggml_backend_buffer_init_tensor(alloc->buffer, node); // TODO: change to init_view } else { // see if we can reuse a parent's buffer (inplace) if (ggml_op_can_inplace(node->op)) { @@ -479,12 +423,16 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); node->data = parent->data; + node->buffer = parent->buffer; + ggml_backend_buffer_init_tensor(alloc->buffer, node); // TODO: change to init_view return; } } else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); node->data = parent->data; + node->buffer = parent->buffer; + ggml_backend_buffer_init_tensor(alloc->buffer, node); // TODO: change to init_view return; } } @@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) } } -static size_t ggml_allocr_alloc_graph_tensors_n( +size_t ggml_allocr_alloc_graph_n( struct ggml_allocr * alloc, struct ggml_cgraph ** graphs, int n_graphs, struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { @@ -631,7 +579,11 @@ static size_t ggml_allocr_alloc_graph_tensors_n( } size_t ggml_allocr_alloc_graph(struct 
ggml_allocr * alloc, struct ggml_cgraph * graph) { - return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); + return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL); +} + +size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { + return alloc->max_size; } size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { diff --git a/src/ggml-backend.c b/src/ggml-backend.c new file mode 100644 index 000000000..d49c5e7a1 --- /dev/null +++ b/src/ggml-backend.c @@ -0,0 +1,506 @@ +#include "ggml-backend.h" +#include "ggml-alloc.h" +#include +#include +#include +#include +#include + +#define UNUSED(x) (void)(x) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + + +// backend buffer + +struct ggml_backend_buffer * ggml_backend_buffer_init(struct ggml_backend_buffer_interface interface, ggml_backend_t backend, ggml_buffer_context_t context, size_t size) { + struct ggml_backend_buffer * buffer = malloc(sizeof(struct ggml_backend_buffer)); + + GGML_ASSERT(interface.get_base != NULL); + + (*buffer) = (struct ggml_backend_buffer) { + /* .interface = */ interface, + /* .backend = */ backend, + /* .context = */ context, + /* .size = */ size, + }; + + return buffer; +} + +void ggml_backend_buffer_free(struct ggml_backend_buffer * buffer) { + if (buffer->interface.free_buffer != NULL) { + buffer->interface.free_buffer(buffer); + } +} + +size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { + if (buffer->interface.get_alignment) { + return buffer->interface.get_alignment(buffer); + } + return 64; +} + +void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { + return buffer->interface.get_base(buffer); +} + +size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + if (buffer->interface.get_alloc_size) { + return buffer->interface.get_alloc_size(buffer, tensor); + } + return ggml_nbytes(tensor); +} + +void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + if (buffer->interface.init_tensor) { + buffer->interface.init_tensor(buffer, tensor); + } +} + +void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + if (buffer->interface.free_tensor) { + buffer->interface.free_tensor(buffer, tensor); + } +} + +// backend copy + +static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) { + //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]); + //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]); + GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); + + // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src)); + + if (src == dst) { + return; + } + + // TODO: allow backends to support copy to/from same backend + + if (get_backend(dst)->interface.cpy_tensor_from != NULL) { + 
get_backend(dst)->interface.cpy_tensor_from(get_backend(dst)->context, src, dst); + } else if (get_backend(src)->interface.cpy_tensor_to != NULL) { + get_backend(src)->interface.cpy_tensor_to(get_backend(src)->context, src, dst); + } else { + // shouldn't be hit when copying from/to CPU + #ifndef NDEBUG + fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend)); + #endif + size_t nbytes = ggml_nbytes(src); + void * data = malloc(nbytes); + ggml_backend_tensor_get(src, data, 0, nbytes); + ggml_backend_tensor_set(dst, data, 0, nbytes); + free(data); + } +} + +// backend CPU + +struct ggml_backend_cpu_context { + int n_threads; + void * work_data; + size_t work_size; +}; + +static const char * ggml_backend_cpu_name(ggml_backend_t backend) { + return "CPU"; + + UNUSED(backend); +} + +static void ggml_backend_cpu_free(ggml_backend_t backend) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + free(cpu_ctx->work_data); + free(cpu_ctx); + free(backend); +} + +static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 + +static size_t ggml_backend_cpu_buffer_get_alignment(ggml_backend_buffer_t buffer) { + return TENSOR_ALIGNMENT; + UNUSED(buffer); +} + +static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *)buffer->context; +} + +static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); + UNUSED(buffer); +} + +static struct ggml_backend_buffer_interface cpu_backend_buffer_interface = { + /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_get_alignment, + /* .get_base = */ ggml_backend_cpu_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, + /* .free_tensor = */ NULL, +}; + +// for buffers from ptr, free is not called +static struct ggml_backend_buffer_interface cpu_backend_buffer_interface_from_ptr = { + /* .free_buffer = */ NULL, + /* .get_alignment = */ ggml_backend_cpu_buffer_get_alignment, + /* .get_base = */ ggml_backend_cpu_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, + /* .free_tensor = */ NULL, +}; + + +static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { + void * data = malloc(size + TENSOR_ALIGNMENT); + return ggml_backend_buffer_init(cpu_backend_buffer_interface, backend, data, size); +} + +static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + UNUSED(backend); +} + +static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + UNUSED(backend); +} + +static void ggml_backend_cpu_synchronize(ggml_backend_t backend) { + UNUSED(backend); +} + +static void 
ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends + ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +struct ggml_backend_cpu_plan { + struct ggml_cplan cplan; + struct ggml_cgraph cgraph; +}; + +static ggml_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_backend_cpu_plan * cpu_plan = malloc(sizeof(struct ggml_backend_cpu_plan)); + + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + cpu_plan->cgraph = *cgraph; + + if (cpu_plan->cplan.work_size > 0) { + cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + } + + return cpu_plan; +} + +static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_graph_plan_t plan) { + struct ggml_backend_cpu_plan * cpu_plan = (struct ggml_backend_cpu_plan *)plan; + + free(cpu_plan->cplan.work_data); + free(cpu_plan); + + UNUSED(backend); +} + +static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_graph_plan_t plan) { + struct ggml_backend_cpu_plan * cpu_plan = (struct ggml_backend_cpu_plan *)plan; + + ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); + + UNUSED(backend); +} + +static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + + if (cpu_ctx->work_size < cplan.work_size) { + // TODO: may be faster to free and use malloc to avoid the copy + cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size); + cpu_ctx->work_size = cplan.work_size; + } + + cplan.work_data = cpu_ctx->work_data; + + ggml_graph_compute(cgraph, &cplan); +} + +static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return true; + UNUSED(backend); + UNUSED(op); +} + +static struct ggml_backend_interface cpu_backend_interface = { + /* .get_name = */ ggml_backend_cpu_name, + /* .free = */ ggml_backend_cpu_free, + /* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer, + /* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async, + /* .synchronize = */ ggml_backend_cpu_synchronize, + /* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from, + /* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to, + /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, + /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, + /* .graph_compute = */ ggml_backend_cpu_graph_compute, + /* .supports_op = */ ggml_backend_cpu_supports_op, +}; + +ggml_backend_t ggml_backend_cpu_init(void) { + struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->work_data = NULL; + ctx->work_size = 0; + + ggml_backend_t cpu_backend = malloc(sizeof(struct 
ggml_backend_s)); + + *cpu_backend = (struct ggml_backend_s) { + /* .interface = */ cpu_backend_interface, + /* .context = */ ctx + }; + return cpu_backend; +} + +void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->n_threads = n_threads; +} + +struct ggml_backend_buffer * ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { + // TODO: NULL backend? + // TODO: no free + return ggml_backend_buffer_init(cpu_backend_buffer_interface_from_ptr, NULL, ptr, size); +} + +#if 0 +// splits + +struct ggml_graph_splits ggml_graph_split_init(void) { + struct ggml_graph_splits splits = {0}; + return splits; +} + +// TODO: this can be removed after allocating the graphs in a ggml_context +void ggml_graph_splits_free(struct ggml_graph_splits * splits) { + for (int i = 0; i < splits->n_splits; i++) { + if (splits->splits[i].graph) { + free(splits->splits[i].graph); + } + } +} + +static void ggml_graph_splits_add_n_va(struct ggml_graph_splits * splits, struct ggml_tensor *** inputs, struct ggml_context * ctx, const char * fmt, va_list args) { + GGML_ASSERT(splits->n_splits < GGML_MAX_SPLITS); + + struct ggml_graph_split * split = &splits->splits[splits->n_splits]; + + + if (splits->n_splits == 0) { + // always add the first split + int i = 0; + while (inputs[i] != NULL) { + GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS); + split->src_inputs[i] = *inputs[i]; + split->dst_inputs[i] = *inputs[i]; + i++; + } + split->src_inputs[i] = NULL; + split->dst_inputs[i] = NULL; + split->ctx = ctx; + } + // check if the split is on the same context as the previous one + else if (splits->n_splits > 0 && splits->splits[splits->n_splits - 1].ctx == ctx) { + // add to the previous split + char name[GGML_MAX_NAME - 2]; + int n = vsnprintf(name, sizeof(name), fmt, args); + char new_name[GGML_MAX_NAME]; + snprintf(new_name, sizeof(new_name), "%.*s,%s", GGML_MAX_NAME - n - 2, splits->splits[splits->n_splits - 1].name, name); + strcpy(splits->splits[splits->n_splits - 1].name, new_name); + return; + } else { + // add a new split + int i = 0; + while (inputs[i] != NULL) { + GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS); + split->src_inputs[i] = *inputs[i]; + split->dst_inputs[i] = ggml_dup_tensor(ctx, *inputs[i]); + ggml_format_name(split->dst_inputs[i], "%s (split output)", split->src_inputs[i]->name); + // TODO: maybe support different layouts in ggml_backend_cpy_tensor instead + for (int j = 0; j < GGML_MAX_DIMS; j++) { + split->dst_inputs[i]->nb[j] = split->src_inputs[i]->nb[j]; + } + ggml_set_name(split->dst_inputs[i], ggml_get_name(*inputs[i])); + *inputs[i] = split->dst_inputs[i]; + i++; + } + split->src_inputs[i] = NULL; + split->dst_inputs[i] = NULL; + split->ctx = ctx; + } + + vsnprintf(split->name, GGML_MAX_NAME, fmt, args); + split->graph = NULL; + splits->n_splits++; +} + +void ggml_graph_splits_add_n(struct ggml_graph_splits * splits, struct ggml_tensor *** input, struct ggml_context * ctx, const char * fmt, ...) { + va_list args; + va_start(args, fmt); + ggml_graph_splits_add_n_va(splits, input, ctx, fmt, args); + va_end(args); +} + +void ggml_graph_splits_add(struct ggml_graph_splits * splits, struct ggml_tensor ** input, struct ggml_context * ctx, const char * fmt, ...) 
{ + va_list args; + va_start(args, fmt); + ggml_graph_splits_add_n_va(splits, (struct ggml_tensor**[2]){ input, NULL }, ctx, fmt, args); + va_end(args); +} + +void ggml_graph_splits_build_forward(struct ggml_graph_splits * splits, struct ggml_tensor * output) { + struct ggml_tensor *last_outputs[2] = { output, NULL }; + struct ggml_tensor ** outputs; + + for (int i = 0; i < splits->n_splits; i++) { + struct ggml_graph_split * split = &splits->splits[i]; + + if (i < splits->n_splits - 1) { + outputs = splits->splits[i + 1].src_inputs; + } else { + outputs = last_outputs; + } + + // build the graph + // TODO: allocate graphs in context + split->graph = (struct ggml_cgraph *) malloc(sizeof(struct ggml_cgraph)); + memset(split->graph, 0, sizeof(struct ggml_cgraph)); + for (int j = 0; outputs[j] != NULL; j++) { + ggml_build_forward_expand(split->graph, outputs[j]); + } + + for (int j = 1; j < split->graph->n_nodes; j++) { + if (split->graph->nodes[j]->backend != split->graph->nodes[0]->backend) { + fprintf(stderr, "split %s: node %s has different backend (%s) than the first node (%s)\n", + split->name, split->graph->nodes[j]->name, + ggml_backend_name(split->graph->nodes[j]->backend_s), + ggml_backend_name(split->graph->nodes[0]->backend_s)); + } + } + for (int j = 1; j < split->graph->n_leafs; j++) { + if (split->graph->leafs[j]->backend != split->graph->leafs[0]->backend) { + fprintf(stderr, "split %s: leaf %s has different backend (%s) than the first leaf (%s)\n", + split->name, split->graph->leafs[j]->name, + ggml_backend_name(split->graph->leafs[j]->backend_s), + ggml_backend_name(split->graph->leafs[0]->backend_s)); + } + } + } +} + +void ggml_graph_splits_compute(struct ggml_graph_splits * splits) { + uint64_t copy_us = 0; + uint64_t compute_cpu_us = 0; + uint64_t compute_gpu_us = 0; + int n_nodes = 0; + for (int i = 0; i < splits->n_splits; i++) { + struct ggml_graph_split * split = &splits->splits[i]; + + //printf("computing split %i (%s) on backend %s (%i nodes)\n", i, split->name, ggml_backend_name(split->dst_inputs[0]->backend), split->graph->n_nodes); + + // copy the input tensor to the backend + uint64_t copy_start_us = ggml_time_us(); + for (int j = 0; split->src_inputs[j] != NULL; j++) { + //printf("\tcopying tensor %d (%s) (%s -> %s) (%lu bytes)\n", j, split->src_inputs[j]->name, ggml_backend_name(split->src_inputs[j]->backend), ggml_backend_name(split->dst_inputs[j]->backend), ggml_nbytes(split->src_inputs[j])); + //printf("%p %p\n", split->src_inputs[j], split->dst_inputs[j]); + ggml_backend_tensor_copy(split->src_inputs[j], split->dst_inputs[j]); + } + // ggml_backend_synchronize(split->dst_inputs[0]->backend); + copy_us += ggml_time_us() - copy_start_us; + +#if 0 + char split_filename[GGML_MAX_NAME]; + snprintf(split_filename, GGML_MAX_NAME, "split_%i.dot", i); + ggml_graph_dump_dot(split->graph, NULL, split_filename); +#endif + uint64_t start = ggml_time_us(); + ggml_backend_graph_compute(split->dst_inputs[0]->backend_s, split->graph); + //ggml_backend_synchronize(split->dst_inputs[0]->backend); + uint64_t end = ggml_time_us(); + if (strcmp(ggml_backend_name(split->dst_inputs[0]->backend_s), "CPU") == 0) { + compute_cpu_us += end - start; + } else { + compute_gpu_us += end - start; + } + + n_nodes += split->graph->n_nodes; + } + + //printf("ggml_graph_splits_compute: n_splits: %d, nodes: %d, copy: %.2fms, compute_cpu: %.2fms, compute_gpu: %.2fms\n", splits->n_splits, n_nodes, copy_us / 1000.0, compute_cpu_us / 1000.0, compute_gpu_us / 1000.0); + //exit(0); +} + +void 
ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits) { + // splits of the same backend are allocated together to ensure that dependencies from one split to the next + // are not overwritten when there is another split from a different backend between them (e.g. inpSA in llama.cpp) + bool visited[GGML_MAX_SPLITS] = {false}; + for (int i = 0; i < splits->n_splits; i++) { + if (!visited[i]) { + struct ggml_graph_split * split = &splits->splits[i]; + struct ggml_context * ctx = split->ctx; + struct ggml_cgraph * backend_graphs[GGML_MAX_SPLITS]; + struct ggml_tensor ** graph_inputs[GGML_MAX_SPLITS]; + struct ggml_tensor ** graph_outputs[GGML_MAX_SPLITS]; + int n_graphs = 0; + + for (int j = i; j < splits->n_splits; j++) { + if (splits->splits[j].ctx == ctx) { + graph_inputs[n_graphs] = splits->splits[j].dst_inputs; + graph_outputs[n_graphs] = j < splits->n_splits - 1 ? splits->splits[j + 1].src_inputs : NULL; + backend_graphs[n_graphs] = splits->splits[j].graph; + visited[j] = true; + n_graphs++; + } + } + + struct ggml_allocr * alloc = NULL; + ggml_allocr_alloc_graph_n(alloc, backend_graphs, n_graphs, graph_inputs, graph_outputs); + } + } +} +#endif diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 989c419cd..74b443e6c 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7,6 +7,8 @@ #include #include +#define GGML_CUDA_FORCE_DMMV // FIXME: ggml_cuda_op_mul_mat_vec_q produces wrong results with GPT-2 + #if defined(GGML_USE_HIPBLAS) #include #include @@ -419,6 +421,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 #define CUDA_QUANTIZE_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 +#define CUDA_GET_ROWS_BLOCK_SIZE 256 // dmmv = dequantize_mul_mat_vec #ifndef GGML_CUDA_DMMV_X @@ -1574,6 +1577,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest reinterpret_cast(y[ib].ds.y) = sum; } +template +static __global__ void k_get_rows(const void * x, const int * y, dst_t * dst, const int ncols) { + const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2; + const int row = blockDim.y*blockIdx.y + threadIdx.y; + + if (col >= ncols) { + return; + } + + const int r = y[row]; + + // copy x[r*ncols + col] to dst[row*ncols + col] + const int xi = r*ncols + col; + const int di = row*ncols + col; + + const int ib = xi/qk; // block index + const int iqs = (xi%qk)/qr; // quant index + const int iybs = di - di%qk; // y block start index + const int y_offset = qr == 1 ? 
1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(x, ib, iqs, v); + + dst[iybs + iqs + 0] = v.x; + dst[iybs + iqs + y_offset] = v.y; +} + template static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; @@ -4555,6 +4586,15 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale dst[i] = scale * x[i]; } + +template +static void get_rows_cuda(const void * x, const int * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { + const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); + const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); + const dim3 block_nums(block_num_x, nrows, 1); + k_get_rows<<>>(x, y, dst, ncols); +} + static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; add_f32<<>>(x, y, dst, kx, ky); @@ -5703,7 +5743,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); kind = cudaMemcpyDeviceToDevice; - struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; CUDA_CHECK(cudaGetDevice(&id)); src_ptr = (char *) extra->data_device[id]; @@ -5739,6 +5779,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( } } +static void ggml_cuda_op_repeat( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) { + // guaranteed to be an integer due to the check in ggml_can_repeat + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // TODO: very inefficient, implement in a kernel + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + CUDA_CHECK(cudaMemcpyAsync( + (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0, + (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01, + ne00*nb0, cudaMemcpyDeviceToDevice, stream)); + } + } + } + } + } + } + } + + (void) src1; + (void) src1_d; +} + +static void ggml_cuda_op_get_rows( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_d, const float * src1_d, float * dst_d, const 
cudaStream_t & stream) { + + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + const int ncols = src0->ne[0]; + const int nrows = ggml_nelements(src1); + + const int * src1_i32 = (const int *) src1_d; + + switch (src0->type) { + case GGML_TYPE_F16: + get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_F32: + get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q4_0: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q4_1: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q5_0: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q5_1: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q8_0: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + default: + // TODO: k-quants + GGML_ASSERT(false); + break; + } +} + inline void ggml_cuda_op_add( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { @@ -6343,7 +6484,14 @@ inline void ggml_cuda_op_scale( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - const float scale = ((float *) src1->data)[0]; + float scale; + // HACK: support for ggml backend interface + if (src1->backend == GGML_BACKEND_CPU) { + scale = ((float *) src1->data)[0]; + } else { + // TODO: pass pointer to kernel instead of copying to host + CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost)); + } scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); @@ -6362,9 +6510,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = use_src1 ? 
(ggml_tensor_extra_gpu *) src1->extra : nullptr; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; @@ -6505,9 +6653,9 @@ static void ggml_cuda_op_mul_mat( const size_t q8_1_ts = sizeof(block_q8_1); const size_t q8_1_bs = QK8_1; - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); @@ -6758,6 +6906,14 @@ static void ggml_cuda_op_mul_mat( } } +static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat); +} + +static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows); +} + static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add); } @@ -6812,13 +6968,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); @@ -6843,13 +6999,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; const int64_t 
row_stride_x = nb01 / sizeof(half); @@ -6870,11 +7026,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } - if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_p021(src0, src1, dst); } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - }else if (src0->type == GGML_TYPE_F32) { + } else if (src0->type == GGML_TYPE_F32) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { @@ -6935,8 +7091,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; @@ -6992,7 +7148,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { const size_t nb1 = tensor->nb[1]; ggml_backend backend = tensor->backend; - struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; + ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); for (int64_t id = 0; id < g_device_count; ++id) { @@ -7046,7 +7202,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); } - CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); extra->data_device[id] = buf; @@ -7085,17 +7240,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) { delete extra; } -static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; +static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; static size_t g_temp_tensor_extra_index = 0; -static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { +static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { if (g_temp_tensor_extras == nullptr) { g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; } size_t alloc_index = g_temp_tensor_extra_index; g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES; - struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; + ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; memset(extra, 0, sizeof(*extra)); return extra; @@ -7123,7 +7278,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra return; } - struct ggml_tensor_extra_gpu * extra; + ggml_tensor_extra_gpu * extra; const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || tensor->op == GGML_OP_VIEW || @@ -7132,7 +7287,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra CUDA_CHECK(ggml_cuda_set_device(g_main_device)); if 
(inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t offset = 0; if (tensor->op == GGML_OP_VIEW) { @@ -7141,7 +7296,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src0_ddc + offset; } else if (tensor->op == GGML_OP_CPY) { - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; void * src1_ddv = src1_extra->data_device[g_main_device]; extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src1_ddv; @@ -7183,13 +7338,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); } - struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); + ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || tensor->op == GGML_OP_VIEW; if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t view_offset = 0; if (tensor->op == GGML_OP_VIEW) { @@ -7207,7 +7362,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(ggml_is_contiguous(tensor)); - struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice)); } @@ -7264,58 +7419,47 @@ void ggml_cuda_free_scratch() { g_scratch_buffer = nullptr; } -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ +bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { ggml_cuda_func_t func; const bool any_on_device = tensor->backend == GGML_BACKEND_GPU || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) { + return false; + } + switch (tensor->op) { + case GGML_OP_REPEAT: + func = ggml_cuda_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_cuda_get_rows; + break; case GGML_OP_DUP: - if (!any_on_device) { - return false; - } func = ggml_cuda_dup; break; case GGML_OP_ADD: - if (!any_on_device) { - return false; - } func = ggml_cuda_add; break; case GGML_OP_MUL: - if (!any_on_device) { - return false; - } func = ggml_cuda_mul; break; case GGML_OP_UNARY: switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_GELU: - if (!any_on_device) { - return 
false; - } func = ggml_cuda_gelu; break; case GGML_UNARY_OP_SILU: - if (!any_on_device) { - return false; - } func = ggml_cuda_silu; break; default: return false; } break; case GGML_OP_NORM: - if (!any_on_device) { - return false; - } func = ggml_cuda_norm; break; case GGML_OP_RMS_NORM: - if (!any_on_device) { - return false; - } func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: @@ -7325,54 +7469,30 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_mul_mat; break; case GGML_OP_SCALE: - if (!any_on_device) { - return false; - } func = ggml_cuda_scale; break; case GGML_OP_CPY: - if (!any_on_device) { - return false; - } func = ggml_cuda_cpy; break; case GGML_OP_CONT: - if (!any_on_device) { - return false; - } func = ggml_cuda_dup; break; case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: - if (!any_on_device) { - return false; - } func = ggml_cuda_nop; break; case GGML_OP_DIAG_MASK_INF: - if (!any_on_device) { - return false; - } func = ggml_cuda_diag_mask_inf; break; case GGML_OP_SOFT_MAX: - if (!any_on_device) { - return false; - } func = ggml_cuda_soft_max; break; case GGML_OP_ROPE: - if (!any_on_device) { - return false; - } func = ggml_cuda_rope; break; case GGML_OP_ALIBI: - if (!any_on_device) { - return false; - } func = ggml_cuda_alibi; break; default: @@ -7400,3 +7520,240 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); snprintf(description, description_size, "%s", prop.name); } + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +#define UNUSED(x) (void)(x) + +struct ggml_backend_cuda_context { +}; + +static const char * ggml_backend_cuda_name(ggml_backend_t backend) { + return GGML_CUDA_NAME; + + UNUSED(backend); +} + +static void ggml_backend_cuda_free(ggml_backend_t backend) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + delete cuda_ctx; + delete backend; +} + +struct ggml_cuda_buffer_context { + void * device; + + ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; + size_t temp_tensor_extra_index = 0; + + ~ggml_cuda_buffer_context() { + delete[] temp_tensor_extras; + } + + ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (temp_tensor_extras == nullptr) { + temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; + } + + size_t alloc_index = temp_tensor_extra_index; + temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES; + ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; + } +}; + +static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; + CUDA_CHECK(cudaFree(ctx->device)); + delete ctx; +} + +static size_t ggml_backend_cuda_buffer_get_alignment(ggml_backend_buffer_t buffer) { + return 128; + UNUSED(buffer); +} + +static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; + return ctx->device; +} + +static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + int64_t row_low = 0; + int64_t row_high = ggml_nrows(tensor); + int64_t nrows_split = row_high - row_low; + + return ggml_nbytes_split(tensor, nrows_split); + + UNUSED(buffer); 
+} + +static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; + ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra(); + + extra->data_device[g_main_device] = tensor->data; + + tensor->backend = GGML_BACKEND_GPU; + tensor->extra = extra; + + // initialize padding to 0 to avoid possible NaN values + size_t original_size = ggml_nbytes(tensor); + size_t size = ggml_backend_cuda_buffer_get_alloc_size(buffer, tensor); + + if (size > original_size && tensor->view_src == nullptr) { + CUDA_CHECK(cudaMemset((char *) tensor->data + original_size, 0, size - original_size)); + } + + UNUSED(buffer); +} + +static struct ggml_backend_buffer_interface cuda_backend_buffer_interface = { + /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, + /* .get_alignment = */ ggml_backend_cuda_buffer_get_alignment, + /* .get_base = */ ggml_backend_cuda_buffer_get_base, + /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size, + /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, + /* .free_tensor = */ NULL, +}; + +static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) { + ggml_cuda_buffer_context * ctx = new ggml_cuda_buffer_context; + CUDA_CHECK(cudaMalloc(&ctx->device, size)); + return ggml_backend_buffer_init(cuda_backend_buffer_interface, backend, ctx, size); +} + +static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + //GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0])); + + UNUSED(backend); +} + +static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); + + UNUSED(backend); +} + +static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { + CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + + UNUSED(backend); +} + +static ggml_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { + GGML_ASSERT(!"not implemented"); + + return nullptr; + + UNUSED(backend); + UNUSED(cgraph); +} + +static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_graph_plan_t plan) { + GGML_ASSERT(!"not implemented"); + + UNUSED(backend); + UNUSED(plan); +} + +static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_graph_plan_t plan) { + GGML_ASSERT(!"not implemented"); + + UNUSED(backend); + UNUSED(plan); +} + +#include +static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_compute_params params = {}; + params.type = GGML_TASK_COMPUTE; + params.ith = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + // views of allocated tensors don't call init_tensor, handle them 
here + // TODO: handle in ggml-alloc + if (node->extra == nullptr) { + GGML_ASSERT(node->view_src != nullptr); + GGML_ASSERT(node->view_src->backend == GGML_BACKEND_GPU); + ggml_backend_cuda_buffer_init_tensor(node->buffer, node); + } + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j] != nullptr && node->src[j]->extra == nullptr) { + GGML_ASSERT(node->src[j]->view_src != nullptr); + GGML_ASSERT(node->src[j]->view_src->backend == GGML_BACKEND_GPU); + ggml_backend_cuda_buffer_init_tensor(node->src[j]->buffer, node->src[j]); + } + } + + bool ok = ggml_cuda_compute_forward(¶ms, node); + if (!ok) { + fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + } + GGML_ASSERT(ok); + +#if 0 + if (node->type == GGML_TYPE_F32) { + cudaDeviceSynchronize(); + std::vector tmp(ggml_nelements(node), 0.0f); + cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost); + printf("\n%s (%s) (%s %s): ", node->name, ggml_op_name(node->op), + ggml_type_name(node->src[0]->type), + node->src[1] ? ggml_type_name(node->src[1]->type) : "none"); + double sum = 0.0; + double sq_sum = 0.0; + for (int i = 0; i < ggml_nelements(node); i++) { + //printf("%f ", tmp[i]); + sum += tmp[i]; + sq_sum += tmp[i]*tmp[i]; + } + //printf("\n"); + printf("sum: %f, ", sum); + printf("sq_sum: %f\n", sq_sum); + } +#endif + } + + UNUSED(backend); +} + +static ggml_backend_interface cuda_backend_interface = { + /* .get_name = */ ggml_backend_cuda_name, + /* .free = */ ggml_backend_cuda_free, + /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer, + /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, + /* .synchronize = */ ggml_backend_cuda_synchronize, + /* .cpy_tensor_from = */ nullptr, + /* .cpy_tensor_to = */ nullptr, + /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, + /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, + /* .graph_compute = */ ggml_backend_cuda_graph_compute, + /* .supports_op = */ nullptr, +}; + +ggml_backend_t ggml_backend_cuda_init() { + ggml_init_cublas(); // TODO: remove from ggml.c + + ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context; + + ggml_backend_t cuda_backend = new ggml_backend_s; + *cuda_backend = (ggml_backend_s){ + /* .interface = */ cuda_backend_interface, + /* .context = */ ctx + }; + return cuda_backend; +} diff --git a/src/ggml-cuda.h b/src/ggml-cuda.h index fda704b66..81ee9a2e9 100644 --- a/src/ggml-cuda.h +++ b/src/ggml-cuda.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" #ifdef GGML_USE_HIPBLAS #define GGML_CUDA_NAME "ROCm" @@ -42,6 +43,10 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s GGML_API int ggml_cuda_get_device_count(void); GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size); +// backend API +GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use + + #ifdef __cplusplus } #endif diff --git a/src/ggml.c b/src/ggml.c index b72069087..b1d11ba5c 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4081,16 +4081,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ALIBI", "CLAMP", "CONV_1D", - "CONV_TRANSPOSE_1D", "CONV_2D", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", - "CONV_1D_STAGE_0", - "CONV_1D_STAGE_1", - "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", @@ -4116,7 
+4112,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -4167,16 +4163,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "alibi(x)", "clamp(x)", "conv_1d(x)", - "conv_transpose_1d(x)", "conv_2d(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", - "conv_1d_stage_0(x)", - "conv_1d_stage_1(x)", - "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", @@ -4202,7 +4194,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4231,10 +4223,7 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_DIAG_MASK_INF ] = true; p[GGML_OP_DIAG_MASK_ZERO ] = true; p[GGML_OP_CONV_1D ] = true; - p[GGML_OP_CONV_1D_STAGE_0 ] = true; - p[GGML_OP_CONV_1D_STAGE_1 ] = true; p[GGML_OP_CONV_2D ] = true; - p[GGML_OP_CONV_TRANSPOSE_1D ] = true; p[GGML_OP_CONV_TRANSPOSE_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -4951,6 +4940,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( *result = (struct ggml_tensor) { /*.type =*/ type, /*.backend =*/ GGML_BACKEND_CPU, + /*.buffer =*/ NULL, /*.n_dims =*/ n_dims, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, @@ -4983,6 +4973,11 @@ static struct ggml_tensor * ggml_new_tensor_impl( result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; } + if (view_src != NULL) { + result->backend = view_src->backend; + result->buffer = view_src->buffer; + } + ctx->n_objects++; return result; @@ -5797,7 +5792,7 @@ static struct ggml_tensor * ggml_mul_impl( bool inplace) { // TODO: support less-strict constraint // GGML_ASSERT(ggml_can_repeat(b, a)); - GGML_ASSERT(ggml_can_repeat_rows(b, a)); + //GGML_ASSERT(ggml_can_repeat_rows(b, a)); bool is_node = false; @@ -7514,17 +7509,14 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -// im2col: [N, IC, IL] => [N, OL, IC*K] -// a: [OC,IC, K] -// b: [N, IC, IL] -// result: [N, OL, IC*K] -static struct ggml_tensor * ggml_conv_1d_stage_0( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0) { +GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + GGML_ASSERT(ggml_is_matrix(b)); GGML_ASSERT(a->ne[1] == b->ne[1]); bool is_node = false; @@ -7533,54 +7525,16 @@ static struct ggml_tensor * ggml_conv_1d_stage_0( is_node = true; } - const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); - const int64_t ne[4] = { - a->ne[1] * a->ne[0], - OL, - b->ne[2], - 1, + ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), + a->ne[2], 1, 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { s0, p0, d0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CONV_1D_STAGE_0; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; -} - -// ggml_conv_1d_stage_1 - -// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] -// a: [OC, IC, K] -// b: [N, OL, IC * K] -// result: [N, OC, OL] -static struct ggml_tensor * ggml_conv_1d_stage_1( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { - b->ne[1], - a->ne[2], - b->ne[2], - 1, - }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - - result->op = GGML_OP_CONV_1D_STAGE_1; + result->op = GGML_OP_CONV_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7588,53 +7542,6 @@ static struct ggml_tensor * ggml_conv_1d_stage_1( return result; } -// ggml_conv_1d - -GGML_API struct ggml_tensor * ggml_conv_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0) { - struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0); - result = ggml_conv_1d_stage_1(ctx, a, result); - return result; -} - -// GGML_API struct ggml_tensor * ggml_conv_1d( -// struct ggml_context * ctx, -// struct ggml_tensor * a, -// struct ggml_tensor * b, -// int s0, -// int p0, -// int d0) { -// GGML_ASSERT(ggml_is_matrix(b)); -// GGML_ASSERT(a->ne[1] == b->ne[1]); -// bool is_node = false; - -// if (a->grad || b->grad) { -// GGML_ASSERT(false); // TODO: implement backward -// is_node = true; -// } - -// const int64_t ne[4] = { -// ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), -// a->ne[2], 1, 1, -// }; -// struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - -// int32_t params[] = { s0, p0, d0 }; -// ggml_set_op_params(result, params, sizeof(params)); - -// result->op = GGML_OP_CONV_1D; -// result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; -// result->src[0] = a; -// result->src[1] = b; - -// return result; -// } - // ggml_conv_1d_ph struct ggml_tensor* ggml_conv_1d_ph( @@ -7646,50 +7553,6 @@ struct ggml_tensor* ggml_conv_1d_ph( return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); } -// ggml_conv_transpose_1d - -static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { - return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; -} - -GGML_API struct ggml_tensor * ggml_conv_transpose_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[2] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); - - GGML_ASSERT(p0 == 0); - GGML_ASSERT(d0 == 1); - - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { - ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), - a->ne[1], b->ne[2], 1, - }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - - int32_t params[] = { s0, p0, d0 }; - ggml_set_op_params(result, params, sizeof(params)); - - result->op = GGML_OP_CONV_TRANSPOSE_1D; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - result->src[1] = b; - - return result; -} - // ggml_conv_2d struct ggml_tensor * ggml_conv_2d( @@ -13829,7 +13692,7 @@ static void ggml_compute_forward_rope_back( // ggml_compute_forward_conv_1d -static void ggml_compute_forward_conv_1d_f16_f32( +static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -13847,33 +13710,42 @@ static void ggml_compute_forward_conv_1d_f16_f32( const int nth = params->nth; const int nk = ne00; + const int nh = nk/2; - // size of the convolution row - the kernel size unrolled across all input channels - const int ew0 = nk*ne01; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + const int ew0 = ggml_up32(ne01); + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + // prepare kernel data (src0) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - ggml_fp16_t * dst_data = wdata; + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ew0 + i01] = src[i00]; + } + } + } + } - for (int64_t i0 = 0; i0 < ne0; i0++) { - for (int64_t ik = 0; ik < nk; ik++) { - const int idx0 = i0*s0 + ik*d0 - p0; + // prepare source data (src1) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; - if(!(idx0 < 0 || idx0 >= ne10)) { - dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]); - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + ggml_fp16_t * dst_data = wdata; + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); } } } @@ -13886,7 +13758,7 @@ static void ggml_compute_forward_conv_1d_f16_f32( } // total rows in dst - const int nr = ne2; + const int nr = ne02; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -13895,22 +13767,23 @@ static void ggml_compute_forward_conv_1d_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - - for (int i2 = 0; i2 < ne2; i2++) { - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); - - for (int i0 = 0; i0 < ne0; i0++) { - ggml_vec_dot_f16(ew0, dst_data + i0, - (ggml_fp16_t *) ((char *) src0->data + i1*nb02), - (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0); + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + for (int64_t i0 = 0; i0 < ne10; ++i0) { + dst_data[i0] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f16(ew0, &v, + (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 
+ nh + k)*ew0); + + dst_data[i0] += v; } } } } -static void ggml_compute_forward_conv_1d_f32( +static void ggml_compute_forward_conv_1d_s1_ph_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -13928,32 +13801,42 @@ static void ggml_compute_forward_conv_1d_f32( const int nth = params->nth; const int nk = ne00; + const int nh = nk/2; - const int ew0 = nk*ne01; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + const int ew0 = ggml_up32(ne01); + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - float * const wdata = (float *) params->wdata + 0; + // prepare kernel data (src0) + { + float * const wdata = (float *) params->wdata + 0; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - float * dst_data = wdata; + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + float * dst_data = wdata + i02*ew0*ne00; + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ew0 + i01] = src[i00]; + } + } + } + } - for (int64_t i0 = 0; i0 < ne0; i0++) { - for (int64_t ik = 0; ik < nk; ik++) { - const int idx0 = i0*s0 + ik*d0 - p0; + // prepare source data (src1) + { + float * const wdata = (float *) params->wdata + ne02*ew0*ne00; - if(!(idx0 < 0 || idx0 >= ne10)) { - dst_data[i0*ew0 + i11*nk + ik] = src[idx0]; - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; + for (int64_t i10 = 0; i10 < ne10; i10++) { + dst_data[(i10 + nh)*ew0 + i11] = src[i10]; } } } @@ -13975,225 +13858,35 @@ static void ggml_compute_forward_conv_1d_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - float * const wdata = (float *) params->wdata + 0; - - for (int i2 = 0; i2 < ne2; i2++) { - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); - - for (int i0 = 0; i0 < ne0; i0++) { - ggml_vec_dot_f32(ew0, dst_data + i0, - (float *) ((char *) src0->data + i1*nb02), - (float *) wdata + i2*nb2 + i0*ew0); - } - } - } -} - -static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, - ggml_fp16_t * A, - ggml_fp16_t * B, - float * C, - const int ith, const int nth) { - // does not seem to make a difference - int64_t m0, m1, n0, n1; - // patches per thread - if (m > n) { - n0 = 0; - n1 = n; - - // total patches in dst - const int np = m; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - m0 = dp*ith; - m1 = MIN(m0 + dp, np); - } else { - m0 = 0; - m1 = m; - - // total patches in dst - const int np = n; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - n0 = dp*ith; - n1 = MIN(n0 + dp, np); - } - - // block-tiling attempt - int64_t blck_n = 16; - int64_t blck_m = 16; - - // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB - // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K); - // if (blck_size > 0) { - // blck_0 = 4; - // blck_1 = blck_size / blck_0; - // if (blck_1 
< 0) { - // blck_1 = 1; - // } - // // blck_0 = (int64_t)sqrt(blck_size); - // // blck_1 = blck_0; - // } - // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1); - - for (int j = n0; j < n1; j+=blck_n) { - for (int i = m0; i < m1; i+=blck_m) { - // printf("i j k => %d %d %d\n", i, j, K); - for (int ii = i; ii < i + blck_m && ii < m1; ii++) { - for (int jj = j; jj < j + blck_n && jj < n1; jj++) { - ggml_vec_dot_f16(k, - C + ii*n + jj, - A + ii * k, - B + jj * k); - } - } - } - } -} - -// src0: kernel [OC, IC, K] -// src1: signal [N, IC, IL] -// dst: result [N, OL, IC*K] -static void ggml_compute_forward_conv_1d_stage_0_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - GGML_TENSOR_BINARY_OP_LOCALS; - - const int64_t N = ne12; - const int64_t IC = ne11; - const int64_t IL = ne10; - - const int64_t K = ne00; - - const int64_t OL = ne1; - - const int ith = params->ith; - const int nth = params->nth; - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == GGML_TASK_INIT) { - memset(dst->data, 0, ggml_nbytes(dst)); - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // im2col: [N, IC, IL] => [N, OL, IC*K] - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; - - for (int64_t in = 0; in < N; in++) { - for (int64_t iol = 0; iol < OL; iol++) { - for (int64_t iic = ith; iic < IC; iic+=nth) { - - // micro kernel - ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K] - const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL] - - for (int64_t ik = 0; ik < K; ik++) { - const int64_t iil = iol*s0 + ik*d0 - p0; - - if (!(iil < 0 || iil >= IL)) { - dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]); - } - } - } + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + for (int64_t i0 = 0; i0 < ne10; ++i0) { + dst_data[i0] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f32(ew0, &v, + (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0] += v; } } } } -// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] -// src0: [OC, IC, K] -// src1: [N, OL, IC * K] -// result: [N, OC, OL] -static void ggml_compute_forward_conv_1d_stage_1_f16( +static void ggml_compute_forward_conv_1d_s1_ph( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - GGML_TENSOR_BINARY_OP_LOCALS; - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb0 == sizeof(float)); - - const int N = ne12; - const int OL = ne11; - - const int OC = ne02; - const int IC = 
ne01; - const int K = ne00; - - const int ith = params->ith; - const int nth = params->nth; - - int64_t m = OC; - int64_t n = OL; - int64_t k = IC * K; - - // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] - for (int i = 0; i < N; i++) { - ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] - ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] - float * C = (float *)dst->data + i * m * n; // [m, n] - - gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); - } -} - -static void ggml_compute_forward_conv_1d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch(src0->type) { + switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_1d_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst); } break; default: { @@ -14202,43 +13895,7 @@ static void ggml_compute_forward_conv_1d( } } -static void ggml_compute_forward_conv_1d_stage_0( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch(src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -static void ggml_compute_forward_conv_1d_stage_1( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch(src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_conv_transpose_1d - -static void ggml_compute_forward_conv_transpose_1d_f16_f32( +static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14255,38 +13912,43 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00*ne01*ne02; + const int nk = ne00; + const int nh = nk/2; + + const int ew0 = ggml_up32(ne01); + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + // prepare kernel data (src0) { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; + ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ne02 + i02] = src[i00]; + dst_data[i00*ew0 + i01] = src[i00]; } } } } - // permute source data (src1) from (L x Cin) to (Cin x L) + // prepare source data (src1) { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; - ggml_fp16_t * dst_data = wdata; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; for (int64_t i11 = 
0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); + ggml_fp16_t * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]); + dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); } } } @@ -14298,10 +13960,8 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( return; } - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - // total rows in dst - const int nr = ne1; + const int nr = ne02; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -14310,26 +13970,23 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - ggml_fp16_t * const wdata_src = wdata + nk; - for (int i1 = ir0; i1 < ir1; i1++) { float * dst_data = (float *)((char *) dst->data + i1*nb1); - ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i10*ne11; - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - ggml_vec_dot_f16(ne02, &v, - (ggml_fp16_t *) wdata_src + i1n, - (ggml_fp16_t *) wdata_kernel + i00*ne02); - dst_data[i10*s0 + i00] += v; + for (int64_t i0 = 0; i0 < ne10; i0 += 2) { + dst_data[i0/2] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f16(ew0, &v, + (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0/2] += v; } } } } -static void ggml_compute_forward_conv_transpose_1d_f32( +static void ggml_compute_forward_conv_1d_s2_ph_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14346,24 +14003,29 @@ static void ggml_compute_forward_conv_transpose_1d_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00*ne01*ne02; + const int nk = ne00; + const int nh = nk/2; + + const int ew0 = ggml_up32(ne01); + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) + // prepare kernel data (src0) { float * const wdata = (float *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i01*ne00*ne02; + float * dst_data = wdata + i02*ew0*ne00; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00]; + dst_data[i00*ew0 + i01] = src[i00]; } } } @@ -14371,13 +14033,13 @@ static void ggml_compute_forward_conv_transpose_1d_f32( // prepare source data (src1) { - float * const wdata = (float *) params->wdata + nk; - float * dst_data = wdata; + float * const wdata = (float *) params->wdata + ne02*ew0*ne00; for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = src[i10]; + dst_data[(i10 + nh)*ew0 + i11] = src[i10]; } } } @@ -14389,10 +14051,8 @@ static void ggml_compute_forward_conv_transpose_1d_f32( return; } - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - // total 
rows in dst - const int nr = ne1; + const int nr = ne02; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -14401,26 +14061,23 @@ static void ggml_compute_forward_conv_transpose_1d_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - float * const wdata = (float *) params->wdata + 0; - float * const wdata_src = wdata + nk; - for (int i1 = ir0; i1 < ir1; i1++) { float * dst_data = (float *)((char *) dst->data + i1*nb1); - float * wdata_kernel = wdata + i1*ne02*ne00; - for (int i10 = 0; i10 < ne10; i10++) { - const int i1n = i10*ne11; - for (int i00 = 0; i00 < ne00; i00++) { - float v = 0; - ggml_vec_dot_f32(ne02, &v, - wdata_src + i1n, - wdata_kernel + i00*ne02); - dst_data[i10*s0 + i00] += v; + for (int64_t i0 = 0; i0 < ne10; i0 += 2) { + dst_data[i0/2] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f32(ew0, &v, + (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0/2] += v; } } } } -static void ggml_compute_forward_conv_transpose_1d( +static void ggml_compute_forward_conv_1d_s2_ph( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14428,11 +14085,11 @@ static void ggml_compute_forward_conv_transpose_1d( switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst); } break; default: { @@ -14441,6 +14098,27 @@ static void ggml_compute_forward_conv_transpose_1d( } } +// ggml_compute_forward_conv_1d + +static void ggml_compute_forward_conv_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + GGML_ASSERT(d0 == 1); // dilation not supported + GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported + if (s0 == 1) { + ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst); + } else if (s0 == 2) { + ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); + } else { + GGML_ASSERT(false); // only stride 1 and 2 supported + } +} + // ggml_compute_forward_conv_2d static void ggml_compute_forward_conv_2d_f16_f32( @@ -14483,22 +14161,20 @@ static void ggml_compute_forward_conv_2d_f16_f32( { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i13 = 0; i13 < ne13; i13++) { - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); - ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0); - - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 = 0; ik0 < nk0; ik0++) { - const int idx0 = i0*s0 + ik0*d0 - p0; - const int idx1 = i1*s1 + ik1*d1 - p1; - - if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); - } + for (int i12 = 0; i12 < ne12; i12++) { + const float * const src = (float *)((char *) src1->data + i12*nb12); + ggml_fp16_t * dst_data = wdata; + + for (int i1 = 
0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + for (int ik1 = 0; ik1 < nk1; ik1++) { + for (int ik0 = 0; ik0 < nk0; ik0++) { + const int idx0 = i0*s0 + ik0*d0 - p0; + const int idx1 = i1*s1 + ik1*d1 - p1; + + if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { + dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = + GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); } } } @@ -16781,18 +16457,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); } break; - case GGML_OP_CONV_1D_STAGE_0: - { - ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor); - } break; - case GGML_OP_CONV_1D_STAGE_1: - { - ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor); - } break; - case GGML_OP_CONV_TRANSPOSE_1D: - { - ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); - } break; case GGML_OP_CONV_2D: { ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -17718,22 +17382,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_1D_STAGE_0: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_CONV_1D_STAGE_1: - { - GGML_ASSERT(false); // TODO: not implemented - } break; case GGML_OP_CONV_2D: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_TRANSPOSE_1D: - { - GGML_ASSERT(false); // TODO: not implemented - } break; case GGML_OP_CONV_TRANSPOSE_2D: { GGML_ASSERT(false); // TODO: not implemented @@ -18575,68 +18227,21 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { GGML_ASSERT(node->src[1]->ne[2] == 1); GGML_ASSERT(node->src[1]->ne[3] == 1); - const int64_t ne00 = node->src[0]->ne[0]; - const int64_t ne01 = node->src[0]->ne[1]; - const int64_t ne02 = node->src[0]->ne[2]; - - const int64_t ne10 = node->src[1]->ne[0]; - const int64_t ne11 = node->src[1]->ne[1]; - - const int64_t ne0 = node->ne[0]; - const int64_t ne1 = node->ne[1]; - const int64_t nk = ne00; - const int64_t ew0 = nk * ne01; - - UNUSED(ne02); - UNUSED(ne10); - UNUSED(ne11); - size_t cur = 0; + const int nk = node->src[0]->ne[0]; if (node->src[0]->type == GGML_TYPE_F16 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] + ); } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(float)*(ne0*ne1*ew0); - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CONV_1D_STAGE_0: - { - n_tasks = n_threads; - } break; - case GGML_OP_CONV_1D_STAGE_1: - { - n_tasks = n_threads; - } break; - case GGML_OP_CONV_TRANSPOSE_1D: - { - n_tasks = n_threads; - - GGML_ASSERT(node->src[0]->ne[3] == 1); - GGML_ASSERT(node->src[1]->ne[2] == 1); - GGML_ASSERT(node->src[1]->ne[3] == 1); - - const int64_t ne00 = node->src[0]->ne[0]; // K - const int64_t ne01 = node->src[0]->ne[1]; // Cout - const int64_t ne02 = node->src[0]->ne[2]; // Cin - - const int64_t ne10 = node->src[1]->ne[0]; // L - const int64_t ne11 = node->src[1]->ne[1]; // Cin - - size_t cur = 0; - if (node->src[0]->type == GGML_TYPE_F16 && - node->src[1]->type == 
GGML_TYPE_F32) { - cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02; - cur += sizeof(ggml_fp16_t)*ne10*ne11; - } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { - cur += sizeof(float)*ne00*ne01*ne02; - cur += sizeof(float)*ne10*ne11; + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] + ); } else { GGML_ASSERT(false); } @@ -19796,6 +19401,9 @@ static enum ggml_opt_result ggml_opt_adam( // run the optimizer for (int t = 0; t < params.adam.n_iter; ++t) { + if (cancel) { + break; + } opt->iter = iter0 + t + 1; GGML_PRINT_DEBUG ("=== iter %d ===\n", t); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a1cedf0f8..804689fb7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -176,11 +176,11 @@ endif() # # test-grad0 -set(TEST_TARGET test-grad0) -add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml) -add_test(NAME ${TEST_TARGET} COMMAND $) -set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +#set(TEST_TARGET test-grad0) +#add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +#target_link_libraries(${TEST_TARGET} PRIVATE ggml) +#add_test(NAME ${TEST_TARGET} COMMAND $) +#set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") # # test-opt @@ -350,8 +350,8 @@ set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_ # # test-xpos -set(TEST_TARGET test-xpos) -add_executable(${TEST_TARGET} ${TEST_TARGET}.c) -target_link_libraries(${TEST_TARGET} PRIVATE ggml) -add_test(NAME ${TEST_TARGET} COMMAND $) -set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +#set(TEST_TARGET test-xpos) +#add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +#target_link_libraries(${TEST_TARGET} PRIVATE ggml) +#add_test(NAME ${TEST_TARGET} COMMAND $) +#set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") From c05714fbd0dd6213c07546001aa07df1c3b56f13 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 3 Oct 2023 15:15:25 +0200 Subject: [PATCH 02/23] move get_alignment from buffer to backend --- examples/gpt-2/main.cpp | 6 ++++-- include/ggml/ggml-backend.h | 3 ++- src/ggml-backend.c | 29 ++++++++++++----------------- src/ggml-cuda.cu | 12 ++++++------ 4 files changed, 24 insertions(+), 26 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 184eb8e9a..e9d20b522 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -833,7 +833,9 @@ int main(int argc, char ** argv) { struct ggml_allocr * allocr = NULL; // allocate the compute buffer { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + // alignment required by the backend + size_t align = ggml_backend_get_alignment(model.backend); + allocr = ggml_allocr_new_measure(align); // create the worst case graph for memory usage estimation int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); @@ -841,7 +843,7 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; + size_t mem_size = ggml_allocr_alloc_graph(allocr, gf); // recreate the allocator with the required memory ggml_allocr_free(allocr); diff --git a/include/ggml/ggml-backend.h 
b/include/ggml/ggml-backend.h index 17e5a38a7..c71f50225 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -14,7 +14,6 @@ extern "C" { struct ggml_backend_buffer_interface { void (*free_buffer) (ggml_backend_buffer_t buffer); - size_t (*get_alignment) (ggml_backend_buffer_t buffer); void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback @@ -49,6 +48,7 @@ extern "C" { // buffer allocation ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size); + size_t (*get_alignment)(ggml_backend_t backend); // tensor data access // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize @@ -82,6 +82,7 @@ extern "C" { static inline const char * ggml_backend_name(ggml_backend_t backend) { return backend->interface.get_name(backend); } static inline void ggml_backend_free(ggml_backend_t backend) { backend->interface.free(backend); } static inline ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { return backend->interface.alloc_buffer(backend, size); } + static inline size_t ggml_backend_get_alignment(ggml_backend_t backend) { return backend->interface.get_alignment(backend); } static inline void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { get_backend(tensor)->interface.set_tensor_async(get_backend(tensor), tensor, data, offset, size); } static inline void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { get_backend(tensor)->interface.get_tensor_async(get_backend(tensor), tensor, data, offset, size); } static inline void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { get_backend(tensor)->interface.set_tensor_async(get_backend(tensor), tensor, data, offset, size); get_backend(tensor)->interface.synchronize(get_backend(tensor)); } diff --git a/src/ggml-backend.c b/src/ggml-backend.c index d49c5e7a1..da0d9c639 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -34,10 +34,7 @@ void ggml_backend_buffer_free(struct ggml_backend_buffer * buffer) { } size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { - if (buffer->interface.get_alignment) { - return buffer->interface.get_alignment(buffer); - } - return 64; + return ggml_backend_get_alignment(buffer->backend); } void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { @@ -131,13 +128,6 @@ static void ggml_backend_cpu_free(ggml_backend_t backend) { free(backend); } -static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 - -static size_t ggml_backend_cpu_buffer_get_alignment(ggml_backend_buffer_t buffer) { - return TENSOR_ALIGNMENT; - UNUSED(buffer); -} - static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { return (void *)buffer->context; } @@ -149,29 +139,33 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { static struct ggml_backend_buffer_interface cpu_backend_buffer_interface = { /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_get_alignment, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .get_alloc_size = */ NULL, // 
defaults to ggml_nbytes - /* .init_tensor = */ NULL, - /* .free_tensor = */ NULL, + /* .init_tensor = */ NULL, // no initialization required + /* .free_tensor = */ NULL, // no cleanup required }; // for buffers from ptr, free is not called static struct ggml_backend_buffer_interface cpu_backend_buffer_interface_from_ptr = { - /* .free_buffer = */ NULL, - /* .get_alignment = */ ggml_backend_cpu_buffer_get_alignment, + /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .init_tensor = */ NULL, /* .free_tensor = */ NULL, }; +static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { - void * data = malloc(size + TENSOR_ALIGNMENT); + void * data = malloc(size + TENSOR_ALIGNMENT); // malloc may return an address that is not aligned return ggml_backend_buffer_init(cpu_backend_buffer_interface, backend, data, size); } +static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) { + return TENSOR_ALIGNMENT; + UNUSED(backend); +} + static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); @@ -270,6 +264,7 @@ static struct ggml_backend_interface cpu_backend_interface = { /* .get_name = */ ggml_backend_cpu_name, /* .free = */ ggml_backend_cpu_free, /* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_get_alignment, /* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async, /* .synchronize = */ ggml_backend_cpu_synchronize, diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 74b443e6c..9d596e15b 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7572,11 +7572,6 @@ static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { delete ctx; } -static size_t ggml_backend_cuda_buffer_get_alignment(ggml_backend_buffer_t buffer) { - return 128; - UNUSED(buffer); -} - static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; return ctx->device; @@ -7614,7 +7609,6 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g static struct ggml_backend_buffer_interface cuda_backend_buffer_interface = { /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, - /* .get_alignment = */ ggml_backend_cuda_buffer_get_alignment, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, @@ -7627,6 +7621,11 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe return ggml_backend_buffer_init(cuda_backend_buffer_interface, backend, ctx, size); } +static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) { + return 128; + UNUSED(backend); +} + static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(tensor->data != NULL && 
"tensor not allocated"); @@ -7733,6 +7732,7 @@ static ggml_backend_interface cuda_backend_interface = { /* .get_name = */ ggml_backend_cuda_name, /* .free = */ ggml_backend_cuda_free, /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer, + /* .get_alignment = */ ggml_backend_cuda_get_alignment, /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, /* .synchronize = */ ggml_backend_cuda_synchronize, From da82697cfd03cb58b78c089a3c5d74094592e679 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 3 Oct 2023 16:03:21 +0200 Subject: [PATCH 03/23] ggml-cuda : fix ggml_cuda_op_mul_mat_vec_q --- src/ggml-cuda.cu | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 9d596e15b..163708c1e 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7,8 +7,6 @@ #include #include -#define GGML_CUDA_FORCE_DMMV // FIXME: ggml_cuda_op_mul_mat_vec_q produces wrong results with GPT-2 - #if defined(GGML_USE_HIPBLAS) #include #include @@ -6733,7 +6731,8 @@ static void ggml_cuda_op_mul_mat( if (convert_src1_to_q8_1) { src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); - if (split && src1_on_device && src1_is_contiguous) { + // FIXME: why split only? src1 never gets quantized, breaks ggml-backend/GPT-2 + if (/*split &&*/ src1_on_device && src1_is_contiguous) { quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } @@ -7582,7 +7581,16 @@ static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buff int64_t row_high = ggml_nrows(tensor); int64_t nrows_split = row_high - row_low; - return ggml_nbytes_split(tensor, nrows_split); + size_t size = ggml_nbytes_split(tensor, nrows_split); + + int64_t ne0 = tensor->ne[0]; + + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) + * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + } + + return size; UNUSED(buffer); } @@ -7601,7 +7609,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g size_t size = ggml_backend_cuda_buffer_get_alloc_size(buffer, tensor); if (size > original_size && tensor->view_src == nullptr) { - CUDA_CHECK(cudaMemset((char *) tensor->data + original_size, 0, size - original_size)); + CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, size - original_size, g_cudaStreams[g_main_device][0])); } UNUSED(buffer); @@ -7629,7 +7637,7 @@ static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) { static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - //GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0])); @@ -7675,7 +7683,6 @@ static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_gr UNUSED(plan); } -#include static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_compute_params params = {}; params.type = GGML_TASK_COMPUTE; @@ -7708,17 +7715,19 @@ static void 
ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph cudaDeviceSynchronize(); std::vector tmp(ggml_nelements(node), 0.0f); cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost); - printf("\n%s (%s) (%s %s): ", node->name, ggml_op_name(node->op), + printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op), ggml_type_name(node->src[0]->type), - node->src[1] ? ggml_type_name(node->src[1]->type) : "none"); + node->src[1] ? ggml_type_name(node->src[1]->type) : "none", + node->src[0]->name, + node->src[1] ? node->src[1]->name : "none"); double sum = 0.0; double sq_sum = 0.0; for (int i = 0; i < ggml_nelements(node); i++) { - //printf("%f ", tmp[i]); + printf("%f ", tmp[i]); sum += tmp[i]; sq_sum += tmp[i]*tmp[i]; } - //printf("\n"); + printf("\n"); printf("sum: %f, ", sum); printf("sq_sum: %f\n", sq_sum); } From 3cf87a3e7f969147f2b4a19bce1075b2c719ee9a Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 3 Oct 2023 16:25:08 +0200 Subject: [PATCH 04/23] gpt-2 : better check for CPU backend when settings n_threads --- examples/gpt-2/main.cpp | 8 +++----- src/ggml-cuda.cu | 9 +++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index e9d20b522..87cdf9065 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -760,11 +760,9 @@ bool gpt2_eval( ggml_allocr_alloc_graph(allocr, gf); // run the computation -#ifndef GGML_USE_CUBLAS - // FIXME: the backend may be CPU even if CUDA is enabled - // if (model.backend.id == GGML_BACKEND_ID_CPU) - ggml_backend_cpu_set_n_threads(model.backend, n_threads); -#endif + if (strcmp(ggml_backend_name(model.backend), "CPU") == 0) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } ggml_backend_graph_compute(model.backend, gf); //if (n_past%100 == 0) { diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 163708c1e..87c44da8f 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -62,6 +62,7 @@ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaMemcpyKind hipMemcpyKind #define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags @@ -1576,7 +1577,7 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest } template -static __global__ void k_get_rows(const void * x, const int * y, dst_t * dst, const int ncols) { +static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) { const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2; const int row = blockDim.y*blockIdx.y + threadIdx.y; @@ -4586,7 +4587,7 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale template -static void get_rows_cuda(const void * x, const int * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { +static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); const dim3 block_nums(block_num_x, nrows, 1); @@ -5810,7 +5811,7 @@ static void ggml_cuda_op_repeat( GGML_ASSERT(nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); - // TODO: very inefficient, implement in a kernel + // TODO: very inefficient, implement in 
a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors for (int i3 = 0; i3 < nr3; i3++) { for (int k3 = 0; k3 < ne03; k3++) { for (int i2 = 0; i2 < nr2; i2++) { @@ -5847,7 +5848,7 @@ static void ggml_cuda_op_get_rows( const int ncols = src0->ne[0]; const int nrows = ggml_nelements(src1); - const int * src1_i32 = (const int *) src1_d; + const int32_t * src1_i32 = (const int32_t *) src1_d; switch (src0->type) { case GGML_TYPE_F16: From 319b4bc8563a9bbf983ae5f6487e78db1e0d8ca2 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 3 Oct 2023 22:11:06 +0200 Subject: [PATCH 05/23] .gitignore : add .clangd --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d7e11716a..a66ac17df 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ compile_commands.json CMakeSettings.json .vs/ .vscode/ +.clangd .exrc .cache @@ -32,4 +33,4 @@ zig-cache/ *.sw? -__pycache__/ \ No newline at end of file +__pycache__/ From b527d48cacdbeab7aed9412773d935eb6ad55c68 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 4 Oct 2023 15:17:29 +0200 Subject: [PATCH 06/23] merge master --- examples/CMakeLists.txt | 6 +- include/ggml/ggml.h | 12 + src/ggml-alloc.c | 4 - src/ggml.c | 793 ++++++++++++++++++++++++++++++---------- 4 files changed, 608 insertions(+), 207 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index c0201c131..e3404fb8b 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -19,11 +19,11 @@ target_link_libraries(common-ggml PRIVATE ggml) target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) add_subdirectory(gpt-2) -#add_subdirectory(gpt-j) +add_subdirectory(gpt-j) add_subdirectory(whisper) add_subdirectory(mnist) -#add_subdirectory(gpt-neox) -#add_subdirectory(dolly-v2) +add_subdirectory(gpt-neox) +add_subdirectory(dolly-v2) add_subdirectory(replit) add_subdirectory(mpt) add_subdirectory(starcoder) diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index db7cad0dc..a26b9119b 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -401,10 +401,14 @@ extern "C" { GGML_OP_CLAMP, GGML_OP_CONV_1D, GGML_OP_CONV_2D, + GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, + GGML_OP_CONV_1D_STAGE_0, // internal + GGML_OP_CONV_1D_STAGE_1, // internal + GGML_OP_UPSCALE, // nearest interpolate GGML_OP_FLASH_ATTN, @@ -1387,6 +1391,14 @@ extern "C" { int s, int d); + GGML_API struct ggml_tensor * ggml_conv_transpose_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0); + GGML_API struct ggml_tensor * ggml_conv_2d( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index afb4e10cf..44cb97481 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -585,7 +585,3 @@ size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { return alloc->max_size; } - -size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { - return alloc->max_size; -} diff --git a/src/ggml.c b/src/ggml.c index b1d11ba5c..aabe2e4df 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4081,12 +4081,16 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ALIBI", "CLAMP", "CONV_1D", + "CONV_TRANSPOSE_1D", "CONV_2D", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", + "CONV_1D_STAGE_0", + "CONV_1D_STAGE_1", + "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", @@ -4112,7 +4116,7 @@ static const char * 
GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); +static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -4163,12 +4167,16 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "alibi(x)", "clamp(x)", "conv_1d(x)", + "conv_transpose_1d(x)", "conv_2d(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", + "conv_1d_stage_0(x)", + "conv_1d_stage_1(x)", + "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", @@ -4194,7 +4202,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); +static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4223,7 +4231,10 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_DIAG_MASK_INF ] = true; p[GGML_OP_DIAG_MASK_ZERO ] = true; p[GGML_OP_CONV_1D ] = true; + p[GGML_OP_CONV_1D_STAGE_0 ] = true; + p[GGML_OP_CONV_1D_STAGE_1 ] = true; p[GGML_OP_CONV_2D ] = true; + p[GGML_OP_CONV_TRANSPOSE_1D ] = true; p[GGML_OP_CONV_TRANSPOSE_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -4939,8 +4950,8 @@ static struct ggml_tensor * ggml_new_tensor_impl( *result = (struct ggml_tensor) { /*.type =*/ type, - /*.backend =*/ GGML_BACKEND_CPU, - /*.buffer =*/ NULL, + /*.backend =*/ view_src ? view_src->backend : GGML_BACKEND_CPU, + /*.buffer =*/ view_src ? view_src->buffer : NULL, /*.n_dims =*/ n_dims, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, @@ -4973,11 +4984,6 @@ static struct ggml_tensor * ggml_new_tensor_impl( result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; } - if (view_src != NULL) { - result->backend = view_src->backend; - result->buffer = view_src->buffer; - } - ctx->n_objects++; return result; @@ -5792,7 +5798,7 @@ static struct ggml_tensor * ggml_mul_impl( bool inplace) { // TODO: support less-strict constraint // GGML_ASSERT(ggml_can_repeat(b, a)); - //GGML_ASSERT(ggml_can_repeat_rows(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); bool is_node = false; @@ -7509,14 +7515,17 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -GGML_API struct ggml_tensor * ggml_conv_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0) { - GGML_ASSERT(ggml_is_matrix(b)); +// im2col: [N, IC, IL] => [N, OL, IC*K] +// a: [OC,IC, K] +// b: [N, IC, IL] +// result: [N, OL, IC*K] +static struct ggml_tensor * ggml_conv_1d_stage_0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { GGML_ASSERT(a->ne[1] == b->ne[1]); bool is_node = false; @@ -7525,16 +7534,54 @@ GGML_API struct ggml_tensor * ggml_conv_1d( is_node = true; } + const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const int64_t ne[4] = { - ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), - a->ne[2], 1, 1, + a->ne[1] * a->ne[0], + OL, + b->ne[2], + 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); int32_t params[] = { s0, p0, d0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CONV_1D; + result->op = GGML_OP_CONV_1D_STAGE_0; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_conv_1d_stage_1 + +// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] +// a: [OC, IC, K] +// b: [N, OL, IC * K] +// result: [N, OC, OL] +static struct ggml_tensor * ggml_conv_1d_stage_1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + b->ne[1], + a->ne[2], + b->ne[2], + 1, + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_CONV_1D_STAGE_1; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7542,6 +7589,53 @@ GGML_API struct ggml_tensor * ggml_conv_1d( return result; } +// ggml_conv_1d + +GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0); + result = ggml_conv_1d_stage_1(ctx, a, result); + return result; +} + +// GGML_API struct ggml_tensor * ggml_conv_1d( +// struct ggml_context * ctx, +// struct ggml_tensor * a, +// struct ggml_tensor * b, +// int s0, +// int p0, +// int d0) { +// GGML_ASSERT(ggml_is_matrix(b)); +// GGML_ASSERT(a->ne[1] == b->ne[1]); +// bool is_node = false; + +// if (a->grad || b->grad) { +// GGML_ASSERT(false); // TODO: implement backward +// is_node = true; +// } + +// const int64_t ne[4] = { +// ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), +// a->ne[2], 1, 1, +// }; +// struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + +// int32_t params[] = { s0, p0, d0 }; +// ggml_set_op_params(result, params, sizeof(params)); + +// result->op = GGML_OP_CONV_1D; +// result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; +// result->src[0] = a; +// result->src[1] = b; + +// return result; +// } + // ggml_conv_1d_ph struct ggml_tensor* ggml_conv_1d_ph( @@ -7553,6 +7647,50 @@ struct ggml_tensor* ggml_conv_1d_ph( return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); } +// ggml_conv_transpose_1d + +static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; +} + +GGML_API struct ggml_tensor * ggml_conv_transpose_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + GGML_ASSERT(ggml_is_matrix(b)); + GGML_ASSERT(a->ne[2] == b->ne[1]); + GGML_ASSERT(a->ne[3] == 1); + + GGML_ASSERT(p0 == 0); + GGML_ASSERT(d0 == 1); + + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), + a->ne[1], b->ne[2], 1, + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + int32_t params[] = { s0, p0, d0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_TRANSPOSE_1D; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + // ggml_conv_2d struct ggml_tensor * ggml_conv_2d( @@ -13692,7 +13830,7 @@ static void ggml_compute_forward_rope_back( // ggml_compute_forward_conv_1d -static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( +static void ggml_compute_forward_conv_1d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -13710,42 +13848,33 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( const int nth = params->nth; const int nk = ne00; - const int nh = nk/2; - const int ew0 = ggml_up32(ne01); + // size of the convolution row - the kernel size unrolled across all input channels + const int ew0 = nk*ne01; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; - } - } - } - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + ggml_fp16_t * dst_data = wdata; - // prepare source data (src1) - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + for (int64_t i0 = 0; i0 < ne0; i0++) { + for (int64_t ik = 0; ik < nk; ik++) { + const int idx0 = i0*s0 + ik*d0 - p0; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - ggml_fp16_t * dst_data = wdata; - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); + if(!(idx0 < 0 || idx0 >= ne10)) { + dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]); + } } } } @@ -13758,7 +13887,7 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( } // total rows in dst - const int nr = ne02; + const int nr = ne2; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -13767,23 +13896,22 @@ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; ++i0) { - dst_data[i0] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - ggml_vec_dot_f16(ew0, &v, - (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0] += v; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + + for (int i0 = 0; i0 < ne0; i0++) { + ggml_vec_dot_f16(ew0, dst_data + i0, + (ggml_fp16_t *) ((char *) src0->data 
+ i1*nb02), + (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0); } } } } -static void ggml_compute_forward_conv_1d_s1_ph_f32( +static void ggml_compute_forward_conv_1d_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -13801,42 +13929,32 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32( const int nth = params->nth; const int nk = ne00; - const int nh = nk/2; - const int ew0 = ggml_up32(ne01); + const int ew0 = nk*ne01; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) - { - float * const wdata = (float *) params->wdata + 0; + float * const wdata = (float *) params->wdata + 0; - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i02*ew0*ne00; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; - } - } - } - } + for (int64_t i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; - // prepare source data (src1) - { - float * const wdata = (float *) params->wdata + ne02*ew0*ne00; + for (int64_t i0 = 0; i0 < ne0; i0++) { + for (int64_t ik = 0; ik < nk; ik++) { + const int idx0 = i0*s0 + ik*d0 - p0; - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - float * dst_data = wdata; - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = src[i10]; + if(!(idx0 < 0 || idx0 >= ne10)) { + dst_data[i0*ew0 + i11*nk + ik] = src[idx0]; + } } } } @@ -13858,35 +13976,225 @@ static void ggml_compute_forward_conv_1d_s1_ph_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; ++i0) { - dst_data[i0] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - ggml_vec_dot_f32(ew0, &v, - (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0] += v; + float * const wdata = (float *) params->wdata + 0; + + for (int i2 = 0; i2 < ne2; i2++) { + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1); + + for (int i0 = 0; i0 < ne0; i0++) { + ggml_vec_dot_f32(ew0, dst_data + i0, + (float *) ((char *) src0->data + i1*nb02), + (float *) wdata + i2*nb2 + i0*ew0); } } } } -static void ggml_compute_forward_conv_1d_s1_ph( +static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, + ggml_fp16_t * A, + ggml_fp16_t * B, + float * C, + const int ith, const int nth) { + // does not seem to make a difference + int64_t m0, m1, n0, n1; + // patches per thread + if (m > n) { + n0 = 0; + n1 = n; + + // total patches in dst + const int np = m; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + m0 = dp*ith; + m1 = MIN(m0 + dp, np); + } else { + m0 = 0; + m1 = m; + + // total patches in dst + const 
int np = n; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + n0 = dp*ith; + n1 = MIN(n0 + dp, np); + } + + // block-tiling attempt + int64_t blck_n = 16; + int64_t blck_m = 16; + + // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB + // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K); + // if (blck_size > 0) { + // blck_0 = 4; + // blck_1 = blck_size / blck_0; + // if (blck_1 < 0) { + // blck_1 = 1; + // } + // // blck_0 = (int64_t)sqrt(blck_size); + // // blck_1 = blck_0; + // } + // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1); + + for (int j = n0; j < n1; j+=blck_n) { + for (int i = m0; i < m1; i+=blck_m) { + // printf("i j k => %d %d %d\n", i, j, K); + for (int ii = i; ii < i + blck_m && ii < m1; ii++) { + for (int jj = j; jj < j + blck_n && jj < n1; jj++) { + ggml_vec_dot_f16(k, + C + ii*n + jj, + A + ii * k, + B + jj * k); + } + } + } + } +} + +// src0: kernel [OC, IC, K] +// src1: signal [N, IC, IL] +// dst: result [N, OL, IC*K] +static void ggml_compute_forward_conv_1d_stage_0_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - switch (src0->type) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = ne12; + const int64_t IC = ne11; + const int64_t IL = ne10; + + const int64_t K = ne00; + + const int64_t OL = ne1; + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IL] => [N, OL, IC*K] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t iol = 0; iol < OL; iol++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { + + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K] + const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL] + + for (int64_t ik = 0; ik < K; ik++) { + const int64_t iil = iol*s0 + ik*d0 - p0; + + if (!(iil < 0 || iil >= IL)) { + dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]); + } + } + } + } + } + } +} + +// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] +// src0: [OC, IC, K] +// src1: [N, OL, IC * K] +// result: [N, OC, OL] +static void ggml_compute_forward_conv_1d_stage_1_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne12; + const int 
OL = ne11; + + const int OC = ne02; + const int IC = ne01; + const int K = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OL; + int64_t k = IC * K; + + // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m, n] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + +static void ggml_compute_forward_conv_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_f32(params, src0, src1, dst); } break; default: { @@ -13895,7 +14203,43 @@ static void ggml_compute_forward_conv_1d_s1_ph( } } -static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( +static void ggml_compute_forward_conv_1d_stage_0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_1d_stage_1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch(src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_conv_transpose_1d + +static void ggml_compute_forward_conv_transpose_1d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -13912,43 +14256,38 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00; - const int nh = nk/2; - - const int ew0 = ggml_up32(ne01); + const int nk = ne00*ne01*ne02; - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) + // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + ggml_fp16_t * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; + dst_data[i00*ne02 + i02] = src[i00]; } } } } - // prepare source data (src1) + // permute source data (src1) from (L x Cin) to (Cin x L) { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; + ggml_fp16_t * 
dst_data = wdata; for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); - ggml_fp16_t * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); + dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]); } } } @@ -13960,8 +14299,10 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( return; } + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + // total rows in dst - const int nr = ne02; + const int nr = ne1; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -13970,23 +14311,26 @@ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata_src = wdata + nk; + for (int i1 = ir0; i1 < ir1; i1++) { float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; i0 += 2) { - dst_data[i0/2] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - ggml_vec_dot_f16(ew0, &v, - (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0/2] += v; + ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f16(ne02, &v, + (ggml_fp16_t *) wdata_src + i1n, + (ggml_fp16_t *) wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; } } } } -static void ggml_compute_forward_conv_1d_s2_ph_f32( +static void ggml_compute_forward_conv_transpose_1d_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14003,29 +14347,24 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( const int ith = params->ith; const int nth = params->nth; - const int nk = ne00; - const int nh = nk/2; - - const int ew0 = ggml_up32(ne01); + const int nk = ne00*ne01*ne02; - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); - // prepare kernel data (src0) + // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) { float * const wdata = (float *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i02*ew0*ne00; + float * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; + dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00]; } } } @@ -14033,13 +14372,13 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( // prepare source data (src1) { - float * const wdata = (float *) params->wdata + ne02*ew0*ne00; + float * const wdata = (float *) params->wdata + nk; + float * dst_data = wdata; for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); - float * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = src[i10]; + dst_data[i10*ne11 + i11] = src[i10]; } } } @@ -14051,8 +14390,10 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( return; } + const int32_t s0 = ((const 
int32_t*)(dst->op_params))[0]; + // total rows in dst - const int nr = ne02; + const int nr = ne1; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -14061,23 +14402,26 @@ static void ggml_compute_forward_conv_1d_s2_ph_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); + float * const wdata = (float *) params->wdata + 0; + float * const wdata_src = wdata + nk; + for (int i1 = ir0; i1 < ir1; i1++) { float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; i0 += 2) { - dst_data[i0/2] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - ggml_vec_dot_f32(ew0, &v, - (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0/2] += v; + float * wdata_kernel = wdata + i1*ne02*ne00; + for (int i10 = 0; i10 < ne10; i10++) { + const int i1n = i10*ne11; + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f32(ne02, &v, + wdata_src + i1n, + wdata_kernel + i00*ne02); + dst_data[i10*s0 + i00] += v; } } } } -static void ggml_compute_forward_conv_1d_s2_ph( +static void ggml_compute_forward_conv_transpose_1d( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -14085,11 +14429,11 @@ static void ggml_compute_forward_conv_1d_s2_ph( switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst); + ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst); } break; default: { @@ -14098,27 +14442,6 @@ static void ggml_compute_forward_conv_1d_s2_ph( } } -// ggml_compute_forward_conv_1d - -static void ggml_compute_forward_conv_1d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; - GGML_ASSERT(d0 == 1); // dilation not supported - GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported - if (s0 == 1) { - ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst); - } else if (s0 == 2) { - ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); - } else { - GGML_ASSERT(false); // only stride 1 and 2 supported - } -} - // ggml_compute_forward_conv_2d static void ggml_compute_forward_conv_2d_f16_f32( @@ -14161,20 +14484,22 @@ static void ggml_compute_forward_conv_2d_f16_f32( { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i12*nb12); - ggml_fp16_t * dst_data = wdata; - - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 = 0; ik0 < nk0; ik0++) { - const int idx0 = i0*s0 + ik0*d0 - p0; - const int idx1 = i1*s1 + ik1*d1 - p1; - - if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + for (int i13 = 0; i13 < ne13; i13++) { + for (int i12 = 0; i12 < ne12; i12++) { + const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); + ggml_fp16_t * dst_data = wdata + 
i13*(ne1*ne0*ew0); + + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + for (int ik1 = 0; ik1 < nk1; ik1++) { + for (int ik0 = 0; ik0 < nk0; ik0++) { + const int idx0 = i0*s0 + ik0*d0 - p0; + const int idx1 = i1*s1 + ik1*d1 - p1; + + if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { + dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = + GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + } } } } @@ -16457,6 +16782,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); } break; + case GGML_OP_CONV_1D_STAGE_0: + { + ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_1D_STAGE_1: + { + ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor); + } break; case GGML_OP_CONV_2D: { ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -17382,10 +17719,22 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CONV_1D_STAGE_0: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D_STAGE_1: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_CONV_2D: { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_CONV_TRANSPOSE_2D: { GGML_ASSERT(false); // TODO: not implemented @@ -18227,21 +18576,68 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { GGML_ASSERT(node->src[1]->ne[2] == 1); GGML_ASSERT(node->src[1]->ne[3] == 1); + const int64_t ne00 = node->src[0]->ne[0]; + const int64_t ne01 = node->src[0]->ne[1]; + const int64_t ne02 = node->src[0]->ne[2]; + + const int64_t ne10 = node->src[1]->ne[0]; + const int64_t ne11 = node->src[1]->ne[1]; + + const int64_t ne0 = node->ne[0]; + const int64_t ne1 = node->ne[1]; + const int64_t nk = ne00; + const int64_t ew0 = nk * ne01; + + UNUSED(ne02); + UNUSED(ne10); + UNUSED(ne11); + size_t cur = 0; - const int nk = node->src[0]->ne[0]; if (node->src[0]->type == GGML_TYPE_F16 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + - ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] - ); + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + - ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] - ); + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*(ne0*ne1*ew0); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_1D_STAGE_0: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_1D_STAGE_1: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); + + const int64_t ne00 = node->src[0]->ne[0]; // K + const int64_t ne01 = node->src[0]->ne[1]; // Cout + const int64_t 
ne02 = node->src[0]->ne[2]; // Cin + + const int64_t ne10 = node->src[1]->ne[0]; // L + const int64_t ne11 = node->src[1]->ne[1]; // Cin + + size_t cur = 0; + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02; + cur += sizeof(ggml_fp16_t)*ne10*ne11; + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur += sizeof(float)*ne00*ne01*ne02; + cur += sizeof(float)*ne10*ne11; } else { GGML_ASSERT(false); } @@ -19401,9 +19797,6 @@ static enum ggml_opt_result ggml_opt_adam( // run the optimizer for (int t = 0; t < params.adam.n_iter; ++t) { - if (cancel) { - break; - } opt->iter = iter0 + t + 1; GGML_PRINT_DEBUG ("=== iter %d ===\n", t); From abf2669e765cca1162e2b5fc3a7427da9e7e0d51 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 4 Oct 2023 15:23:09 +0200 Subject: [PATCH 07/23] restore tests/CMakeLists.txt --- tests/CMakeLists.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 804689fb7..a1cedf0f8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -176,11 +176,11 @@ endif() # # test-grad0 -#set(TEST_TARGET test-grad0) -#add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) -#target_link_libraries(${TEST_TARGET} PRIVATE ggml) -#add_test(NAME ${TEST_TARGET} COMMAND $) -#set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +set(TEST_TARGET test-grad0) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") # # test-opt @@ -350,8 +350,8 @@ set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_ # # test-xpos -#set(TEST_TARGET test-xpos) -#add_executable(${TEST_TARGET} ${TEST_TARGET}.c) -#target_link_libraries(${TEST_TARGET} PRIVATE ggml) -#add_test(NAME ${TEST_TARGET} COMMAND $) -#set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +set(TEST_TARGET test-xpos) +add_executable(${TEST_TARGET} ${TEST_TARGET}.c) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") From a1fd06c7c243b8350e0951f8fcc1716a9010bcf4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Oct 2023 15:50:06 +0300 Subject: [PATCH 08/23] ggml-backend : code style suggestions (#551) * ggml-backend : code style suggestions * ggml-backend : move ggml_backend and ggml_backend_buffer in the source file * ggml-backend : move structs back to header + rename type * ggml-backend : remove obsolete comment * fix leak in ggml_backend_buffer_free * ggml-backend : re-introduce typedefs as a declaration of intent --------- Co-authored-by: slaren --- .gitignore | 1 + examples/gpt-2/main.cpp | 4 ++ include/ggml/ggml-alloc.h | 12 ++-- include/ggml/ggml-backend.h | 118 +++++++++++++++++++++------------ include/ggml/ggml.h | 7 +- src/ggml-alloc.c | 4 +- src/ggml-backend.c | 128 ++++++++++++++++++++++++++++-------- src/ggml-cuda.cu | 41 ++++++------ 8 files changed, 216 insertions(+), 99 deletions(-) diff --git a/.gitignore b/.gitignore index a66ac17df..35c37674d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ build-sanitize-thread/ build-cov/ build-ci-debug/ build-ci-release/ 
+build-cublas/ out/ tmp/ models/ diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 87cdf9065..a046b19ea 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -75,9 +75,12 @@ struct gpt2_model { // struct ggml_context * ctx; + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer_w; ggml_backend_buffer_t buffer_kv; + std::map tensors; }; @@ -333,6 +336,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & // allocate buffer and tensors model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256); + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); ggml_allocr_alloc(alloc, model.memory_k); ggml_allocr_alloc(alloc, model.memory_v); diff --git a/include/ggml/ggml-alloc.h b/include/ggml/ggml-alloc.h index c87139491..e38758878 100644 --- a/include/ggml/ggml-alloc.h +++ b/include/ggml/ggml-alloc.h @@ -16,18 +16,18 @@ GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_bu // you should call this if your graph are optimized to execute out-of-order GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n); -GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); -GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); -GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); -GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor); +GGML_API void ggml_allocr_free (struct ggml_allocr * alloc); +GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc); +GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc); +GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor); GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); -GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc); +GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc); + GGML_API size_t ggml_allocr_alloc_graph_n( struct ggml_allocr * alloc, struct ggml_cgraph ** graphs, int n_graphs, struct ggml_tensor *** inputs, struct ggml_tensor *** outputs); - #ifdef __cplusplus } #endif diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index c71f50225..96a1ab201 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -5,55 +5,74 @@ #ifdef __cplusplus extern "C" { #endif - typedef struct ggml_backend_s * ggml_backend_t; - - // backend buffer + struct ggml_backend; struct ggml_backend_buffer; + + // type-erased backend-specific types / wrappers + typedef void * ggml_backend_context_t; + typedef void * ggml_backend_graph_plan_t; + typedef void * ggml_backend_buffer_context_t; + + // avoid accessing internals of these types + typedef struct ggml_backend * ggml_backend_t; typedef struct ggml_backend_buffer * ggml_backend_buffer_t; - typedef void * ggml_buffer_context_t; - struct ggml_backend_buffer_interface { + // + // backend buffer + // + + struct ggml_backend_buffer_i { void (*free_buffer) (ggml_backend_buffer_t buffer); void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback - }; + // TODO: hide behind API struct ggml_backend_buffer { - struct 
ggml_backend_buffer_interface interface; - ggml_backend_t backend; - ggml_buffer_context_t context; + struct ggml_backend_buffer_i interface; + + ggml_backend_t backend; + ggml_backend_buffer_context_t context; + size_t size; }; // backend buffer functions - GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(struct ggml_backend_buffer_interface interface, ggml_backend_t backend, ggml_buffer_context_t context, size_t size); - GGML_API void ggml_backend_buffer_free(ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer); - GGML_API void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer); + GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( + struct ggml_backend * backend, + struct ggml_backend_buffer_i interface, + ggml_backend_buffer_context_t context, + size_t size); + + GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_free_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + // // backend - typedef void * ggml_backend_context_t; - typedef void * ggml_graph_plan_t; + // - struct ggml_backend_interface { + struct ggml_backend_i { const char * (*get_name)(ggml_backend_t backend); void (*free)(ggml_backend_t backend); // buffer allocation ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size); - size_t (*get_alignment)(ggml_backend_t backend); + + // get buffer alignment + size_t (*get_alignment)(ggml_backend_t backend); // tensor data access // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize - void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); void (*synchronize) (ggml_backend_t backend); // (optional) copy tensor between different backends, allow for single-copy tranfers @@ -61,9 +80,10 @@ extern "C" { void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // compute graph with a plan - ggml_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); - void (*graph_plan_free) (ggml_backend_t backend, ggml_graph_plan_t plan); - void (*graph_plan_compute)(ggml_backend_t backend, ggml_graph_plan_t plan); + ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + void (*graph_plan_free) (ggml_backend_t backend, 
ggml_backend_graph_plan_t plan); + void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + // compute graph without a plan void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); @@ -71,35 +91,49 @@ extern "C" { bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); }; - struct ggml_backend_s { - struct ggml_backend_interface interface; + // TODO: hide behind API + struct ggml_backend { + struct ggml_backend_i interface; + ggml_backend_context_t context; }; // backend helper functions - static inline ggml_backend_t get_backend(const struct ggml_tensor * tensor) { return tensor->buffer->backend; } - - static inline const char * ggml_backend_name(ggml_backend_t backend) { return backend->interface.get_name(backend); } - static inline void ggml_backend_free(ggml_backend_t backend) { backend->interface.free(backend); } - static inline ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { return backend->interface.alloc_buffer(backend, size); } - static inline size_t ggml_backend_get_alignment(ggml_backend_t backend) { return backend->interface.get_alignment(backend); } - static inline void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { get_backend(tensor)->interface.set_tensor_async(get_backend(tensor), tensor, data, offset, size); } - static inline void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { get_backend(tensor)->interface.get_tensor_async(get_backend(tensor), tensor, data, offset, size); } - static inline void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { get_backend(tensor)->interface.set_tensor_async(get_backend(tensor), tensor, data, offset, size); get_backend(tensor)->interface.synchronize(get_backend(tensor)); } - static inline void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { get_backend(tensor)->interface.get_tensor_async(get_backend(tensor), tensor, data, offset, size); get_backend(tensor)->interface.synchronize(get_backend(tensor)); } - static inline void ggml_backend_synchronize(ggml_backend_t backend) { backend->interface.synchronize(backend); } - static inline ggml_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->interface.graph_plan_create(backend, cgraph); } - static inline void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_free(backend, plan); } - static inline void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_compute(backend, plan); } - static inline void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { backend->interface.graph_compute(backend, cgraph); } - static inline bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { return backend->interface.supports_op(backend, op); } + GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor); + + GGML_API const char * ggml_backend_name(ggml_backend_t backend); + GGML_API void ggml_backend_free(ggml_backend_t backend); + + GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size); + + GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend); + + GGML_API void 
ggml_backend_tensor_set_async( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + + GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + + GGML_API void ggml_backend_synchronize(ggml_backend_t backend); + + GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph); + + GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); + // // CPU backend + // + GGML_API ggml_backend_t ggml_backend_cpu_init(void); + GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); + GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); /////////////////////////// diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index a26b9119b..5e7f39dc4 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -326,7 +326,7 @@ extern "C" { GGML_TYPE_COUNT, }; - enum ggml_backend { + enum ggml_backend_type { GGML_BACKEND_CPU = 0, GGML_BACKEND_GPU = 10, GGML_BACKEND_GPU_SPLIT = 20, @@ -479,8 +479,9 @@ extern "C" { // n-dimensional tensor struct ggml_tensor { - enum ggml_type type; - enum ggml_backend backend; + enum ggml_type type; + enum ggml_backend_type backend; + struct ggml_backend_buffer * buffer; int n_dims; diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index 44cb97481..3f53c4c82 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -62,7 +62,7 @@ struct free_block { #define MAX_FREE_BLOCKS 256 struct ggml_allocr { - ggml_backend_buffer_t buffer; + struct ggml_backend_buffer * buffer; bool buffer_owned; void * data; size_t alignment; @@ -265,7 +265,7 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) { alloc->n_free_blocks = 1; size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; - alloc->free_blocks[0].size = alloc->buffer->size - align_offset; + alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; } struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) { diff --git a/src/ggml-backend.c b/src/ggml-backend.c index da0d9c639..9e5dc8c9a 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -1,19 +1,24 @@ #include "ggml-backend.h" #include "ggml-alloc.h" + #include #include #include #include #include -#define UNUSED(x) (void)(x) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define UNUSED GGML_UNUSED +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) // backend buffer -struct ggml_backend_buffer * ggml_backend_buffer_init(struct ggml_backend_buffer_interface interface, ggml_backend_t backend, ggml_buffer_context_t context, size_t size) { - struct ggml_backend_buffer * buffer = malloc(sizeof(struct ggml_backend_buffer)); +ggml_backend_buffer_t ggml_backend_buffer_init( + struct ggml_backend * backend, + struct ggml_backend_buffer_i interface, + ggml_backend_buffer_context_t context, + size_t size) { + ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); GGML_ASSERT(interface.get_base != NULL); @@ -27,10 +32,11 @@ struct ggml_backend_buffer * ggml_backend_buffer_init(struct ggml_backend_buffer return buffer; } -void ggml_backend_buffer_free(struct ggml_backend_buffer * buffer) { +void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer->interface.free_buffer != NULL) { buffer->interface.free_buffer(buffer); } + free(buffer); } size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { @@ -41,6 +47,10 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { return buffer->interface.get_base(buffer); } +size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { + return buffer->size; +} + size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { if (buffer->interface.get_alloc_size) { return buffer->interface.get_alloc_size(buffer, tensor); @@ -60,6 +70,70 @@ void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_t } } +// backend + +ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) { + return tensor->buffer->backend; +} + +const char * ggml_backend_name(ggml_backend_t backend) { + return backend->interface.get_name(backend); +} + +void ggml_backend_free(ggml_backend_t backend) { + backend->interface.free(backend); +} + +ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { + return backend->interface.alloc_buffer(backend, size); +} + +size_t ggml_backend_get_alignment(ggml_backend_t backend) { + return backend->interface.get_alignment(backend); +} + +void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->interface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); +} + +void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->interface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); +} + +void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->interface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); + ggml_get_backend(tensor)->interface.synchronize(ggml_get_backend(tensor)); +} + +void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->interface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); + ggml_get_backend(tensor)->interface.synchronize(ggml_get_backend(tensor)); +} + +void ggml_backend_synchronize(ggml_backend_t backend) { + backend->interface.synchronize(backend); +} + +ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + return backend->interface.graph_plan_create(backend, cgraph); +} + +void ggml_backend_graph_plan_free(ggml_backend_t backend, 
ggml_backend_graph_plan_t plan) { + backend->interface.graph_plan_free(backend, plan); +} + +void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + backend->interface.graph_plan_compute(backend, plan); +} + +void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + backend->interface.graph_compute(backend, cgraph); +} + +bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return backend->interface.supports_op(backend, op); +} + // backend copy static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { @@ -90,10 +164,10 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst // TODO: allow backends to support copy to/from same backend - if (get_backend(dst)->interface.cpy_tensor_from != NULL) { - get_backend(dst)->interface.cpy_tensor_from(get_backend(dst)->context, src, dst); - } else if (get_backend(src)->interface.cpy_tensor_to != NULL) { - get_backend(src)->interface.cpy_tensor_to(get_backend(src)->context, src, dst); + if (ggml_get_backend(dst)->interface.cpy_tensor_from != NULL) { + ggml_get_backend(dst)->interface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst); + } else if (ggml_get_backend(src)->interface.cpy_tensor_to != NULL) { + ggml_get_backend(src)->interface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst); } else { // shouldn't be hit when copying from/to CPU #ifndef NDEBUG @@ -137,7 +211,7 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { UNUSED(buffer); } -static struct ggml_backend_buffer_interface cpu_backend_buffer_interface = { +static struct ggml_backend_buffer_i cpu_backend_buffer_i = { /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -146,7 +220,7 @@ static struct ggml_backend_buffer_interface cpu_backend_buffer_interface = { }; // for buffers from ptr, free is not called -static struct ggml_backend_buffer_interface cpu_backend_buffer_interface_from_ptr = { +static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -156,9 +230,10 @@ static struct ggml_backend_buffer_interface cpu_backend_buffer_interface_from_pt static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 -static struct ggml_backend_buffer * ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { +static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { void * data = malloc(size + TENSOR_ALIGNMENT); // malloc may return an address that is not aligned - return ggml_backend_buffer_init(cpu_backend_buffer_interface, backend, data, size); + // TODO: maybe use GGML_ALIGNED_MALLOC? 
+ return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size); } static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) { @@ -201,15 +276,15 @@ static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_t UNUSED(backend); } -struct ggml_backend_cpu_plan { +struct ggml_backend_plan_cpu { struct ggml_cplan cplan; struct ggml_cgraph cgraph; }; -static ggml_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_backend_cpu_plan * cpu_plan = malloc(sizeof(struct ggml_backend_cpu_plan)); + struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); cpu_plan->cgraph = *cgraph; @@ -221,8 +296,8 @@ static ggml_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backe return cpu_plan; } -static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_graph_plan_t plan) { - struct ggml_backend_cpu_plan * cpu_plan = (struct ggml_backend_cpu_plan *)plan; +static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; free(cpu_plan->cplan.work_data); free(cpu_plan); @@ -230,8 +305,8 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_graph_ UNUSED(backend); } -static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_graph_plan_t plan) { - struct ggml_backend_cpu_plan * cpu_plan = (struct ggml_backend_cpu_plan *)plan; +static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); @@ -260,7 +335,7 @@ static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct gg UNUSED(op); } -static struct ggml_backend_interface cpu_backend_interface = { +static struct ggml_backend_i cpu_backend_i = { /* .get_name = */ ggml_backend_cpu_name, /* .free = */ ggml_backend_cpu_free, /* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer, @@ -279,14 +354,15 @@ static struct ggml_backend_interface cpu_backend_interface = { ggml_backend_t ggml_backend_cpu_init(void) { struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->work_data = NULL; ctx->work_size = 0; - ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend_s)); + ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); - *cpu_backend = (struct ggml_backend_s) { - /* .interface = */ cpu_backend_interface, + *cpu_backend = (struct ggml_backend) { + /* .interface = */ cpu_backend_i, /* .context = */ ctx }; return cpu_backend; @@ -297,10 +373,10 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } -struct ggml_backend_buffer * ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { +ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { // TODO: NULL backend? 
// TODO: no free - return ggml_backend_buffer_init(cpu_backend_buffer_interface_from_ptr, NULL, ptr, size); + return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size); } #if 0 diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 87c44da8f..d65e2143e 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7147,7 +7147,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { const size_t nb1 = tensor->nb[1]; - ggml_backend backend = tensor->backend; + ggml_backend_type backend = tensor->backend; ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); @@ -7525,9 +7525,9 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des // backend interface -#define UNUSED(x) (void)(x) +#define UNUSED GGML_UNUSED -struct ggml_backend_cuda_context { +struct ggml_backend_context_cuda { }; static const char * ggml_backend_cuda_name(ggml_backend_t backend) { @@ -7537,18 +7537,18 @@ static const char * ggml_backend_cuda_name(ggml_backend_t backend) { } static void ggml_backend_cuda_free(ggml_backend_t backend) { - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; delete cuda_ctx; delete backend; } -struct ggml_cuda_buffer_context { +struct ggml_backend_buffer_context_cuda { void * device; ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; size_t temp_tensor_extra_index = 0; - ~ggml_cuda_buffer_context() { + ~ggml_backend_buffer_context_cuda() { delete[] temp_tensor_extras; } @@ -7567,13 +7567,13 @@ struct ggml_cuda_buffer_context { }; static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; CUDA_CHECK(cudaFree(ctx->device)); delete ctx; } static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; return ctx->device; } @@ -7597,7 +7597,7 @@ static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buff } static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - ggml_cuda_buffer_context * ctx = (ggml_cuda_buffer_context *)buffer->context; + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = tensor->data; @@ -7616,7 +7616,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g UNUSED(buffer); } -static struct ggml_backend_buffer_interface cuda_backend_buffer_interface = { +static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size, @@ -7625,9 +7625,9 @@ static struct ggml_backend_buffer_interface cuda_backend_buffer_interface = { }; static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) { - ggml_cuda_buffer_context * ctx = new ggml_cuda_buffer_context; + ggml_backend_buffer_context_cuda * ctx = new 
ggml_backend_buffer_context_cuda; CUDA_CHECK(cudaMalloc(&ctx->device, size)); - return ggml_backend_buffer_init(cuda_backend_buffer_interface, backend, ctx, size); + return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size); } static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) { @@ -7661,7 +7661,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static ggml_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { GGML_ASSERT(!"not implemented"); return nullptr; @@ -7670,14 +7670,14 @@ static ggml_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t back UNUSED(cgraph); } -static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_graph_plan_t plan) { +static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(!"not implemented"); UNUSED(backend); UNUSED(plan); } -static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_graph_plan_t plan) { +static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(!"not implemented"); UNUSED(backend); @@ -7738,7 +7738,7 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph UNUSED(backend); } -static ggml_backend_interface cuda_backend_interface = { +static ggml_backend_i cuda_backend_i = { /* .get_name = */ ggml_backend_cuda_name, /* .free = */ ggml_backend_cuda_free, /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer, @@ -7758,12 +7758,13 @@ static ggml_backend_interface cuda_backend_interface = { ggml_backend_t ggml_backend_cuda_init() { ggml_init_cublas(); // TODO: remove from ggml.c - ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context; + ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda; - ggml_backend_t cuda_backend = new ggml_backend_s; - *cuda_backend = (ggml_backend_s){ - /* .interface = */ cuda_backend_interface, + ggml_backend_t cuda_backend = new ggml_backend; + *cuda_backend = (ggml_backend){ + /* .interface = */ cuda_backend_i, /* .context = */ ctx }; + return cuda_backend; } From c4cd2d74a2cbf87010f7f7d708885663f05ce1fb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Oct 2023 15:53:52 +0300 Subject: [PATCH 09/23] gpt-2 : add comments about KV allocation --- examples/gpt-2/main.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index a046b19ea..4b8f20321 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -334,13 +334,21 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - // allocate buffer and tensors + // create a backend buffer (can be in host or device memory) model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256); - ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); - ggml_allocr_alloc(alloc, model.memory_k); - ggml_allocr_alloc(alloc, model.memory_v); - ggml_allocr_free(alloc); + // allocate the tensors into the backend buffer + // TODO: better API for this + { + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); + + // this updates the pointers in the tensors to point to the 
correct location in the buffer + // this is necessary since the ggml_context is .no_alloc == true + ggml_allocr_alloc(alloc, model.memory_k); + ggml_allocr_alloc(alloc, model.memory_v); + + ggml_allocr_free(alloc); + } } // load weights From d8b3efc794c6776be9c981de3e62cbe56f1376db Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 5 Oct 2023 14:56:44 +0200 Subject: [PATCH 10/23] add ggml_backend_is_cpu --- examples/gpt-2/main.cpp | 18 +++++++++++------- include/ggml/ggml-backend.h | 2 ++ src/ggml-backend.c | 4 ++++ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 4b8f20321..e7bab0ba1 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -415,13 +415,17 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & return false; } - // read into a temporary buffer first, then copy to the tensor - // TODO: read directly into the tensor if the backend is CPU - read_buf.resize(ggml_nbytes(tensor)); - fin.read(read_buf.data(), ggml_nbytes(tensor)); - ggml_allocr_alloc(alloc, tensor); - ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); + + if (ggml_backend_is_cpu(model.backend)) { + // for the CPU backend, we can read directly into the tensor + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(ggml_nbytes(tensor)); + fin.read(read_buf.data(), ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); + } // GPT-2 models share the WTE tensor as the LM head if (name == "model/wte" && has_lm_head == false) { @@ -772,7 +776,7 @@ bool gpt2_eval( ggml_allocr_alloc_graph(allocr, gf); // run the computation - if (strcmp(ggml_backend_name(model.backend), "CPU") == 0) { + if (ggml_backend_is_cpu(model.backend)) { ggml_backend_cpu_set_n_threads(model.backend, n_threads); } ggml_backend_graph_compute(model.backend, gf); diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 96a1ab201..606ea5e4d 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -132,6 +132,8 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); + GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); + GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); diff --git a/src/ggml-backend.c b/src/ggml-backend.c index 9e5dc8c9a..8e3628a2c 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -368,6 +368,10 @@ ggml_backend_t ggml_backend_cpu_init(void) { return cpu_backend; } +bool ggml_backend_is_cpu(ggml_backend_t backend) { + return backend->interface.get_name == ggml_backend_cpu_name; +} + void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; ctx->n_threads = n_threads; From 3dbc43a403fa799aec4fc6049be079d98aa0a0af Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 5 Oct 2023 15:07:42 +0200 Subject: [PATCH 11/23] add backend check to ggml_backend_cpu_set_n_threads --- src/ggml-backend.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ggml-backend.c b/src/ggml-backend.c index 8e3628a2c..f9c58aeef 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -373,13 +373,14 @@ bool ggml_backend_is_cpu(ggml_backend_t backend) { } void 
ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; ctx->n_threads = n_threads; } ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { // TODO: NULL backend? - // TODO: no free return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size); } From b74ffd5397f7905b4b38c99d06b6c845f773a06b Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 5 Oct 2023 16:13:24 +0200 Subject: [PATCH 12/23] backend cpu: fix buffer alignment --- src/ggml-backend.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ggml-backend.c b/src/ggml-backend.c index f9c58aeef..c5bc03280 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -231,8 +231,9 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { - void * data = malloc(size + TENSOR_ALIGNMENT); // malloc may return an address that is not aligned - // TODO: maybe use GGML_ALIGNED_MALLOC? + size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned + void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? + return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size); } From b4ec9787e256b320cf8892b77e358d40877ace98 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 01:38:45 +0200 Subject: [PATCH 13/23] fix CUDA_ARCHITECTURES for mmq --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 95f91e331..bcfb4b23b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -306,7 +306,7 @@ endif() if (GGML_CUDA_SOURCES) message(STATUS "GGML CUDA sources found, configuring CUDA architecture") - set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "52;61") + set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "52;61;70") set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") if (NOT MSVC) target_link_libraries(ggml PUBLIC stdc++) From 25ce18aecb32a4a5078839f77b70799ba323c7c8 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 01:46:46 +0200 Subject: [PATCH 14/23] ggml-alloc : better handle view initialization --- src/ggml-alloc.c | 35 ++++++++++++++++++++++++----------- src/ggml.c | 4 ++-- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index 3f53c4c82..bb027e2f9 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -379,7 +379,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) { case GGML_OP_ROPE: case GGML_OP_RMS_NORM: case GGML_OP_SOFT_MAX: - case GGML_OP_CONT: return true; default: @@ -387,14 +386,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) { } } +static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) { + assert(view->view_src != NULL && view->view_src->data != NULL); + view->backend = view->view_src->backend; + view->buffer = view->view_src->buffer; + view->data = (char *)view->view_src->data + view->view_offs; + + // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend + // due to the ggml_tensor_extra_gpu ring buffer overwriting the with the KV cache extras + assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == 
alloc->buffer->backend); + ggml_backend_buffer_init_tensor(alloc->buffer, view); +} + static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) { struct hash_node * ht = alloc->hash_table; if (node->data == NULL) { if (ggml_is_view(node)) { - assert(node->view_src->data != NULL); - node->data = (char *)node->view_src->data + node->view_offs; - node->buffer = node->view_src->buffer; - ggml_backend_buffer_init_tensor(alloc->buffer, node); // TODO: change to init_view + init_view(alloc, node); } else { // see if we can reuse a parent's buffer (inplace) if (ggml_op_can_inplace(node->op)) { @@ -422,17 +430,15 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); - node->data = parent->data; - node->buffer = parent->buffer; - ggml_backend_buffer_init_tensor(alloc->buffer, node); // TODO: change to init_view + node->view_src = parent; + init_view(alloc, node); return; } } else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); - node->data = parent->data; - node->buffer = parent->buffer; - ggml_backend_buffer_init_tensor(alloc->buffer, node); // TODO: change to init_view + node->view_src = parent; + init_view(alloc, node); return; } } @@ -461,6 +467,10 @@ size_t ggml_allocr_alloc_graph_n( if (ggml_is_view(node)) { struct ggml_tensor * view_src = node->view_src; hash_get(ht, view_src)->n_views += 1; + if (node->buffer == NULL && node->data != NULL) { + // view of a pre-allocated tensor, didn't call init_view() yet + init_view(alloc, node); + } } for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -469,6 +479,9 @@ size_t ggml_allocr_alloc_graph_n( break; } hash_get(ht, parent)->n_children += 1; + if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { + init_view(alloc, parent); + } } } } diff --git a/src/ggml.c b/src/ggml.c index aabe2e4df..b606d7cc3 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4950,8 +4950,8 @@ static struct ggml_tensor * ggml_new_tensor_impl( *result = (struct ggml_tensor) { /*.type =*/ type, - /*.backend =*/ view_src ? view_src->backend : GGML_BACKEND_CPU, - /*.buffer =*/ view_src ? 
view_src->buffer : NULL, + /*.backend =*/ GGML_BACKEND_CPU, + /*.buffer =*/ NULL, /*.n_dims =*/ n_dims, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, From b42e19c4de695fe11f225e0d826a5cbd533d23e1 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 01:47:59 +0200 Subject: [PATCH 15/23] ggml-cuda : fix padding clearing --- src/ggml-alloc.c | 2 +- src/ggml-cuda.cu | 47 ++++++++++++++++++++++++++++------------------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index bb027e2f9..a749c810d 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -393,7 +393,7 @@ static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) { view->data = (char *)view->view_src->data + view->view_offs; // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend - // due to the ggml_tensor_extra_gpu ring buffer overwriting the with the KV cache extras + // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == alloc->buffer->backend); ggml_backend_buffer_init_tensor(alloc->buffer, view); } diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index d65e2143e..f21cb1a1c 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -7586,9 +7586,11 @@ static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buff int64_t ne0 = tensor->ne[0]; - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) - * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + if (ggml_is_quantized(tensor->type)) { + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) + * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + } } return size; @@ -7598,6 +7600,14 @@ static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buff static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + if (tensor->view_src != NULL && tensor->view_offs == 0) { + assert(tensor->view_src->buffer->backend == buffer->backend); + tensor->backend = tensor->view_src->backend; + tensor->extra = tensor->view_src->extra; + return; + } + ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = tensor->data; @@ -7605,12 +7615,18 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g tensor->backend = GGML_BACKEND_GPU; tensor->extra = extra; - // initialize padding to 0 to avoid possible NaN values - size_t original_size = ggml_nbytes(tensor); - size_t size = ggml_backend_cuda_buffer_get_alloc_size(buffer, tensor); + if (ggml_is_quantized(tensor->type)) { + // initialize padding to 0 to avoid possible NaN values + int64_t row_low = 0; + int64_t row_high = ggml_nrows(tensor); + int64_t nrows_split = row_high - row_low; + + size_t original_size = ggml_nbytes_split(tensor, nrows_split); + size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor); - if (size > original_size && tensor->view_src == nullptr) { - CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, size - original_size, g_cudaStreams[g_main_device][0])); + if (padded_size > original_size && tensor->view_src == nullptr) { + CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, 
g_cudaStreams[g_main_device][0])); + } } UNUSED(buffer); @@ -7690,18 +7706,11 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - // views of allocated tensors don't call init_tensor, handle them here - // TODO: handle in ggml-alloc - if (node->extra == nullptr) { - GGML_ASSERT(node->view_src != nullptr); - GGML_ASSERT(node->view_src->backend == GGML_BACKEND_GPU); - ggml_backend_cuda_buffer_init_tensor(node->buffer, node); - } + + assert(node->backend == GGML_BACKEND_GPU); for (int j = 0; j < GGML_MAX_SRC; j++) { - if (node->src[j] != nullptr && node->src[j]->extra == nullptr) { - GGML_ASSERT(node->src[j]->view_src != nullptr); - GGML_ASSERT(node->src[j]->view_src->backend == GGML_BACKEND_GPU); - ggml_backend_cuda_buffer_init_tensor(node->src[j]->buffer, node->src[j]); + if (node->src[j] != nullptr) { + assert(node->src[j]->backend == GGML_BACKEND_GPU); } } From 94b05299b117142817f1f6cca1abea8e1133663b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 10:51:58 +0300 Subject: [PATCH 16/23] ggml-backend : metal (#552) * ggml-backend : metal (WIP) * ggml-backend : metal (adapt CPU backend) * ggml-backend : working metal * ggml-backend : clean-up metal implementation * ggml-backend : add ggml_backend_is_metal() --- examples/gpt-2/CMakeLists.txt | 5 ++ examples/gpt-2/main.cpp | 55 +++++++++----- include/ggml/ggml-backend.h | 7 +- src/CMakeLists.txt | 2 +- src/ggml-backend.c | 16 +++- src/ggml-cuda.h | 1 - src/ggml-metal.h | 17 ++++- src/ggml-metal.m | 135 +++++++++++++++++++++++++++++++++- 8 files changed, 210 insertions(+), 28 deletions(-) diff --git a/examples/gpt-2/CMakeLists.txt b/examples/gpt-2/CMakeLists.txt index 2307a7dd9..6ddada061 100644 --- a/examples/gpt-2/CMakeLists.txt +++ b/examples/gpt-2/CMakeLists.txt @@ -18,6 +18,11 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) if (GGML_CUBLAS) add_compile_definitions(GGML_USE_CUBLAS) endif() + if (GGML_CLBLAST) add_compile_definitions(GGML_USE_CLBLAST) endif() + +if (GGML_METAL) + add_compile_definitions(GGML_USE_METAL) +endif() diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index e7bab0ba1..25725a1d1 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -6,6 +6,10 @@ #include "ggml-cuda.h" #endif +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + #include "common.h" #include "common-ggml.h" @@ -22,6 +26,13 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} + // default hparams (GPT-2 117M) struct gpt2_hparams { int32_t n_vocab = 50257; @@ -234,6 +245,17 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & } #endif +#ifdef GGML_USE_METAL + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using Metal backend\n", __func__); + ggml_metal_log_set_callback(ggml_log_callback_default, nullptr); + model.backend = ggml_backend_metal_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } + } +#endif + if (!model.backend) { // fallback to CPU backend fprintf(stderr, "%s: using CPU backend\n", __func__); @@ -521,9 +543,8 @@ struct ggml_cgraph * gpt2_graph( // [ 768, N] cur = ggml_add(ctx0, ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), - cur), - 
//ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + cur, + model.layers[il].ln_1_g), model.layers[il].ln_1_b); } @@ -541,8 +562,8 @@ struct ggml_cgraph * gpt2_graph( cur); cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), - cur); + cur, + model.layers[il].c_attn_attn_b); } // self-attention @@ -649,8 +670,8 @@ struct ggml_cgraph * gpt2_graph( cur); cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), - cur); + cur, + model.layers[il].c_attn_proj_b); } // add the input @@ -668,9 +689,8 @@ struct ggml_cgraph * gpt2_graph( // [ 768, N] cur = ggml_add(ctx0, ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), - cur), - //ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + cur, + model.layers[il].ln_2_g), model.layers[il].ln_2_b); } @@ -687,8 +707,8 @@ struct ggml_cgraph * gpt2_graph( cur); cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), - cur); + cur, + model.layers[il].c_mlp_fc_b); // GELU activation // [3072, N] @@ -707,8 +727,8 @@ struct ggml_cgraph * gpt2_graph( cur); cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), - cur); + cur, + model.layers[il].c_mlp_proj_b); } // input for next layer @@ -724,9 +744,8 @@ struct ggml_cgraph * gpt2_graph( // [ 768, N] inpL = ggml_add(ctx0, ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), - inpL), - //ggml_repeat(ctx0, model.ln_f_b, inpL)); + inpL, + model.ln_f_g), model.ln_f_b); } @@ -778,6 +797,8 @@ bool gpt2_eval( // run the computation if (ggml_backend_is_cpu(model.backend)) { ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } else if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_threads(model.backend, n_threads); } ggml_backend_graph_compute(model.backend, gf); diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 606ea5e4d..36457e991 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -132,14 +132,17 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); - GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); - GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); /////////////////////////// + // TODO: we should probably do something better here + GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API bool ggml_backend_is_cuda (ggml_backend_t backend); + GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); + #if 0 // graph splitting #define GGML_MAX_SPLITS 200 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bcfb4b23b..b225597ed 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -229,7 +229,7 @@ if (GGML_METAL) set(GGML_METAL_SOURCES ggml-metal.m ggml-metal.h) add_compile_definitions(GGML_USE_METAL) - add_compile_definitions(GGML_METAL_NDEBUG) + #add_compile_definitions(GGML_METAL_NDEBUG) # get full path to the file #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") diff --git a/src/ggml-backend.c b/src/ggml-backend.c index c5bc03280..187a149c4 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -369,10 +369,6 @@ ggml_backend_t ggml_backend_cpu_init(void) { return cpu_backend; } -bool ggml_backend_is_cpu(ggml_backend_t backend) { - return backend->interface.get_name == ggml_backend_cpu_name; -} - void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { 
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); @@ -385,6 +381,18 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size); } +bool ggml_backend_is_cpu(ggml_backend_t backend) { + return backend->interface.get_name == ggml_backend_cpu_name; +} + +bool ggml_backend_is_cuda(ggml_backend_t backend) { + return strcmp(ggml_backend_name(backend), "CUDA") == 0; +} + +bool ggml_backend_is_metal(ggml_backend_t backend) { + return strcmp(ggml_backend_name(backend), "Metal") == 0; +} + #if 0 // splits diff --git a/src/ggml-cuda.h b/src/ggml-cuda.h index 81ee9a2e9..57adc9cf3 100644 --- a/src/ggml-cuda.h +++ b/src/ggml-cuda.h @@ -46,7 +46,6 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description, // backend API GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use - #ifdef __cplusplus } #endif diff --git a/src/ggml-metal.h b/src/ggml-metal.h index 790cf0bf7..bc6773a6e 100644 --- a/src/ggml-metal.h +++ b/src/ggml-metal.h @@ -20,6 +20,7 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" #include #include @@ -35,10 +36,15 @@ struct ggml_cgraph; extern "C" { #endif -void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); +// +// internal API +// temporary exposed to user-code +// struct ggml_metal_context; +void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); + // number of command buffers to use struct ggml_metal_context * ggml_metal_init(int n_cb); void ggml_metal_free(struct ggml_metal_context * ctx); @@ -83,6 +89,15 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); // creates gf->n_threads command buffers in parallel void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); +// +// backend API +// user-code should use only these functions +// + +GGML_API ggml_backend_t ggml_backend_metal_init(void); + +GGML_API void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads); + #ifdef __cplusplus } #endif diff --git a/src/ggml-metal.m b/src/ggml-metal.m index 866fed434..a06b738a1 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -151,8 +151,6 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ } } - - struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: allocating\n", __func__); @@ -1371,3 +1369,136 @@ void ggml_metal_graph_compute( } } + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +static const char * ggml_backend_metal_name(ggml_backend_t backend) { + return "Metal"; + + UNUSED(backend); +} + +static void ggml_backend_metal_free(ggml_backend_t backend) { + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + ggml_metal_free(ctx); + free(backend); +} + +static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *)buffer->context; +} + +static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); + UNUSED(buffer); +} + +static struct ggml_backend_buffer_i metal_backend_buffer_i = { + /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, + /* .get_base = */ ggml_backend_metal_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, // no initialization required + /* .free_tensor = */ NULL, // no cleanup required +}; + 
+static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) { + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + + void * data = ggml_metal_host_malloc(size); + + // TODO: set proper name of the buffers + ggml_metal_add_buffer(ctx, "backend", data, size, 0); + + return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size); +} + +static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) { + return 32; + UNUSED(backend); +} + +static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + UNUSED(backend); +} + +static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + UNUSED(backend); +} + +static void ggml_backend_metal_synchronize(ggml_backend_t backend) { + UNUSED(backend); +} + +static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + + ggml_metal_graph_compute(metal_ctx, cgraph); +} + +static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return true; + UNUSED(backend); + UNUSED(op); +} + +static struct ggml_backend_i metal_backend_i = { + /* .get_name = */ ggml_backend_metal_name, + /* .free = */ ggml_backend_metal_free, + /* .alloc_buffer = */ ggml_backend_metal_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_get_alignment, + /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, + /* .synchronize = */ ggml_backend_metal_synchronize, + /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from, + /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to, + /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_metal_graph_compute, + /* .supports_op = */ ggml_backend_metal_supports_op, +}; + +ggml_backend_t ggml_backend_metal_init(void) { + struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + + ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); + + ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); + + *metal_backend = (struct ggml_backend) { + /* .interface = */ metal_backend_i, + /* .context = */ ctx, + }; + + return metal_backend; +} + +void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads) { + 
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + + ggml_metal_set_n_cb(ctx, n_threads); +} From ce797df6c3e03f23e6ba6222ca54f2111c12f93e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 10:57:23 +0300 Subject: [PATCH 17/23] gpt-2 : take advantage of Metal unified memory --- examples/gpt-2/main.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 25725a1d1..0b379a960 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -439,8 +439,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & ggml_allocr_alloc(alloc, tensor); - if (ggml_backend_is_cpu(model.backend)) { - // for the CPU backend, we can read directly into the tensor + if (ggml_backend_is_cpu (model.backend) || + ggml_backend_is_metal(model.backend)) { + // for the CPU and Metal backend, we can read directly into the tensor fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); } else { // read into a temporary buffer first, then copy to device memory From e8bc940ac6a2698fa14cd9ea8c7f463f8936b62d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 10:59:01 +0300 Subject: [PATCH 18/23] gpt-2 : remove TODO + update comment --- examples/gpt-2/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 0b379a960..f86fbf9f3 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -360,12 +360,12 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256); // allocate the tensors into the backend buffer - // TODO: better API for this { ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); // this updates the pointers in the tensors to point to the correct location in the buffer // this is necessary since the ggml_context is .no_alloc == true + // note that the buffer can actually be a device buffer, depending on the backend ggml_allocr_alloc(alloc, model.memory_k); ggml_allocr_alloc(alloc, model.memory_v); From 5ca14cedb4bbe692710a0e507bc59b0949bed11d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 11:05:36 +0300 Subject: [PATCH 19/23] ggml-backend : fix ggml_backend_is_xxx() interface --- examples/gpt-2/main.cpp | 6 +++++- include/ggml/ggml-backend.h | 7 ++----- src/ggml-backend.c | 16 ++++------------ src/ggml-metal.h | 2 ++ src/ggml-metal.m | 4 ++++ 5 files changed, 17 insertions(+), 18 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index f86fbf9f3..7a2d5374d 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -798,9 +798,13 @@ bool gpt2_eval( // run the computation if (ggml_backend_is_cpu(model.backend)) { ggml_backend_cpu_set_n_threads(model.backend, n_threads); - } else if (ggml_backend_is_metal(model.backend)) { + } +#ifdef GGML_USE_METAL + // TODO: not great - what should we do? 
+ if (ggml_backend_is_metal(model.backend)) { ggml_backend_metal_set_n_threads(model.backend, n_threads); } +#endif ggml_backend_graph_compute(model.backend, gf); //if (n_past%100 == 0) { diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 36457e991..22d324e9d 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -132,17 +132,14 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); + GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); /////////////////////////// - // TODO: we should probably do something better here - GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); - GGML_API bool ggml_backend_is_cuda (ggml_backend_t backend); - GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); - #if 0 // graph splitting #define GGML_MAX_SPLITS 200 diff --git a/src/ggml-backend.c b/src/ggml-backend.c index 187a149c4..c5bc03280 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -369,6 +369,10 @@ ggml_backend_t ggml_backend_cpu_init(void) { return cpu_backend; } +bool ggml_backend_is_cpu(ggml_backend_t backend) { + return backend->interface.get_name == ggml_backend_cpu_name; +} + void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); @@ -381,18 +385,6 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size); } -bool ggml_backend_is_cpu(ggml_backend_t backend) { - return backend->interface.get_name == ggml_backend_cpu_name; -} - -bool ggml_backend_is_cuda(ggml_backend_t backend) { - return strcmp(ggml_backend_name(backend), "CUDA") == 0; -} - -bool ggml_backend_is_metal(ggml_backend_t backend) { - return strcmp(ggml_backend_name(backend), "Metal") == 0; -} - #if 0 // splits diff --git a/src/ggml-metal.h b/src/ggml-metal.h index bc6773a6e..cb5646587 100644 --- a/src/ggml-metal.h +++ b/src/ggml-metal.h @@ -96,6 +96,8 @@ void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgrap GGML_API ggml_backend_t ggml_backend_metal_init(void); +GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); + GGML_API void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads); #ifdef __cplusplus diff --git a/src/ggml-metal.m b/src/ggml-metal.m index a06b738a1..055d137f8 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -1497,6 +1497,10 @@ ggml_backend_t ggml_backend_metal_init(void) { return metal_backend; } +bool ggml_backend_is_metal(ggml_backend_t backend) { + return backend->interface.get_name == ggml_backend_metal_name; +} + void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads) { struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; From b22916c3a3f4d0006e85ca4ee145456966fe553b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Oct 2023 11:33:32 +0300 Subject: [PATCH 20/23] gpt-2 : fix build --- examples/gpt-2/main.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 7a2d5374d..61b53296b 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -439,8 +439,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & ggml_allocr_alloc(alloc, 
tensor); - if (ggml_backend_is_cpu (model.backend) || - ggml_backend_is_metal(model.backend)) { + if (ggml_backend_is_cpu (model.backend) +#ifdef GGML_USE_METAL + || ggml_backend_is_metal(model.backend) +#endif + ) { // for the CPU and Metal backend, we can read directly into the tensor fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); } else { @@ -800,7 +803,6 @@ bool gpt2_eval( ggml_backend_cpu_set_n_threads(model.backend, n_threads); } #ifdef GGML_USE_METAL - // TODO: not great - what should we do? if (ggml_backend_is_metal(model.backend)) { ggml_backend_metal_set_n_threads(model.backend, n_threads); } From 01710cc037206eb1c66347b3df3888487d97551d Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 14:46:20 +0200 Subject: [PATCH 21/23] ggml-cuda : cleanup, fix case for src1 not contiguous --- src/ggml-cuda.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index f21cb1a1c..c8c36c573 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -6732,8 +6732,7 @@ static void ggml_cuda_op_mul_mat( if (convert_src1_to_q8_1) { src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); - // FIXME: why split only? src1 never gets quantized, breaks ggml-backend/GPT-2 - if (/*split &&*/ src1_on_device && src1_is_contiguous) { + if (src1_on_device && src1_is_contiguous) { quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } @@ -6815,7 +6814,7 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) { + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } From 9cb2626053cd532021342f2fc1ddd8de42535718 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 18:24:56 +0200 Subject: [PATCH 22/23] remove commented code --- include/ggml/ggml-backend.h | 49 +-------- src/ggml-alloc.c | 8 +- src/ggml-backend.c | 202 +----------------------------------- 3 files changed, 6 insertions(+), 253 deletions(-) diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h index 22d324e9d..9e0567c6b 100644 --- a/include/ggml/ggml-backend.h +++ b/include/ggml/ggml-backend.h @@ -132,56 +132,11 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); - GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); - GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); - - /////////////////////////// - -#if 0 - // graph splitting - #define GGML_MAX_SPLITS 200 - #define GGML_MAX_SPLIT_INPUTS 4 - - struct ggml_graph_split { - char name[GGML_MAX_NAME]; - struct ggml_context * ctx; - struct ggml_tensor * src_inputs[GGML_MAX_SPLIT_INPUTS + 1]; - struct ggml_tensor * dst_inputs[GGML_MAX_SPLIT_INPUTS + 1]; - struct ggml_cgraph * graph; - }; - - // TODO: this shouldn't be fixed size, allocate from ggml_context - struct ggml_graph_splits { - int n_splits; - struct ggml_graph_split splits[GGML_MAX_SPLITS]; - }; - - // TODO: allocate in ggml_context - GGML_API struct ggml_graph_splits ggml_graph_split_init(void); - - // this won't be needed once we can allocate graphs from a ggml_context - GGML_API 
void ggml_graph_splits_free(struct ggml_graph_splits * splits); - - // add a split to the graph - single and multiple inputs versions - GGML_API void ggml_graph_splits_add(struct ggml_graph_splits * splits, struct ggml_tensor ** input, struct ggml_context * ctx, const char * fmt, ...); - GGML_API void ggml_graph_splits_add_n(struct ggml_graph_splits * splits, struct ggml_tensor *** inputs, struct ggml_context * ctx, const char * fmt, ...); - - // build graphs for all splits - GGML_API void ggml_graph_splits_build_forward(struct ggml_graph_splits * splits, struct ggml_tensor * output); - - // compute - GGML_API void ggml_graph_splits_compute(struct ggml_graph_splits * splits); - - // graph tensor allocator - GGML_API void ggml_graph_allocate_tensors(struct ggml_cgraph * graph, struct ggml_context * ctx); - GGML_API void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits); - - // automatically split a graph into multiple graphs based on the location of the tensors - GGML_API struct ggml_graph_splits ggml_graph_split(struct ggml_cgraph * graph, struct ggml_context * ctx); -#endif + GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size); #ifdef __cplusplus } diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index a749c810d..e1b4377d6 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -269,7 +269,7 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) { } struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) { - struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size); + struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size); struct ggml_allocr * alloc = ggml_allocr_new_from_buffer(buffer); alloc->alignment = alignment; @@ -306,11 +306,7 @@ struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * bu struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); - // TODO: these should be set by the backend: - // - get_alignment() - // - get_alloc_size() - // TODO: support other backends - struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr((void *)0x1000, (size_t)-0x1001); + struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, (void *)0x1000, (size_t)-0x1001); *alloc = (struct ggml_allocr){ /*.buffer = */ buffer, diff --git a/src/ggml-backend.c b/src/ggml-backend.c index c5bc03280..f9e53a8a0 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -380,204 +380,6 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } -ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { - // TODO: NULL backend? 
- return ggml_backend_buffer_init(NULL, cpu_backend_buffer_i_from_ptr, ptr, size); +ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) { + return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size); } - -#if 0 -// splits - -struct ggml_graph_splits ggml_graph_split_init(void) { - struct ggml_graph_splits splits = {0}; - return splits; -} - -// TODO: this can be removed after allocating the graphs in a ggml_context -void ggml_graph_splits_free(struct ggml_graph_splits * splits) { - for (int i = 0; i < splits->n_splits; i++) { - if (splits->splits[i].graph) { - free(splits->splits[i].graph); - } - } -} - -static void ggml_graph_splits_add_n_va(struct ggml_graph_splits * splits, struct ggml_tensor *** inputs, struct ggml_context * ctx, const char * fmt, va_list args) { - GGML_ASSERT(splits->n_splits < GGML_MAX_SPLITS); - - struct ggml_graph_split * split = &splits->splits[splits->n_splits]; - - - if (splits->n_splits == 0) { - // always add the first split - int i = 0; - while (inputs[i] != NULL) { - GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS); - split->src_inputs[i] = *inputs[i]; - split->dst_inputs[i] = *inputs[i]; - i++; - } - split->src_inputs[i] = NULL; - split->dst_inputs[i] = NULL; - split->ctx = ctx; - } - // check if the split is on the same context as the previous one - else if (splits->n_splits > 0 && splits->splits[splits->n_splits - 1].ctx == ctx) { - // add to the previous split - char name[GGML_MAX_NAME - 2]; - int n = vsnprintf(name, sizeof(name), fmt, args); - char new_name[GGML_MAX_NAME]; - snprintf(new_name, sizeof(new_name), "%.*s,%s", GGML_MAX_NAME - n - 2, splits->splits[splits->n_splits - 1].name, name); - strcpy(splits->splits[splits->n_splits - 1].name, new_name); - return; - } else { - // add a new split - int i = 0; - while (inputs[i] != NULL) { - GGML_ASSERT(i < GGML_MAX_SPLIT_INPUTS); - split->src_inputs[i] = *inputs[i]; - split->dst_inputs[i] = ggml_dup_tensor(ctx, *inputs[i]); - ggml_format_name(split->dst_inputs[i], "%s (split output)", split->src_inputs[i]->name); - // TODO: maybe support different layouts in ggml_backend_cpy_tensor instead - for (int j = 0; j < GGML_MAX_DIMS; j++) { - split->dst_inputs[i]->nb[j] = split->src_inputs[i]->nb[j]; - } - ggml_set_name(split->dst_inputs[i], ggml_get_name(*inputs[i])); - *inputs[i] = split->dst_inputs[i]; - i++; - } - split->src_inputs[i] = NULL; - split->dst_inputs[i] = NULL; - split->ctx = ctx; - } - - vsnprintf(split->name, GGML_MAX_NAME, fmt, args); - split->graph = NULL; - splits->n_splits++; -} - -void ggml_graph_splits_add_n(struct ggml_graph_splits * splits, struct ggml_tensor *** input, struct ggml_context * ctx, const char * fmt, ...) { - va_list args; - va_start(args, fmt); - ggml_graph_splits_add_n_va(splits, input, ctx, fmt, args); - va_end(args); -} - -void ggml_graph_splits_add(struct ggml_graph_splits * splits, struct ggml_tensor ** input, struct ggml_context * ctx, const char * fmt, ...) 
{ - va_list args; - va_start(args, fmt); - ggml_graph_splits_add_n_va(splits, (struct ggml_tensor**[2]){ input, NULL }, ctx, fmt, args); - va_end(args); -} - -void ggml_graph_splits_build_forward(struct ggml_graph_splits * splits, struct ggml_tensor * output) { - struct ggml_tensor *last_outputs[2] = { output, NULL }; - struct ggml_tensor ** outputs; - - for (int i = 0; i < splits->n_splits; i++) { - struct ggml_graph_split * split = &splits->splits[i]; - - if (i < splits->n_splits - 1) { - outputs = splits->splits[i + 1].src_inputs; - } else { - outputs = last_outputs; - } - - // build the graph - // TODO: allocate graphs in context - split->graph = (struct ggml_cgraph *) malloc(sizeof(struct ggml_cgraph)); - memset(split->graph, 0, sizeof(struct ggml_cgraph)); - for (int j = 0; outputs[j] != NULL; j++) { - ggml_build_forward_expand(split->graph, outputs[j]); - } - - for (int j = 1; j < split->graph->n_nodes; j++) { - if (split->graph->nodes[j]->backend != split->graph->nodes[0]->backend) { - fprintf(stderr, "split %s: node %s has different backend (%s) than the first node (%s)\n", - split->name, split->graph->nodes[j]->name, - ggml_backend_name(split->graph->nodes[j]->backend_s), - ggml_backend_name(split->graph->nodes[0]->backend_s)); - } - } - for (int j = 1; j < split->graph->n_leafs; j++) { - if (split->graph->leafs[j]->backend != split->graph->leafs[0]->backend) { - fprintf(stderr, "split %s: leaf %s has different backend (%s) than the first leaf (%s)\n", - split->name, split->graph->leafs[j]->name, - ggml_backend_name(split->graph->leafs[j]->backend_s), - ggml_backend_name(split->graph->leafs[0]->backend_s)); - } - } - } -} - -void ggml_graph_splits_compute(struct ggml_graph_splits * splits) { - uint64_t copy_us = 0; - uint64_t compute_cpu_us = 0; - uint64_t compute_gpu_us = 0; - int n_nodes = 0; - for (int i = 0; i < splits->n_splits; i++) { - struct ggml_graph_split * split = &splits->splits[i]; - - //printf("computing split %i (%s) on backend %s (%i nodes)\n", i, split->name, ggml_backend_name(split->dst_inputs[0]->backend), split->graph->n_nodes); - - // copy the input tensor to the backend - uint64_t copy_start_us = ggml_time_us(); - for (int j = 0; split->src_inputs[j] != NULL; j++) { - //printf("\tcopying tensor %d (%s) (%s -> %s) (%lu bytes)\n", j, split->src_inputs[j]->name, ggml_backend_name(split->src_inputs[j]->backend), ggml_backend_name(split->dst_inputs[j]->backend), ggml_nbytes(split->src_inputs[j])); - //printf("%p %p\n", split->src_inputs[j], split->dst_inputs[j]); - ggml_backend_tensor_copy(split->src_inputs[j], split->dst_inputs[j]); - } - // ggml_backend_synchronize(split->dst_inputs[0]->backend); - copy_us += ggml_time_us() - copy_start_us; - -#if 0 - char split_filename[GGML_MAX_NAME]; - snprintf(split_filename, GGML_MAX_NAME, "split_%i.dot", i); - ggml_graph_dump_dot(split->graph, NULL, split_filename); -#endif - uint64_t start = ggml_time_us(); - ggml_backend_graph_compute(split->dst_inputs[0]->backend_s, split->graph); - //ggml_backend_synchronize(split->dst_inputs[0]->backend); - uint64_t end = ggml_time_us(); - if (strcmp(ggml_backend_name(split->dst_inputs[0]->backend_s), "CPU") == 0) { - compute_cpu_us += end - start; - } else { - compute_gpu_us += end - start; - } - - n_nodes += split->graph->n_nodes; - } - - //printf("ggml_graph_splits_compute: n_splits: %d, nodes: %d, copy: %.2fms, compute_cpu: %.2fms, compute_gpu: %.2fms\n", splits->n_splits, n_nodes, copy_us / 1000.0, compute_cpu_us / 1000.0, compute_gpu_us / 1000.0); - //exit(0); -} - -void 
ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits) { - // splits of the same backend are allocated together to ensure that dependencies from one split to the next - // are not overwritten when there is another split from a different backend between them (e.g. inpSA in llama.cpp) - bool visited[GGML_MAX_SPLITS] = {false}; - for (int i = 0; i < splits->n_splits; i++) { - if (!visited[i]) { - struct ggml_graph_split * split = &splits->splits[i]; - struct ggml_context * ctx = split->ctx; - struct ggml_cgraph * backend_graphs[GGML_MAX_SPLITS]; - struct ggml_tensor ** graph_inputs[GGML_MAX_SPLITS]; - struct ggml_tensor ** graph_outputs[GGML_MAX_SPLITS]; - int n_graphs = 0; - - for (int j = i; j < splits->n_splits; j++) { - if (splits->splits[j].ctx == ctx) { - graph_inputs[n_graphs] = splits->splits[j].dst_inputs; - graph_outputs[n_graphs] = j < splits->n_splits - 1 ? splits->splits[j + 1].src_inputs : NULL; - backend_graphs[n_graphs] = splits->splits[j].graph; - visited[j] = true; - n_graphs++; - } - } - - struct ggml_allocr * alloc = NULL; - ggml_allocr_alloc_graph_n(alloc, backend_graphs, n_graphs, graph_inputs, graph_outputs); - } - } -} -#endif From 1ad7c5ee442b797a2ce1992772991ecc216b54b9 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 6 Oct 2023 18:46:29 +0200 Subject: [PATCH 23/23] rename ggml_backend_metal_set_n_threads to n_cb --- examples/gpt-2/main.cpp | 2 +- src/ggml-metal.h | 2 +- src/ggml-metal.m | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 61b53296b..0acb3a1b1 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -804,7 +804,7 @@ bool gpt2_eval( } #ifdef GGML_USE_METAL if (ggml_backend_is_metal(model.backend)) { - ggml_backend_metal_set_n_threads(model.backend, n_threads); + ggml_backend_metal_set_n_cb(model.backend, n_threads); } #endif ggml_backend_graph_compute(model.backend, gf); diff --git a/src/ggml-metal.h b/src/ggml-metal.h index cb5646587..096b844e3 100644 --- a/src/ggml-metal.h +++ b/src/ggml-metal.h @@ -98,7 +98,7 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); -GGML_API void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads); +GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); #ifdef __cplusplus } diff --git a/src/ggml-metal.m b/src/ggml-metal.m index 055d137f8..e56436394 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -1501,8 +1501,8 @@ bool ggml_backend_is_metal(ggml_backend_t backend) { return backend->interface.get_name == ggml_backend_metal_name; } -void ggml_backend_metal_set_n_threads(ggml_backend_t backend, int n_threads) { +void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; - ggml_metal_set_n_cb(ctx, n_threads); + ggml_metal_set_n_cb(ctx, n_cb); }
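For reference, here is a minimal sketch (not part of the patch series itself) of the per-backend configuration pattern that the gpt-2 example ends up with after PATCH 19 and PATCH 23: the caller checks the backend type and then sets either the CPU thread count or the Metal command-buffer count before calling ggml_backend_graph_compute(). The helper name set_backend_threads is hypothetical; the ggml calls and headers are the ones introduced in the patches above.

#include "ggml/ggml.h"
#include "ggml/ggml-backend.h"

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

// hypothetical helper: configure the backend-specific knob before graph compute
static void set_backend_threads(ggml_backend_t backend, int n_threads) {
    if (ggml_backend_is_cpu(backend)) {
        // the CPU backend exposes a thread count
        ggml_backend_cpu_set_n_threads(backend, n_threads);
    }
#ifdef GGML_USE_METAL
    if (ggml_backend_is_metal(backend)) {
        // the Metal backend exposes a number of command buffers (n_cb) instead
        ggml_backend_metal_set_n_cb(backend, n_threads);
    }
#endif
    // followed by: ggml_backend_graph_compute(backend, gf);
}

Note that after PATCH 19 the ggml_backend_is_xxx() checks are implemented inside each backend (by comparing the interface's get_name pointer) and declared in that backend's own header rather than in ggml-backend.h, which is why the Metal branch above has to remain guarded by GGML_USE_METAL in application code.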