From 5d124d0cb4ebf834aa136aade847092777078c35 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 15 Jun 2023 20:34:56 +0200 Subject: [PATCH 001/235] fix track_max_mem in forward_batch_wo_cache_flash_attn_train --- .../train-text-from-scratch.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 54dc2beed0080..828a2a9b76bda 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1445,17 +1445,22 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( const int n_ff = get_n_ff(&hparams); const int rope_mode = 0; + bool track_max_mem = true; + int last_buf = -1; size_t buf_offs[2] = { 0, 0 }; size_t buf_size[2] = { size_buf_0, size_buf_1 }; void * buf_data[2] = { compute_buf_0, compute_buf_1 }; - auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data] (int buf) { + size_t buf_maxs[2] = { 0, 0 }; + + auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) { size_t last_offs = 0; last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); if (last_buf >= 0) { buf_offs[last_buf] = last_offs; + buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); } if (buf >= 0) { size_t offs = buf_offs[buf]; @@ -1466,8 +1471,6 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( last_buf = buf; }; - bool track_max_mem = false; - size_t buf_maxs[2] = { 0, 0 }; auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { if (buf < 0) return; @@ -1903,6 +1906,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( *logits = t35; + clr_buf(0); + clr_buf(1); + if (track_max_mem) { printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); From d39c8e686375b4e2dedbf98e2e11b12b1aef2526 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 15 Jun 2023 21:07:56 +0200 Subject: [PATCH 002/235] remove unnecessary Adam(W) optimizer tensors. reduces optimizer memory overhead from 7*modelsize to 2*modelsize. additionally allows optimizing models with more than 2^31 parameters by replacing int with int64_t. bumps training checkpoint file version, but old checkpoints can still be read. new version with fewer tensors is saved.
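[Editor's note] The 7*modelsize -> 2*modelsize reduction comes from fusing the Adam(W) update into a single per-parameter loop: only the moment tensors m and v have to persist across iterations, while the parameter view x, the gradient copies g1/g2 and the bias-corrected moments mh/vh become per-element temporaries. A minimal standalone sketch of that idea, assuming plain float arrays (the function name and signature are illustrative, not llama.cpp API; the math mirrors the loop this patch adds to ggml_opt_adam):

#include <math.h>    // powf, sqrtf
#include <stdint.h>  // int64_t

// Persistent optimizer state is only m[] and v[] (2*nx floats); mh/vh are
// computed per element instead of being stored as two more full-size tensors.
// iter is the 1-based Adam iteration count.
static void adamw_step_sketch(int64_t nx, float * x, const float * g,
                              float * m, float * v, int iter, float alpha,
                              float beta1, float beta2, float eps, float decay) {
    const float beta1h = alpha/(1.0f - powf(beta1, iter)); // bias correction folded into lr
    const float beta2h =  1.0f/(1.0f - powf(beta2, iter));
    for (int64_t i = 0; i < nx; ++i) {
        m[i] = m[i]*beta1 + g[i]*(1.0f - beta1);      // first moment
        v[i] = v[i]*beta2 + g[i]*g[i]*(1.0f - beta2); // second moment
        const float mh = m[i]*beta1h;                 // bias-corrected first moment
        const float vh = sqrtf(v[i]*beta2h) + eps;    // bias-corrected second moment
        x[i] = x[i]*(1.0f - decay) - mh/vh;           // decoupled weight decay (AdamW)
    }
}

Indexing with int64_t rather than int is what lifts the 2^31 parameter limit mentioned above.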
--- .../train-text-from-scratch.cpp | 105 +++++++++++++++--- ggml.c | 96 ++++++++-------- ggml.h | 5 - 3 files changed, 136 insertions(+), 70 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 828a2a9b76bda..60d2b57838e65 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2406,8 +2406,27 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { file->read_raw(tensor->data, ggml_nbytes(tensor)); } +void skip_tensor(struct llama_file * file) { + int32_t nd = file->read_u32(); + + uint32_t name_len = file->read_u32(); + enum ggml_type type = (enum ggml_type) file->read_u32(); + + uint32_t ne[4] = { 1, 1, 1, 1 }; + + file->read_raw(ne, sizeof(ne[0]) * nd); + + std::string name = file->read_string(name_len); + + file->seek(-file->tell() & 31, SEEK_CUR); + + size_t nelements = ne[0]*ne[1]*ne[2]*ne[3]; + size_t nbytes = nelements*ggml_type_size(type)/ggml_blck_size(type); + file->seek(nbytes, SEEK_CUR); +} + void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { - const uint32_t version = 0; + const uint32_t version = 1; GGML_ASSERT(opt->nx >= 0); GGML_ASSERT(opt->iter >= 0); file->write_u32(version); @@ -2418,14 +2437,10 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) switch (opt->params.type) { case GGML_OPT_ADAM: { - GGML_ASSERT(opt->adam.x != NULL); - write_tensor(file, opt->adam.x); - write_tensor(file, opt->adam.g1); - write_tensor(file, opt->adam.g2); + GGML_ASSERT(opt->adam.m != NULL); + GGML_ASSERT(opt->adam.v != NULL); write_tensor(file, opt->adam.m); write_tensor(file, opt->adam.v); - write_tensor(file, opt->adam.mh); - write_tensor(file, opt->adam.vh); write_tensor(file, opt->adam.pf); file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); @@ -2433,7 +2448,7 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) } break; case GGML_OPT_LBFGS: { - GGML_ASSERT(opt->adam.x != NULL); + GGML_ASSERT(opt->lbfgs.x != NULL); write_tensor(file, opt->lbfgs.x); write_tensor(file, opt->lbfgs.xp); write_tensor(file, opt->lbfgs.g); @@ -2454,10 +2469,53 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) } } -void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - uint32_t version = file->read_u32(); - GGML_ASSERT(version == 0); +void read_opt_context_v0(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { + file->read_raw(&opt->params, sizeof(opt->params)); + file->read_raw(&opt->nx, sizeof(opt->nx)); + ggml_opt_init(ctx, opt, opt->params, opt->nx); + + file->read_raw(&opt->iter, sizeof(opt->iter)); + opt->just_initialized = (bool) file->read_u32(); + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + skip_tensor(file); + skip_tensor(file); + skip_tensor(file); + read_tensor(file, opt->adam.m); + read_tensor(file, opt->adam.v); + skip_tensor(file); + skip_tensor(file); + if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } + file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->lbfgs.x != 
NULL); + read_tensor(file, opt->lbfgs.x); + read_tensor(file, opt->lbfgs.xp); + read_tensor(file, opt->lbfgs.g); + read_tensor(file, opt->lbfgs.gp); + read_tensor(file, opt->lbfgs.d); + if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } + read_tensor(file, opt->lbfgs.lmal); + read_tensor(file, opt->lbfgs.lmys); + read_tensor(file, opt->lbfgs.lms); + read_tensor(file, opt->lbfgs.lmy); + file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { file->read_raw(&opt->params, sizeof(opt->params)); file->read_raw(&opt->nx, sizeof(opt->nx)); ggml_opt_init(ctx, opt, opt->params, opt->nx); @@ -2468,13 +2526,8 @@ void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struc switch (opt->params.type) { case GGML_OPT_ADAM: { - read_tensor(file, opt->adam.x); - read_tensor(file, opt->adam.g1); - read_tensor(file, opt->adam.g2); read_tensor(file, opt->adam.m); read_tensor(file, opt->adam.v); - read_tensor(file, opt->adam.mh); - read_tensor(file, opt->adam.vh); if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); @@ -2482,7 +2535,7 @@ void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struc } break; case GGML_OPT_LBFGS: { - GGML_ASSERT(opt->adam.x != NULL); + GGML_ASSERT(opt->lbfgs.x != NULL); read_tensor(file, opt->lbfgs.x); read_tensor(file, opt->lbfgs.xp); read_tensor(file, opt->lbfgs.g); @@ -2503,6 +2556,24 @@ void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struc } } +void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { + uint32_t version = file->read_u32(); + switch (version) { + case 0: + { + read_opt_context_v0(file, ctx, opt); + } break; + case 1: + { + read_opt_context_v1(file, ctx, opt); + } break; + default: + { + fprintf(stderr, "%s: unknown version %ud\n", __func__, version); + } + } +} + void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename) { struct llama_file file(filename, "wb"); if (file.fp == NULL) { diff --git a/ggml.c b/ggml.c index b77f9926754ed..143f88d4a657c 100644 --- a/ggml.c +++ b/ggml.c @@ -17329,7 +17329,7 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_tensor * ps[GGML_MAX_PARAMS]; int np = 0; - int nx = 0; + int64_t nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { if (gf->nodes[i]->is_param) { GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); @@ -17355,19 +17355,11 @@ static enum ggml_opt_result ggml_opt_adam( const float beta2 = params.adam.beta2; const float eps = params.adam.eps; - float * x = opt->adam.x->data; // view of the parameters - float * g1 = opt->adam.g1->data; // gradient - float * g2 = opt->adam.g2->data; // gradient squared float * m = opt->adam.m->data; // first moment float * v = opt->adam.v->data; // second moment - float * mh = opt->adam.mh->data; // first moment hat - float * vh = opt->adam.vh->data; // second moment hat float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values - // update view - ggml_opt_get_params(np, ps, x); - // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); @@ -17412,43 +17404,61 @@ static enum ggml_opt_result ggml_opt_adam( UNUSED(t_start_cpu); { - // update the gradient - ggml_opt_get_grad(np, ps, g1); + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]) ; + for (int64_t j = 0; j < ne; ++j) { + float x = ggml_get_f32_1d(ps[p], j); + float g = ggml_get_f32_1d(ps[p]->grad, j); + m[i] = m[i]*beta1 + g*(1.0f - beta1); + v[i] = v[i]*beta2 + g*g*(1.0f - beta2); + float mh = m[i]*alpha/(1.0f - powf(beta1, opt->iter)); + float vh = v[i]*1.0f /(1.0f - powf(beta2, opt->iter)); + vh = sqrtf(vh) + eps; + x = x*(1.0f - decay) - mh/vh; + ggml_set_f32_1d(ps[p], j, x); + ++i; + } + } + } + // { + // // update the gradient + // ggml_opt_get_grad(np, ps, g1); - // m_t = beta1*m_t-1 + (1 - beta1)*g_t - ggml_vec_scale_f32(nx, m, beta1); - ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1); + // // m_t = beta1*m_t-1 + (1 - beta1)*g_t + // ggml_vec_scale_f32(nx, m, beta1); + // ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1); - // g2 = g1^2 - ggml_vec_sqr_f32 (nx, g2, g1); + // // g2 = g1^2 + // ggml_vec_sqr_f32 (nx, g2, g1); - // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2 - ggml_vec_scale_f32(nx, v, beta2); - ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2); + // // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2 + // ggml_vec_scale_f32(nx, v, beta2); + // ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2); - // m^hat = m_t / (1 - beta1^t) - // v^hat = v_t / (1 - beta2^t) - // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1) - // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1 - // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps) - // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps) - // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay) - ggml_vec_cpy_f32 (nx, mh, m); - ggml_vec_cpy_f32 (nx, vh, v); + // // m^hat = m_t / (1 - beta1^t) + // // v^hat = v_t / (1 - beta2^t) + // // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1) + // // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1 + // // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps) + // // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps) + // // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay) + // ggml_vec_cpy_f32 (nx, mh, m); + // ggml_vec_cpy_f32 (nx, vh, v); - ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter))); - ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter))); + // ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter))); + // ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter))); - ggml_vec_sqrt_f32 (nx, vh, vh); - ggml_vec_acc1_f32 (nx, vh, eps); + // ggml_vec_sqrt_f32 (nx, vh, vh); + // ggml_vec_acc1_f32 (nx, vh, eps); - ggml_vec_div_f32 (nx, mh, mh, vh); - ggml_vec_scale_f32(nx, x, 1.0f - decay); - ggml_vec_sub_f32 (nx, x, x, mh); + // ggml_vec_div_f32 (nx, mh, mh, vh); + // ggml_vec_scale_f32(nx, x, 1.0f - decay); + // ggml_vec_sub_f32 (nx, x, x, mh); - // update the parameters - ggml_opt_set_params(np, ps, x); - } + // // update the parameters + // ggml_opt_set_params(np, ps, x); + // } ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); @@ -17941,23 +17951,13 @@ GGML_API void ggml_opt_init( switch (opt->params.type) { case 
GGML_OPT_ADAM: { - opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) : NULL; - ggml_set_zero(opt->adam.x); - ggml_set_zero(opt->adam.g1); - ggml_set_zero(opt->adam.g2); ggml_set_zero(opt->adam.m); ggml_set_zero(opt->adam.v); - ggml_set_zero(opt->adam.mh); - ggml_set_zero(opt->adam.vh); if (opt->adam.pf) { ggml_set_zero(opt->adam.pf); } diff --git a/ggml.h b/ggml.h index 9919cce7c263f..531b6cb07d81d 100644 --- a/ggml.h +++ b/ggml.h @@ -1537,13 +1537,8 @@ extern "C" { bool just_initialized; struct { - struct ggml_tensor * x; // view of the parameters - struct ggml_tensor * g1; // gradient - struct ggml_tensor * g2; // gradient squared struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment - struct ggml_tensor * mh; // first moment hat - struct ggml_tensor * vh; // second moment hat struct ggml_tensor * pf; // past function values float fx_best; float fx_prev; From d395b19c8c400bc2f9197b95bdbae5122010370f Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 15 Jun 2023 23:48:46 +0200 Subject: [PATCH 003/235] add gradient clipping to AdamW --- .../train-text-from-scratch.cpp | 78 +++++++++++++++++-- ggml.c | 28 ++++++- ggml.h | 1 + 3 files changed, 98 insertions(+), 9 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 60d2b57838e65..a4a6b05b184b0 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2430,7 +2430,8 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) GGML_ASSERT(opt->nx >= 0); GGML_ASSERT(opt->iter >= 0); file->write_u32(version); - file->write_raw(&opt->params, sizeof(opt->params)); + file->write_u32(opt->params.past); + file->write_u32(opt->params.lbfgs.m); file->write_raw(&opt->nx, sizeof(opt->nx)); file->write_raw(&opt->iter, sizeof(opt->iter)); file->write_u32((uint32_t) opt->just_initialized); @@ -2469,9 +2470,44 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) } } +struct ggml_opt_params_v0 { + enum ggml_opt_type type; + int n_threads; + int past; + float delta; + int max_no_improvement; + bool print_forward_graph; + bool print_backward_graph; + struct { + int n_iter; + float sched; + float decay; + float alpha; + float beta1; + float beta2; + float eps; + float eps_f; + float eps_g; + } adam; + struct { + int m; + int n_iter; + int max_linesearch; + float eps; + float ftol; + float wolfe; + float min_step; + float max_step; + enum ggml_linesearch linesearch; + } lbfgs; +}; + void read_opt_context_v0(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - file->read_raw(&opt->params, sizeof(opt->params)); - file->read_raw(&opt->nx, sizeof(opt->nx)); + ggml_opt_params_v0 pv0; + file->read_raw(&pv0, sizeof(pv0)); + opt->params.past = pv0.past; + opt->params.lbfgs.m = pv0.lbfgs.m; + file->read_raw(&opt->nx, sizeof(opt->nx)); ggml_opt_init(ctx, opt, opt->params, opt->nx); file->read_raw(&opt->iter, sizeof(opt->iter)); @@ 
-2516,7 +2552,8 @@ void read_opt_context_v0(struct llama_file * file, struct ggml_context * ctx, st } void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - file->read_raw(&opt->params, sizeof(opt->params)); + opt->params.past = (int) file->read_u32(); + opt->params.lbfgs.m = (int) file->read_u32(); file->read_raw(&opt->nx, sizeof(opt->nx)); ggml_opt_init(ctx, opt, opt->params, opt->nx); @@ -2558,6 +2595,7 @@ void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, st void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { uint32_t version = file->read_u32(); + printf("%s: opt context version %u\n", __func__, version); switch (version) { case 0: { @@ -2569,7 +2607,7 @@ void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struc } break; default: { - fprintf(stderr, "%s: unknown version %ud\n", __func__, version); + fprintf(stderr, "%s: unknown version %u\n", __func__, version); } } } @@ -2783,6 +2821,9 @@ struct train_params { int adam_n_iter; float adam_alpha; float adam_decay; + float adam_beta1; + float adam_beta2; + float adam_gclip; int mem_model_gb; int mem_compute_gb; @@ -2830,6 +2871,9 @@ struct train_params get_default_train_params() { params.adam_n_iter = 16; params.adam_alpha = 1e-3f; params.adam_decay = 1e-3f; + params.adam_beta1 = 0.9f; + params.adam_beta2 = 0.999f; + params.adam_gclip = 1.0f; params.mem_model_gb = 2; params.mem_compute_gb = 24; @@ -2877,6 +2921,9 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); + fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); + fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. 
(default %d)\n", params->mem_compute0_gb); @@ -3066,6 +3113,24 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->adam_decay = std::stof(argv[i]); + } else if (arg == "--adam-beta1") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_beta1 = std::stof(argv[i]); + } else if (arg == "--adam-beta2") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_beta2 = std::stof(argv[i]); + } else if (arg == "--adam-gclip") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_gclip = std::stof(argv[i]); } else if (arg == "--mem-model") { if (++i >= argc) { invalid_param = true; @@ -3212,6 +3277,9 @@ int main(int argc, char ** argv) { opt_params_adam.adam.sched = 1.0f; opt_params_adam.adam.alpha = params.adam_alpha; opt_params_adam.adam.decay = params.adam_decay; + opt_params_adam.adam.beta1 = params.adam_beta1; + opt_params_adam.adam.beta2 = params.adam_beta2; + opt_params_adam.adam.gclip = params.adam_gclip; opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; diff --git a/ggml.c b/ggml.c index 143f88d4a657c..19a194beb2542 100644 --- a/ggml.c +++ b/ggml.c @@ -17354,6 +17354,7 @@ static enum ggml_opt_result ggml_opt_adam( const float beta1 = params.adam.beta1; const float beta2 = params.adam.beta2; const float eps = params.adam.eps; + const float gclip = params.adam.gclip; float * m = opt->adam.m->data; // first moment float * v = opt->adam.v->data; // second moment @@ -17404,16 +17405,34 @@ static enum ggml_opt_result ggml_opt_adam( UNUSED(t_start_cpu); { + float gnorm = 1.0f; + if (gclip > 0.0f) { + // gradient clipping + ggml_float sum = 0.0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]); + for (int64_t j = 0; j < ne; ++j) { + float g = ggml_get_f32_1d(ps[p]->grad, j); + sum += g*g; + } + } + ggml_float norm = sqrt(sum); + if (norm > (ggml_float) gclip) { + gnorm = (float) ((ggml_float) gclip / norm); + } + } + const float beta1h = alpha/(1.0f - powf(beta1, opt->iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); int64_t i = 0; for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]) ; + const int64_t ne = ggml_nelements(ps[p]); for (int64_t j = 0; j < ne; ++j) { float x = ggml_get_f32_1d(ps[p], j); - float g = ggml_get_f32_1d(ps[p]->grad, j); + float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm; m[i] = m[i]*beta1 + g*(1.0f - beta1); v[i] = v[i]*beta2 + g*g*(1.0f - beta2); - float mh = m[i]*alpha/(1.0f - powf(beta1, opt->iter)); - float vh = v[i]*1.0f /(1.0f - powf(beta2, opt->iter)); + float mh = m[i]*beta1h; + float vh = v[i]*beta2h; vh = sqrtf(vh) + eps; x = x*(1.0f - decay) - mh/vh; ggml_set_f32_1d(ps[p], j, x); @@ -17902,6 +17921,7 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .eps = 1e-8f, .eps_f = 1e-5f, .eps_g = 1e-3f, + .gclip = 0.0f, }, }; } break; diff --git a/ggml.h b/ggml.h index 531b6cb07d81d..460976468a056 100644 --- a/ggml.h +++ b/ggml.h @@ -1509,6 +1509,7 @@ extern "C" { float eps; // epsilon for numerical stability float eps_f; // epsilon for convergence test float eps_g; // epsilon for convergence test + float gclip; // gradient clipping } adam; // LBFGS parameters From d7003a98cceda5fe5926baf3dcad666a311dbe40 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 17 Jun 2023 18:56:27 +0200 Subject: [PATCH 004/235] Fix reset of unused g->nodes and g->grads to NULL --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 1 
file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index a4a6b05b184b0..c76a80c757ba0 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1403,8 +1403,8 @@ void graph_set_leafs_grads(struct ggml_cgraph * g) { } } for (int i=n_nodes; i < g->n_nodes; ++i) { - g->nodes[n_nodes] = NULL; - g->grads[n_nodes] = NULL; + g->nodes[i] = NULL; + g->grads[i] = NULL; } g->n_nodes = n_nodes; } From 6e3f95bf06ea102ac71e7b5bae5ddfaae7c89bc4 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 28 Jul 2023 23:06:05 +0200 Subject: [PATCH 005/235] implement gradient checkpointing for training reduces memory overhead from O(n_layer) to O(sqrt(n_layer)) as explained in readme of https://github.com/cybertronai/gradient-checkpointing --- .../train-text-from-scratch.cpp | 607 +++++++++++++++++- 1 file changed, 597 insertions(+), 10 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index c76a80c757ba0..faa60ec8bf320 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1921,6 +1921,556 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( return t36; } +struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( + struct my_llama_model * model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_tensor * * logits, + struct ggml_tensor * tokens_input, + struct ggml_tensor * targets, + void * compute_buf_0, + void * compute_buf_1, + void * compute_buf_2, + void * compute_buf_3, + size_t size_buf_0, + size_t size_buf_1, + size_t size_buf_2, + size_t size_buf_3, + const int n_tokens, + const int n_batch) { + + // implements gradient-checkpointing as explained in readme of https://github.com/cybertronai/gradient-checkpointing + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + const int n_past = 0; + const int N = n_tokens; + + gf->n_nodes = 0; + gf->n_leafs = 0; + gf->perf_runs = 0; + gf->perf_cycles = 0; + gf->perf_time_us = 0; + + const auto & hparams = model->hparams; + //const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + const int rope_mode = 0; + + bool track_max_mem = true; + + int last_buf = -1; + size_t buf_offs[4] = { 0, 0, 0, 0 }; + size_t buf_size[4] = { size_buf_0, + size_buf_1, + size_buf_2, + size_buf_3 }; + void * buf_data[4] = { compute_buf_0, + compute_buf_1, + compute_buf_2, + compute_buf_3 }; + size_t buf_maxs[4] = { 0, 0, 0, 0 }; + + auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) { + size_t last_offs = 0; + last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + if (last_buf >= 0) { + buf_offs[last_buf] = last_offs; + buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); + } + if (buf >= 0) { + size_t offs = buf_offs[buf]; + size_t size = buf_size[buf]; + void * data = buf_data[buf]; + ggml_set_scratch(ctx0, { offs, size, data, }); + } + last_buf = buf; + }; + + + auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] 
(int buf) { + if (buf < 0) return; + if (track_max_mem) { + size_t last_offs = 0; + last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + if (last_buf >= 0) { + buf_offs[last_buf] = last_offs; + buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); + } + } + buf_offs[buf] = 0; + if (track_max_mem && last_buf >= 0) { + size_t offs = buf_offs[last_buf]; + size_t size = buf_size[last_buf]; + void * data = buf_data[last_buf]; + ggml_set_scratch(ctx0, { offs, size, data, }); + } + }; + + + auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = n_embd/n_head; + int64_t ne1 = N; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = 0; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = n_embd/n_head; + int64_t ne1 = N; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = nb3*ne3; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = N; + int64_t ne1 = n_embd/n_head; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = 2*nb3*ne3; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * { + if (a == NULL) { + return b; + } else { + return ggml_add_inplace(ctx0, a, b); + } + }; + + use_buf(-1); + + model->tok_embeddings->grad = NULL; + model->norm->grad = NULL; + model->output->grad = NULL; + + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + layer.attention_norm->grad = NULL; + layer.wq->grad = NULL; + layer.wk->grad = NULL; + layer.wv->grad = NULL; + layer.wo->grad = NULL; + layer.ffn_norm->grad = NULL; + layer.w1->grad = NULL; + layer.w2->grad = NULL; + layer.w3->grad = NULL; + } + + clr_buf(0); + clr_buf(1); + clr_buf(2); + clr_buf(3); + + use_buf(-1); + + struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch); + memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch); + + use_buf(-1); + + struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); + + + std::vector<int> checkpoints; + // for (int il = 0; il < n_layer; ++il) { + // checkpoints.push_back(il); + // } + // n_check: number of layers between checkpoints + int n_check = (int)(sqrtf(n_layer) + 0.5f); + printf("%s: n_check = %d\n", __func__, n_check); + for (int chk = n_check-1; chk+1 < n_layer; chk += n_check) { + checkpoints.push_back(chk); + } + + for (int i = 0; i < checkpoints.size(); ++i) { + printf("%s: checkpoint #%d = %d\n", __func__, i, checkpoints[i]); + } + + // example for 16 layers: + // inp ~ implicit zeroth checkpoint == input + // L00 f 4b + // L01 f 4b + // L02 f 4b + // L03 fc4b first checkpoint + // L04 f 3b + // L05 f 3b + // L06 f 3b + // L07 fc3b second checkpoint + // L08 f 2b + 
// L09 f 2b + // L10 f 2b + // L11 fc2b third checkpoint + // L12 f 1b + // L13 f 1b + // L14 f 1b + // L15 f 1b + + // need to remember these for the backward pass + std::vector<struct ggml_tensor *> t02L; t02L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t03L; t03L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t04L; t04L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t05L; t05L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t06L; t06L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t07L; t07L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t08L; t08L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t09L; t09L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t10L; t10L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t11L; t11L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t12L; t12L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t13L; t13L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t14L; t14L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t15L; t15L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t16L; t16L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t17L; t17L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t18L; t18L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t19L; t19L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t20L; t20L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t21L; t21L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t22L; t22L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t23L; t23L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t24L; t24L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t25L; t25L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t26L; t26L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t27L; t27L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t28L; t28L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t29L; t29L.resize(n_layer, NULL); + std::vector<struct ggml_tensor *> t30L; t30L.resize(n_layer, NULL); + + struct ggml_tensor * cur = t01; + + + int chk_idx = 0; + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + // tensors with values necessary for backward pass are in persistent buf(-1) + // other tensors with buf(0), buf(1), etc are only temporary needed, and their memory reused + bool is_checkpoint = (chk_idx < checkpoints.size() && il == checkpoints[chk_idx]); + if (is_checkpoint) { + printf("%s: layer %d is_checkpoint\n", __func__, il); + chk_idx += 1; + } + const int prs = 0; // in first forward pass even persistent tensors are only temporary + const int tmp = 0; // temporary + // nxt is required to compute next layer. + // for checkpoints we need to remember this for usage in backward pass, + // otherwise temporary until next of this kind + const int nxt = is_checkpoint ? 
-1 : 1; + clr_buf(0); + use_buf(prs); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); + use_buf(tmp); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); + use_buf(prs); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + use_buf(prs); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + use_buf(prs); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + use_buf(prs); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + use_buf(prs); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + use_buf(tmp); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); + use_buf(tmp); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); + use_buf(tmp); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); + use_buf(prs); struct ggml_tensor * t26 = 
expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); + use_buf(prs); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); + use_buf(prs); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); + use_buf(tmp); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); + clr_buf( 1); + use_buf(nxt); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); + + // only t30L is remembered for checkpointing in first forward pass + if (is_checkpoint) { + t30L[il] = t30; + } + cur = t30; + } + clr_buf(0); + use_buf(0); + struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); + use_buf(-1); + struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); + + *gb = *gf; + + // t36->grad gets set to one by optimizer, so we need the tensor. + // initialize it with 1.0f to make sure. + use_buf(-1); + t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); + + use_buf(0); + t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); + t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); + t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); + t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); + + use_buf(-1); + + model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); + model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); + + clr_buf(1); + use_buf(1); + t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); + + struct ggml_tensor * back_layer_inp = t31; + struct ggml_tensor * grad_layer_inp = NULL; + + printf("%s: checkpoints.size() = %zu\n", __func__, checkpoints.size()); + chk_idx = checkpoints.size()-1; + int avail_begin = n_layer; + int avail_end = n_layer; + printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); + for (int k = 0; k < n_layer; ++k) { + // second forward pass for checkpointing + int il = n_layer-1-k; + if (il < avail_begin) { + // make sure, that txxL[il] is available + // forward pass from last checkpoint + GGML_ASSERT(chk_idx >= -1); + int begin = (chk_idx == -1) + ? 0 + : checkpoints[chk_idx] + 1; // checkpoint[chk_idx] contains t30 for computing following layers -> +1 + int end = (chk_idx+1 < checkpoints.size()) + ? 
(checkpoints[chk_idx+1] + 1) + : n_layer; + GGML_ASSERT(begin <= il); + GGML_ASSERT(il < end); + cur = (chk_idx == -1) ? t01 : t30L[checkpoints[chk_idx]]; + clr_buf(2); + printf("%s: second forward pass chk_idx=%d begin=%d end=%d\n", __func__, chk_idx, begin, end); + for (int i = begin; i < end; ++i) { + struct my_llama_layer & layer = model->layers[i]; + const int prs = 2; // persistent until next checkpoint + const int tmp = 0; // temporary for this layer + const bool is_checkpoint = (i == end-1); + clr_buf(0); + use_buf(prs); struct ggml_tensor * t02 = expand(gb, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); + use_buf(tmp); struct ggml_tensor * t03 = expand(gb, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t04 = expand(gb, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t05 = expand(gb, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t06 = expand(gb, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t07 = expand(gb, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t08 = expand(gb, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t09 = expand(gb, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t10 = expand(gb, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t11 = expand(gb, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); + use_buf(prs); struct ggml_tensor * t12 = expand(gb, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + use_buf(prs); struct ggml_tensor * t13 = expand(gb, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + use_buf(prs); struct ggml_tensor * t14 = expand(gb, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + use_buf(prs); struct ggml_tensor * t15 = expand(gb, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + use_buf(prs); struct ggml_tensor * t16 = expand(gb, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + use_buf(tmp); struct ggml_tensor * t17 = expand(gb, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t18 = expand(gb, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t19 = expand(gb, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); + use_buf(tmp); struct ggml_tensor * t20 = expand(gb, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t21 = expand(gb, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t22 = expand(gb, ggml_rms_norm (ctx0, t21, 
rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); + use_buf(tmp); struct ggml_tensor * t23 = expand(gb, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t24 = expand(gb, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); + use_buf(prs); struct ggml_tensor * t25 = expand(gb, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); + use_buf(prs); struct ggml_tensor * t26 = expand(gb, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); + use_buf(prs); struct ggml_tensor * t27 = expand(gb, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); + use_buf(prs); struct ggml_tensor * t28 = expand(gb, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); + use_buf(tmp); struct ggml_tensor * t29 = expand(gb, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); + if (t30L[i] == NULL) { + use_buf(prs); struct ggml_tensor * t30 = expand(gb, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); + t30L[i] = t30; + cur = t30; + } + t02L[i] = t02; + t03L[i] = t03; + t04L[i] = t04; + t05L[i] = t05; + t06L[i] = t06; + t07L[i] = t07; + t08L[i] = t08; + t09L[i] = t09; + t10L[i] = t10; + t11L[i] = t11; + t12L[i] = t12; + t13L[i] = t13; + t14L[i] = t14; + t15L[i] = t15; + t16L[i] = t16; + t17L[i] = t17; + t18L[i] = t18; + t19L[i] = t19; + t20L[i] = t20; + t21L[i] = t21; + t22L[i] = t22; + t23L[i] = t23; + t24L[i] = t24; + t25L[i] = t25; + t26L[i] = t26; + t27L[i] = t27; + t28L[i] = t28; + t29L[i] = t29; + } + --chk_idx; + avail_begin = begin; + avail_end = end; + printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); + } + printf("%s: backward pass il=%d\n", __func__, il); + + struct my_llama_layer & layer = model->layers[il]; + + struct ggml_tensor * t02 = t02L[il]; + struct ggml_tensor * t03 = t03L[il]; + struct ggml_tensor * t04 = t04L[il]; + struct ggml_tensor * t05 = t05L[il]; + struct ggml_tensor * t06 = t06L[il]; + struct ggml_tensor * t07 = t07L[il]; + struct ggml_tensor * t08 = t08L[il]; + struct ggml_tensor * t09 = t09L[il]; + struct ggml_tensor * t10 = t10L[il]; + struct ggml_tensor * t11 = t11L[il]; + struct ggml_tensor * t12 = t12L[il]; + struct ggml_tensor * t13 = t13L[il]; + struct ggml_tensor * t14 = t14L[il]; + struct ggml_tensor * t15 = t15L[il]; + struct ggml_tensor * t16 = t16L[il]; + struct ggml_tensor * t17 = t17L[il]; + struct ggml_tensor * t18 = t18L[il]; + struct ggml_tensor * t19 = t19L[il]; + struct ggml_tensor * t20 = t20L[il]; + struct ggml_tensor * t21 = t21L[il]; + struct ggml_tensor * t22 = t22L[il]; + struct ggml_tensor * t23 = t23L[il]; + struct ggml_tensor * t24 = t24L[il]; + struct ggml_tensor * t25 = t25L[il]; + struct ggml_tensor * t26 = t26L[il]; + struct ggml_tensor * t27 = t27L[il]; + struct ggml_tensor * t28 = t28L[il]; + struct ggml_tensor * t29 = t29L[il]; + struct ggml_tensor * t30 = t30L[il]; + + clr_buf(0); + use_buf(0); + t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + if (grad_layer_inp) { + t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + } + clr_buf(1); + t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); + t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); + t27->grad = 
expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); + t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); + t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); + t24->grad = expand(gb, ggml_add_inplace(ctx0, + ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)), + ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); + t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); + t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); + use_buf(1); + t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); + grad_layer_inp = t21; + use_buf(0); + t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); + t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); + t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); + t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); + t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); + t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); + t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); + t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); + t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); + t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); + t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); + t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); + t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); + t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); + t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); + t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); + t04->grad = expand(gb, ggml_add_inplace(ctx0, + ggml_add_inplace(ctx0, + ggml_out_prod(ctx0, layer.wv, t11->grad), + ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), + ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); + t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, 
N*n_batch); + use_buf(1); + t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); + back_layer_inp = t02; + + use_buf(-1); + layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); + layer.wq->grad = expand(gb, add_or_set(layer.wq->grad, ggml_out_prod(ctx0, t04, t05->grad))); assert_shape_2d(layer.wq->grad, n_embd, n_embd); + layer.wk->grad = expand(gb, add_or_set(layer.wk->grad, ggml_out_prod(ctx0, t04, t08->grad))); assert_shape_2d(layer.wk->grad, n_embd, n_embd); + layer.wv->grad = expand(gb, add_or_set(layer.wv->grad, ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd); + layer.wo->grad = expand(gb, add_or_set(layer.wo->grad, ggml_out_prod(ctx0, t19, t20->grad))); assert_shape_2d(layer.wo->grad, n_embd, n_embd); + layer.ffn_norm->grad = expand(gb, add_or_set(layer.ffn_norm->grad, ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm))); assert_shape_1d(layer.ffn_norm->grad, n_embd); + layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); + layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); + layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); + } + printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); + GGML_ASSERT(chk_idx == -2); + GGML_ASSERT(avail_begin == 0); + clr_buf(0); + use_buf(0); + t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); + use_buf(-1); + model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + + *logits = t35; + + clr_buf(0); + clr_buf(1); + clr_buf(2); + clr_buf(3); + + if (track_max_mem) { + printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); + printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); + printf("%s: max size compute buf2: %zu\n", __func__, buf_maxs[2]); + printf("%s: max size compute buf3: %zu\n", __func__, buf_maxs[3]); + } + + // now that all grads are created, set the graph leafs and grads + graph_set_leafs_grads(gf); + graph_set_leafs_grads(gb); + + return t36; +} + void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); *ptr = value; @@ -2810,6 +3360,7 @@ struct train_params { bool use_adam; bool use_flash; bool use_scratch; + bool use_checkpointing; // only adam int warmup; @@ -2829,6 +3380,8 @@ struct train_params { int mem_compute_gb; int mem_compute0_gb; int mem_compute1_gb; + int mem_compute2_gb; + int mem_compute3_gb; }; struct train_params get_default_train_params() { @@ -2860,6 +3413,7 @@ struct train_params get_default_train_params() { params.use_adam = true; params.use_flash = true; params.use_scratch = true; + params.use_checkpointing = true; // only adam params.warmup = 100; @@ -2878,8 +3432,9 @@ struct train_params get_default_train_params() { 
params.mem_model_gb = 2; params.mem_compute_gb = 24; params.mem_compute0_gb = 8; - params.mem_compute1_gb = 2; - + params.mem_compute1_gb = 1; + params.mem_compute2_gb = 2; + params.mem_compute3_gb = 1; return params; } @@ -2909,14 +3464,16 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off"); fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); - fprintf(stderr, " --no-flash Don't use flash attention.\n"); + fprintf(stderr, " --no-flash Don't use flash attention. Implies no-scratch and no-checkpointing.\n"); fprintf(stderr, " --use-flash Use flash attention (default)\n"); - fprintf(stderr, " --no-scratch Don't use scratch buffers\n"); - fprintf(stderr, " --use-scratch Use scratch buffers (default)\n"); - fprintf(stderr, " --warmup N Number of warmup steps (default %d)\n", params->warmup); - fprintf(stderr, " --cos-decay-steps N Number of cosine decay steps (default %d)\n", params->cos_decay_steps); - fprintf(stderr, " --cos-decay-restart N Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); - fprintf(stderr, " --cos-decay-alpha N Cosine decay alpha (default %f)\n", params->cos_decay_alpha); + fprintf(stderr, " --no-scratch Don't use scratch buffers. Implies no-checkpointing.\n"); + fprintf(stderr, " --use-scratch Use scratch buffers. Implies use-flash. (default)\n"); + fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); + fprintf(stderr, " --use-checkpointing Use gradient checkpointing. Implies use-scratch and use-flash. (default)\n"); + fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); + fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); + fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); + fprintf(stderr, " --cos-decay-alpha N Only for Adam optimizer. Cosine decay alpha (default %f)\n", params->cos_decay_alpha); fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); @@ -2928,6 +3485,8 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); + fprintf(stderr, " --mem-compute2 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute2_gb); + fprintf(stderr, " --mem-compute3 N Memory to allocate for compute in gigabytes. 
(default %d)\n", params->mem_compute3_gb); fprintf(stderr, "\n"); } @@ -3065,6 +3624,10 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_scratch = false; } else if (arg == "--use-scratch") { params->use_scratch = true; + } else if (arg == "--no-checkpointing") { + params->use_checkpointing = false; + } else if (arg == "--use-checkpointing") { + params->use_checkpointing = true; } else if (arg == "--warmup") { if (++i >= argc) { invalid_param = true; @@ -3155,6 +3718,18 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->mem_compute1_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute2") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute2_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute3") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute3_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -3316,8 +3891,12 @@ int main(int argc, char ** argv) { size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); + size_t size_buf_2 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute2_gb); + size_t size_buf_3 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute3_gb); uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; + uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; + uint8_t * compute_buf_3 = new uint8_t[size_buf_3]; GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector train_samples; @@ -3376,7 +3955,15 @@ int main(int argc, char ** argv) { struct ggml_tensor * loss = NULL; struct ggml_tensor * logits = NULL; - if (params.use_scratch) { + if (params.use_checkpointing) { + loss = forward_batch_wo_cache_flash_attn_train_grad_checkpointing( + &model, ctx0, + gf, gb, + &logits, tokens_input, target_probs, + compute_buf_0, compute_buf_1, compute_buf_2, compute_buf_3, + size_buf_0, size_buf_1, size_buf_2, size_buf_3, + n_tokens, n_batch); + } else if (params.use_scratch) { loss = forward_batch_wo_cache_flash_attn_train( &model, ctx0, gf, gb, From e05e4414ac2a66cdde0efa7799b3a3eac92863db Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 27 Jun 2023 17:43:00 +0200 Subject: [PATCH 006/235] remove unused compute buffer 3 --- .../train-text-from-scratch.cpp | 74 +++++++------------ 1 file changed, 27 insertions(+), 47 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index faa60ec8bf320..075e0307f4f64 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1932,11 +1932,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( void * compute_buf_0, void * compute_buf_1, void * compute_buf_2, - void * compute_buf_3, size_t size_buf_0, size_t size_buf_1, size_t size_buf_2, - size_t size_buf_3, const int n_tokens, const int n_batch) { @@ -1966,16 +1964,14 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( bool track_max_mem = true; int last_buf = -1; - size_t buf_offs[4] = { 0, 0, 0, 0 }; - size_t buf_size[4] = { size_buf_0, + size_t buf_offs[3] = { 0, 0, 0 }; + size_t buf_size[3] = { size_buf_0, size_buf_1, - size_buf_2, - size_buf_3 }; - void * buf_data[4] = { 
compute_buf_0, + size_buf_2 }; + void * buf_data[3] = { compute_buf_0, compute_buf_1, - compute_buf_2, - compute_buf_3 }; - size_t buf_maxs[4] = { 0, 0, 0, 0 }; + compute_buf_2 }; + size_t buf_maxs[3] = { 0, 0, 0 }; auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) { size_t last_offs = 0; @@ -2083,7 +2079,6 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( clr_buf(0); clr_buf(1); clr_buf(2); - clr_buf(3); use_buf(-1); @@ -2112,22 +2107,22 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( // example for 16 layers: // inp ~ implicit zeroth checkpoint == input - // L00 f 4b - // L01 f 4b + // L00 f 4b [ + // L01 f 4b 4th second forward pass // L02 f 4b - // L03 fc4b first checkpoint - // L04 f 3b - // L05 f 3b + // L03 fc4b ] first checkpoint + // L04 f 3b [ + // L05 f 3b 3rd second forward pass // L06 f 3b - // L07 fc3b second checkpoint - // L08 f 2b - // L09 f 2b + // L07 fc3b ] second checkpoint + // L08 f 2b [ + // L09 f 2b 2nd second forward pass // L10 f 2b - // L11 fc2b third checkpoint - // L12 f 1b - // L13 f 1b + // L11 fc2b ] third checkpoint + // L12 f 1b [ + // L13 f 1b 1st second forward pass // L14 f 1b - // L15 f 1b + // L15 f 1b ] // need to remember these for the backward pass std::vector t02L; t02L.resize(n_layer, NULL); @@ -2162,7 +2157,6 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( struct ggml_tensor * cur = t01; - int chk_idx = 0; for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; @@ -2455,13 +2449,11 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( clr_buf(0); clr_buf(1); clr_buf(2); - clr_buf(3); if (track_max_mem) { printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); printf("%s: max size compute buf2: %zu\n", __func__, buf_maxs[2]); - printf("%s: max size compute buf3: %zu\n", __func__, buf_maxs[3]); } // now that all grads are created, set the graph leafs and grads @@ -3434,7 +3426,6 @@ struct train_params get_default_train_params() { params.mem_compute0_gb = 8; params.mem_compute1_gb = 1; params.mem_compute2_gb = 2; - params.mem_compute3_gb = 1; return params; } @@ -3486,7 +3477,6 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); fprintf(stderr, " --mem-compute2 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute2_gb); - fprintf(stderr, " --mem-compute3 N Memory to allocate for compute in gigabytes. 
(default %d)\n", params->mem_compute3_gb); fprintf(stderr, "\n"); } @@ -3724,12 +3714,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->mem_compute2_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute3") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute3_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -3892,11 +3876,9 @@ int main(int argc, char ** argv) { size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); size_t size_buf_2 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute2_gb); - size_t size_buf_3 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute3_gb); uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; - uint8_t * compute_buf_3 = new uint8_t[size_buf_3]; GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector train_samples; @@ -3924,9 +3906,9 @@ int main(int argc, char ** argv) { } struct ggml_init_params cparams = { - /*.mem_size =*/ compute_size, - /*.mem_buffer =*/ compute_addr, - /*.no_alloc =*/ false, + compute_size, // mem_size + compute_addr, // mem_buffer + false, // no_alloc }; struct ggml_context * ctx0 = ggml_init(cparams); @@ -3960,8 +3942,8 @@ int main(int argc, char ** argv) { &model, ctx0, gf, gb, &logits, tokens_input, target_probs, - compute_buf_0, compute_buf_1, compute_buf_2, compute_buf_3, - size_buf_0, size_buf_1, size_buf_2, size_buf_3, + compute_buf_0, compute_buf_1, compute_buf_2, + size_buf_0, size_buf_1, size_buf_2, n_tokens, n_batch); } else if (params.use_scratch) { loss = forward_batch_wo_cache_flash_attn_train( @@ -4082,9 +4064,9 @@ int main(int argc, char ** argv) { printf("---\n"); for (int i=0; i Date: Fri, 28 Jul 2023 23:08:11 +0200 Subject: [PATCH 007/235] add and use function ggml_build_backward_expand to avoid stack overflows with large maximum number of nodes GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); --- .../train-text-from-scratch.cpp | 16 ++++++++++------ ggml.c | 10 ++++++---- ggml.h | 3 ++- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 075e0307f4f64..61def445ecdc6 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3957,12 +3957,14 @@ int main(int argc, char ** argv) { logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch); loss = cross_entropy_loss(ctx0, logits, target_probs); ggml_build_forward_expand(gf, loss); - *gb = ggml_build_backward(ctx0, gf, true); + *gb = *gf; + ggml_build_backward_expand(ctx0, gf, gb, true); } else { logits = forward_batch_wo_cache(&model, ctx0, gf, tokens_input, n_tokens, n_batch); loss = cross_entropy_loss(ctx0, logits, target_probs); ggml_build_forward_expand(gf, loss); - *gb = ggml_build_backward(ctx0, gf, true); + *gb = *gf; + ggml_build_backward_expand(ctx0, gf, gb, true); } ggml_graph_compute_helper(work_buffer, gf, params.n_threads); @@ -4070,13 +4072,15 @@ int main(int argc, char ** argv) { }; struct ggml_context * ctx0 = ggml_init(cparams); - ggml_cgraph gf = {}; + 
struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + memset(gfbuf->data, 0, ggml_nbytes(gfbuf)); + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; int n_past = 0; - struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); + struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); - ggml_build_forward_expand(&gf, logits); - ggml_graph_compute_helper(work_buffer, &gf, params.n_threads); + ggml_build_forward_expand(gf, logits); + ggml_graph_compute_helper(work_buffer, gf, params.n_threads); //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); diff --git a/ggml.c b/ggml.c index 19a194beb2542..92717f0aac7af 100644 --- a/ggml.c +++ b/ggml.c @@ -15787,9 +15787,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { return result; } -struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { - struct ggml_cgraph result = *gf; - +void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) { GGML_ASSERT(gf->n_nodes > 0); // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph @@ -15818,10 +15816,14 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg if (node->is_param) { GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); - ggml_build_forward_expand(&result, node->grad); + ggml_build_forward_expand(gb, node->grad); } } +} +struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { + struct ggml_cgraph result = *gf; + ggml_build_backward_expand(ctx, gf, &result, keep); return result; } diff --git a/ggml.h b/ggml.h index 460976468a056..8f51f5d222099 100644 --- a/ggml.h +++ b/ggml.h @@ -1403,7 +1403,8 @@ extern "C" { struct ggml_tensor * tensor); - GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); From a80f184e6d386b4a6d74902ddae61bf4740fd9a1 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 29 Jun 2023 21:31:25 +0200 Subject: [PATCH 008/235] change AdamW decay parameter to work like the torch AdamW decay parameter It is now relative to Adam learning rate `alpha*sched`. Before that it was relative to `sched` only. 
`alpha` is the maximum learning rate and `sched` is a scaling parameter in [0..1].
---
 ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 92717f0aac7af..451c765f97ad4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -17351,8 +17351,8 @@ static enum ggml_opt_result ggml_opt_adam(
     // constants
     const float sched = params.adam.sched;
-    const float decay = params.adam.decay * sched;
     const float alpha = params.adam.alpha * sched;
+    const float decay = params.adam.decay * alpha;
     const float beta1 = params.adam.beta1;
     const float beta2 = params.adam.beta2;
     const float eps = params.adam.eps;

From f175ead6efc451bf60fd543ede756383b32b75b1 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Thu, 29 Jun 2023 21:33:39 +0200
Subject: [PATCH 009/235] change default AdamW weight decay parameter used in training to 0.1 as used in nanoGPT

---
 examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 61def445ecdc6..9ee255f4e05c2 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3416,7 +3416,7 @@ struct train_params get_default_train_params() {
     params.lbfgs_n_iter = 16;
     params.adam_n_iter = 16;
     params.adam_alpha = 1e-3f;
-    params.adam_decay = 1e-3f;
+    params.adam_decay = 1e-1f;
     params.adam_beta1 = 0.9f;
     params.adam_beta2 = 0.999f;
     params.adam_gclip = 1.0f;

From 97964a4cc964b099748ef7ca595b59606458c80f Mon Sep 17 00:00:00 2001
From: xaedes
Date: Thu, 29 Jun 2023 21:36:28 +0200
Subject: [PATCH 010/235] change default AdamW weight decay parameter defined in ggml to 0.0, making plain Adam the default instead of AdamW

btw: the default weight decay parameter for torch.optim.AdamW is 0.01

---
 ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 451c765f97ad4..229ddb2de6dd4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -17916,7 +17916,7 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
             .adam = {
                 .n_iter = 10000,
                 .sched = 1.000f,
-                .decay = 0.001f,
+                .decay = 0.0f,
                 .alpha = 0.001f,
                 .beta1 = 0.9f,
                 .beta2 = 0.999f,

From 2c6985f79e70c175767ef76925bac99fb0107c18 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 2 Jul 2023 20:55:54 +0200
Subject: [PATCH 011/235] bug fixes for cross entropy loss

ggml_cross_entropy_loss: sums were not correctly added in the workload of each thread

ggml_cross_entropy_loss_back: simplify backward process, reducing numerical issues

guard usage of the exp f16 lookup in cross entropy by #define GGML_CROSS_ENTROPY_EXP_FP16

cross entropy loss is only used once during training, but it is quite sensitive to numerical errors introduced by the exp-f16-lookup. so the exp-f16-lookup for cross entropy loss is disabled by default, trading better gradients for very slightly worse runtime performance.
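for reference, the simplified backward pass relies on the standard result that for L = -sum_i t_i*log(softmax(x)_i) with sum_i t_i == 1 the gradient is dL/dx_i = softmax(x)_i - t_i. below is a minimal standalone sketch of that identity in plain C; it is illustrative only, the function and variable names are made up here, and the actual kernel in the diff below additionally applies the eps smoothing:

```
// reference: grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1))
// minimal sketch of the identity, not the ggml kernel
#include <math.h>

void cross_entropy_loss_back_ref(int nc, const float * s0, const float * s1, float d, float * ds0) {
    // numerically stable softmax of the logits s0
    float max = -INFINITY;
    for (int i = 0; i < nc; i++) { if (s0[i] > max) { max = s0[i]; } }
    float sum = 0.0f;
    for (int i = 0; i < nc; i++) { ds0[i] = expf(s0[i] - max); sum += ds0[i]; }
    for (int i = 0; i < nc; i++) {
        // softmax minus target probabilities, scaled by the upstream gradient d
        ds0[i] = (ds0[i]/sum - s1[i]) * d;
    }
}
```

writing the gradient directly in this form is what allows dropping the old softmax_back-based formulation and its per-thread work buffer.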
--- ggml.c | 103 +++++++++++++++++---------------------------------------- 1 file changed, 30 insertions(+), 73 deletions(-) diff --git a/ggml.c b/ggml.c index 229ddb2de6dd4..d718de33b044f 100644 --- a/ggml.c +++ b/ggml.c @@ -123,6 +123,7 @@ typedef void * thread_ret_t; #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 #define GGML_SILU_FP16 +// #define GGML_CROSS_ENTROPY_EXP_FP16 #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 @@ -11486,6 +11487,7 @@ static void ggml_compute_forward_soft_max_back_f32( // dx = J * dy // dxk = sum_i(Jki * dyi) // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk // dxk = sum_i(-yk*yi * dyi) + yk*dyk // dxk = -yk * sum_i(yi * dyi) + yk*dyk // dxk = -yk * dot(y, dy) + yk*dyk @@ -13109,6 +13111,7 @@ static void ggml_compute_forward_flash_attn_f32( if (SS[j] == -INFINITY) { SS[j] = 0.0f; } else { + // const float val = expf(SS[j] - max); ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); @@ -13700,6 +13703,7 @@ static void ggml_compute_forward_flash_attn_back_f32( if (SR[j] == -INFINITY) { SW[j] = 0.0f; } else { + // const float val = expf(SR[j] - max); ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); @@ -14317,6 +14321,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32( const int nc = src0->ne[0]; const int nr = ggml_nrows(src0); + GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); + if (params->type == GGML_TASK_INIT) { if (ith == 0) { memset(sums, 0, sizeof(float) * (nth + nth * nc)); @@ -14345,7 +14351,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( for (int i1 = ir0; i1 < ir1; i1++) { float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); - float * st = (float *) params->wdata + nth + ith*nc; + float * st = ((float *) params->wdata) + nth + ith*nc; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -14365,10 +14371,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32( if (s0[i] == -INFINITY) { st[i] = 0.0f; } else { - // const float val = (s0[i] == -INFINITY) ? 
0.0 : exp(s0[i] - max); +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif sum += (ggml_float)val; st[i] = val; } @@ -14384,7 +14394,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32( ggml_vec_log_f32(nc, st, st); ggml_vec_mul_f32(nc, st, st, s1); - ggml_vec_sum_f32(nc, sums + ith, st); + float st_sum = 0; + ggml_vec_sum_f32(nc, &st_sum, st); + sums[ith] += st_sum; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -14434,7 +14446,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( return; } - const float eps = 1e-9f; + const double eps = 1e-9f; // TODO: handle transposed/permuted matrices const int64_t nc = src0->ne[0]; @@ -14453,7 +14465,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); - float * sm = (float *) params->wdata + ith*nc; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -14462,54 +14473,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( assert(!isnan(s1[i])); } #endif - // step by step explanation: - { - //float * sums = (float *) params->wdata; - - // forward pass with annotated gradients from backward pass - // (built by going in reverse operation order, adding to gradients of current operation args) - // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum - // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) - // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps) - // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3] - // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3 - // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1 - // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]] - // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel] - - // substitute into grad[st1], because we can reuse softmax_back from this point on - // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps)) - // postorder: - // grad[st1] := softmax(s0) - // grad[st1] := grad[st1]*(1.0 - eps) - // grad[st1] := grad[st1] + eps - // grad[st1] := s1 / grad[st1] - // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel] - - // src0 gradients by going through softmax_back - // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) - // from softmax_back: - // dxk = yk * (dyk - dot(y, dy)) - // dot_y_dy := dot(y, dy) - // dx := dy - // dx := dx - dot_y_dy - // dx := dx * y - // postorder: - // dot_st1_dst1 := dot(st1, grad[st1]) - // grad[s0] := grad[st1] - // grad[s0] := grad[s0] - dot_st1_dst1 - // grad[s0] := grad[s0] * st1 - - // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1] - // sm := softmax(s0) - // grad[s0] := sm*(1.0 - eps) - // grad[s0] := grad[s0] + eps - // grad[s0] := s1 / grad[s0] - // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel] - // dot_st1_dst1 := dot(sm, grad[s0]) - // grad[s0] := grad[s0] - dot_st1_dst1 - // grad[s0] := grad[s0] * sm - } // soft_max ggml_float sum = 0.0; @@ -14520,36 +14483,34 @@ static void 
ggml_compute_forward_cross_entropy_loss_back_f32( uint16_t scvt; for (int i = 0; i < nc; i++) { if (s0[i] == -INFINITY) { - sm[i] = 0.0f; + ds0[i] = 0.0f; } else { - // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max); +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif sum += (ggml_float)val; - sm[i] = val; + ds0[i] = val; } } assert(sum > 0.0); - sum = 1.0/sum; + sum = (1.0 - eps)/sum; } - float dot_st1_dst1 = 0; - ggml_vec_scale_f32(nc, sm, sum); - ggml_vec_cpy_f32 (nc, ds0, sm); - ggml_vec_scale_f32(nc, ds0, (1.0f - eps)); - ggml_vec_add1_f32 (nc, ds0, ds0, eps); - ggml_vec_div_f32 (nc, ds0, s1, ds0); - ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]); - ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0); - ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1); - ggml_vec_mul_f32 (nc, ds0, ds0, sm); + // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr + ggml_vec_scale_f32(nc, ds0, sum); + ggml_vec_add1_f32(nc, ds0, ds0, eps); + ggml_vec_sub_f32(nc, ds0, ds0, s1); + ggml_vec_scale_f32(nc, ds0, d[0]); + #ifndef NDEBUG for (int i = 0; i < nc; ++i) { - assert(!isnan(sm[i])); - assert(!isinf(sm[i])); assert(!isnan(ds0[i])); assert(!isinf(ds0[i])); } @@ -16445,10 +16406,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks; - - work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { From 2d1e6e06753a84b44a323995b68c52eabec1ba7a Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 20:57:58 +0200 Subject: [PATCH 012/235] fix test-grad0 for cross_entropy_loss the second argument to cross_entropy_loss must sum up to 1 for each row --- tests/test-grad0.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 6d312216d58af..dc19c1ad273a4 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1358,15 +1358,26 @@ int main(int argc, const char ** argv) { int64_t ne2[4]; get_random_dims(ne2, 4); - for (int ndims = 1; ndims <= 3; ++ndims) { - x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); + for (int ndims = 1; ndims <= 4; ++ndims) { + x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f); x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f); + // the second argument to cross_entropy_loss must sum up to 1 for each row + int nr = ggml_nrows(x[1]); + int nc = ggml_nelements(x[1]) / nr; + for (int ir = 0; ir < nr; ++ir) { + float sum = 0; + for (int ic = 0; ic < nc; ++ic) { + sum += ((float *) x[1]->data)[ic + ir*nc]; + } + for (int ic = 0; ic < nc; ++ic) { + ((float *) x[1]->data)[ic + ir*nc] /= sum; + } + } ggml_set_param(ctx0, x[0]); - struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1])); + struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]); - check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY); - // finite differences regularly fails! 
+ check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-1f, INFINITY); } } From 864e7e3aa1b08c4b2cd8cc2f17e0722fd019ffca Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 20:58:52 +0200 Subject: [PATCH 013/235] fix test-grad0 for soft_max dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0) --- tests/test-grad0.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index dc19c1ad273a4..edc7e2834c7f8 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1345,9 +1345,18 @@ int main(int argc, const char ** argv) { x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); ggml_set_param(ctx0, x[0]); - struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0])); - - check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + float eps = 1e-6f; + // dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work + // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0) + struct ggml_tensor * f = ggml_sum(ctx0, + ggml_log(ctx0, + ggml_add1(ctx0, + ggml_scale(ctx0, + ggml_soft_max(ctx0, x[0]), + ggml_new_f32(ctx0, 1.0f - eps)), + ggml_new_f32(ctx0, eps)))); + + check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY); } } From 87febeec91b9da387bf668dc10f83915d4bd19de Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 20:59:36 +0200 Subject: [PATCH 014/235] improve finite differences of test-grad0 by using double instead of float --- tests/test-grad0.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index edc7e2834c7f8..fe2ca212f82a0 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -275,14 +275,14 @@ bool check_gradient( ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - const float f0 = ggml_get_f32_1d(f, 0); + const double f0 = ggml_get_f32_1d(f, 0); ggml_set_f32_1d(x[i], k, xm); ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - const float f1 = ggml_get_f32_1d(f, 0); - const float g0 = (f0 - f1)/(2.0f*eps); + const double f1 = ggml_get_f32_1d(f, 0); + const double g0 = (f0 - f1)/(2.0*(double) eps); ggml_set_f32_1d(x[i], k, x0); @@ -292,10 +292,10 @@ bool check_gradient( ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); - const float g1 = ggml_get_f32_1d(x[i]->grad, k); + const double g1 = ggml_get_f32_1d(x[i]->grad, k); - const float error_abs = fabsf(g0 - g1); - const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0; + const double error_abs = fabs(g0 - g1); + const double error_rel = g0 != 0 ? 
fabs(g0 - g1)/fabs(g0) : 0;

                if (error_abs > max_error_abs || error_rel > max_error_rel) {
                    printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",

From 51dc77092fa0aaaf832dbfda46058a413521b8a9 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 2 Jul 2023 21:05:12 +0200
Subject: [PATCH 015/235] change cross_entropy_loss to output average over all rows

this helps keep the loss and gradients in a sane range

---
 ggml.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index d718de33b044f..07d100bf070d3 100644
--- a/ggml.c
+++ b/ggml.c
@@ -14334,7 +14334,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
        if (ith == 0) {
            float * dp = (float *) dst->data;
            ggml_vec_sum_f32(nth, dp, sums);
-            dp[0] *= -1.0f;
+            dp[0] *= -1.0f / (float) nr;
        }
        return;
    }
@@ -14506,7 +14506,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
    ggml_vec_scale_f32(nc, ds0, sum);
    ggml_vec_add1_f32(nc, ds0, ds0, eps);
    ggml_vec_sub_f32(nc, ds0, ds0, s1);
-    ggml_vec_scale_f32(nc, ds0, d[0]);
+    ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);

 #ifndef NDEBUG

From 3744a9be74b27c758b06ea2bdf8ee97046e2b196 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 2 Jul 2023 21:11:11 +0200
Subject: [PATCH 016/235] improve gradient checkpointing

sqrt(n_layers) is only the best checkpoint step when the mem size of checkpoints and the mem size of layers are equal. since layers require more memory than the single-tensor checkpoint we use, the optimal values are computed differently:

```
given: n, u, v
objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
b=n/a
minimize(a*u+v*n/a)
diff(a*u+v*n/a, a) = u - (v*n/a)/a
diff(a*u+v*n/a, a) == 0
u - (v*n/a)/a == 0
u == v*n/(a*a)
u*a*a = v*n
a*a = v*n/u
a = sqrt(n*v/u)
```

this change results in more checkpoints, requiring fewer layers to store between checkpoints, overall improving memory usage.
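for clarity, this is how the symbols appear to map onto the code below: n is the number of layers, a the number of layers between checkpoints, b the number of checkpoints, v the memory cost of one checkpoint tensor and u the memory cost of one layer recomputed during the second forward pass (all per N*n_batch). a small numeric sanity check of a = sqrt(n*v/u), using made-up model sizes:

```
// sanity check of a = sqrt(n*v/u); the hparams here are hypothetical example values
#include <math.h>
#include <stdio.h>

int main(void) {
    const int   n_layer = 16;
    const float n_embd  = 256.0f;
    const float n_ff    = 768.0f;
    const float v = n_embd;                   // memcost_checkpoint   (times N*n_batch)
    const float u = 14.0f*n_embd + 4.0f*n_ff; // memcost_snd_fwd_pass (times N*n_batch)
    int a = (int)(sqrtf(n_layer*v/u) + 0.5f); // layers between checkpoints
    if (a < 1) { a = 1; }
    printf("checkpoint every %d layer(s)\n", a); // prints 1 for these sizes
    return 0;
}
```

because u is much larger than v, a is clamped to 1 here, i.e. a checkpoint after every layer instead of the old sqrt(16) = 4 spacing, which is exactly the "more checkpoints" behaviour described above.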
---
 .../train-text-from-scratch.cpp | 39 +++++++++++++------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 9ee255f4e05c2..ae3f79c63bb95 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -2090,22 +2090,39 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing(
    struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch);
+    {
+        // given: n, u, v
+        // objective: minimize(a*u+b*v) where a*b=n, a>0, b>0
+        // b=n/a
+        // minimize(a*u+v*n/a)
+        // diff(a*u+v*n/a, a) = u - (v*n/a)/a
+        // diff(a*u+v*n/a, a) == 0
+        // u - (v*n/a)/a == 0
+        // u == v*n/(a*a)
+        // u*a*a = v*n
+        // a*a = v*n/u
+        // a = sqrt(n*v/u)
+    }
+
+    float memcost_checkpoint = n_embd; // (..)*N*n_batch
+    float memcost_snd_fwd_pass = 14*n_embd+4*n_ff; // (..)*N*n_batch
+
+    int n_checkstep = (int)(sqrtf(n_layer*memcost_checkpoint/memcost_snd_fwd_pass) + 0.5f);
+    if (n_checkstep < 1) {
+        n_checkstep = 1;
+    }
    std::vector<int> checkpoints;
-    // for (int il = 0; il < n_layer; ++il) {
-    //     checkpoints.push_back(il);
-    // }
-    // n_check: number of layers between checkpoints
-    int n_check = (int)(sqrtf(n_layer) + 0.5f);
-    printf("%s: n_check = %d\n", __func__, n_check);
-    for (int chk = n_check-1; chk+1 < n_layer; chk += n_check) {
+    for (int chk = n_checkstep-1; chk+1 < n_layer; chk += n_checkstep) {
        checkpoints.push_back(chk);
    }
+    int n_check = checkpoints.size();
+    // printf("%s: n_check = %d n_checkstep = %d\n", __func__, n_check, n_checkstep);
-    for (int i = 0; i < checkpoints.size(); ++i) {
-        printf("%s: checkpoint #%d = %d\n", __func__, i, checkpoints[i]);
-    }
+    // for (int i = 0; i < n_check; ++i) {
+    //     printf("%s: checkpoint #%d = %d\n", __func__, i, checkpoints[i]);
+    // }
-    // example for 16 layers:
+    // example for 16 layers and memcost_checkpoint=memcost_snd_fwd_pass:
    // inp ~ implicit zeroth checkpoint == input
    // L00 f 4b [

From fc379a2de36d45f5fcc12410dcc4eb468e294f8e Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 2 Jul 2023 21:12:25 +0200
Subject: [PATCH 017/235] disable gradient checkpointing debug output

---
 .../train-text-from-scratch.cpp | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index ae3f79c63bb95..08821d4129ef1 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -2179,9 +2179,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing(
        struct my_llama_layer & layer = model->layers[il];
        // tensors with values necessary for backward pass are in persistent buf(-1)
        // other tensors with buf(0), buf(1), etc are only temporary needed, and their memory reused
-        bool is_checkpoint = (chk_idx < checkpoints.size() && il == checkpoints[chk_idx]);
+        bool is_checkpoint = (chk_idx < n_check && il == checkpoints[chk_idx]);
        if (is_checkpoint) {
-            printf("%s: layer %d is_checkpoint\n", __func__, il);
+            // printf("%s: layer %d is_checkpoint\n", __func__, il);
            chk_idx += 1;
        }
        const int prs = 0; // in first forward pass even persistent tensors are only temporary
@@ -2263,11 +2263,11 @@ struct 
ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( struct ggml_tensor * back_layer_inp = t31; struct ggml_tensor * grad_layer_inp = NULL; - printf("%s: checkpoints.size() = %zu\n", __func__, checkpoints.size()); - chk_idx = checkpoints.size()-1; + // printf("%s: n_check = %u\n", __func__, n_check); + chk_idx = n_check-1; int avail_begin = n_layer; int avail_end = n_layer; - printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); + // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); for (int k = 0; k < n_layer; ++k) { // second forward pass for checkpointing int il = n_layer-1-k; @@ -2278,14 +2278,14 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( int begin = (chk_idx == -1) ? 0 : checkpoints[chk_idx] + 1; // checkpoint[chk_idx] contains t30 for computing following layers -> +1 - int end = (chk_idx+1 < checkpoints.size()) + int end = (chk_idx+1 < n_check) ? (checkpoints[chk_idx+1] + 1) : n_layer; GGML_ASSERT(begin <= il); GGML_ASSERT(il < end); cur = (chk_idx == -1) ? t01 : t30L[checkpoints[chk_idx]]; clr_buf(2); - printf("%s: second forward pass chk_idx=%d begin=%d end=%d\n", __func__, chk_idx, begin, end); + // printf("%s: second forward pass chk_idx=%d begin=%d end=%d\n", __func__, chk_idx, begin, end); for (int i = begin; i < end; ++i) { struct my_llama_layer & layer = model->layers[i]; const int prs = 2; // persistent until next checkpoint @@ -2357,9 +2357,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( --chk_idx; avail_begin = begin; avail_end = end; - printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); + // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); } - printf("%s: backward pass il=%d\n", __func__, il); + // printf("%s: backward pass il=%d\n", __func__, il); struct my_llama_layer & layer = model->layers[il]; @@ -2452,7 +2452,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); } - printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); + // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); GGML_ASSERT(chk_idx == -2); GGML_ASSERT(avail_begin == 0); clr_buf(0); From d0fbb7d328d16f11a7ee229d08db81a46ca92bf0 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 28 Jul 2023 23:05:02 +0200 Subject: [PATCH 018/235] llama : fix rope usage in train-text-from-scratch after ChatGLM change --- .../train-text-from-scratch.cpp | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 08821d4129ef1..b597fc82979d3 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -456,8 +456,8 @@ struct ggml_tensor * forward( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, 1] // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = 
ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); // store key and value to memory { @@ -713,8 +713,8 @@ struct ggml_tensor * forward_batch( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, n_batch] // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); @@ -957,7 +957,7 @@ struct ggml_tensor * forward_batch_wo_cache( const int N = n_tokens; const auto & hparams = model->hparams; - //const int n_ctx = hparams.n_ctx; + const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -998,8 +998,8 @@ struct ggml_tensor * forward_batch_wo_cache( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, n_batch] // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); @@ -1185,7 +1185,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( const int N = n_tokens; const auto & hparams = model->hparams; - //const int n_ctx = hparams.n_ctx; + const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -1220,8 +1220,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( // compute Q and K and RoPE them // wq shape [n_embd, 
n_embd, 1, 1] // wk shape [n_embd, n_embd, 1, 1] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); @@ -1613,10 +1613,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); @@ -1952,7 +1952,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( gf->perf_time_us = 0; const auto & hparams = model->hparams; - //const int n_ctx = hparams.n_ctx; + const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -2196,10 +2196,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( use_buf(prs); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); use_buf(prs); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); 
use_buf(prs); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); use_buf(prs); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); use_buf(prs); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); use_buf(prs); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); use_buf(prs); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); use_buf(prs); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); @@ -2297,10 +2297,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( use_buf(prs); struct ggml_tensor * t04 = expand(gb, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); use_buf(prs); struct ggml_tensor * t05 = expand(gb, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); use_buf(prs); struct ggml_tensor * t06 = expand(gb, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t07 = expand(gb, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t07 = expand(gb, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); use_buf(prs); struct ggml_tensor * t08 = expand(gb, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); use_buf(prs); struct ggml_tensor * t09 = expand(gb, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t10 = expand(gb, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(prs); struct ggml_tensor * t10 = expand(gb, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); use_buf(prs); struct ggml_tensor * t11 = expand(gb, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); use_buf(prs); struct ggml_tensor * t12 = expand(gb, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); use_buf(prs); struct ggml_tensor * t13 = expand(gb, ggml_permute (ctx0, t07, 0, 2, 1, 3)); 
assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); @@ -2426,10 +2426,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); - t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); + t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); - t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); + t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); t04->grad = expand(gb, ggml_add_inplace(ctx0, ggml_add_inplace(ctx0, From c6a18e15c1f255f06ce03fee200f7e7c710989e8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 21:33:47 +0200 Subject: [PATCH 019/235] add more training parameters: --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. 
--adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha --- .../train-text-from-scratch.cpp | 110 ++++++++++++++---- 1 file changed, 89 insertions(+), 21 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index b597fc82979d3..f6e146b8091ae 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3333,10 +3333,12 @@ float cosine_decay(const int decay_steps, const float alpha, int step) { return decay; } -float cosine_decay_restart(int decay_steps, const float alpha, int step, float restart_step_mult) { - while (step > decay_steps) { - step -= decay_steps; - decay_steps = (int) restart_step_mult * decay_steps; +float cosine_decay_restart(int decay_steps, const float alpha, int step, float restart_step_mult, bool enable_restart) { + if (enable_restart) { + while (step > decay_steps) { + step -= decay_steps; + decay_steps = (int) restart_step_mult * decay_steps; + } } return cosine_decay(decay_steps, alpha, step); } @@ -3376,14 +3378,21 @@ struct train_params { int cos_decay_steps; float cos_decay_restart; float cos_decay_alpha; + bool enable_restart; + + int opt_past; + float opt_delta; + int opt_max_no_improvement; int lbfgs_n_iter; int adam_n_iter; float adam_alpha; + float adam_min_alpha; float adam_decay; float adam_beta1; float adam_beta2; float adam_gclip; + float adam_eps_f; int mem_model_gb; int mem_compute_gb; @@ -3424,19 +3433,26 @@ struct train_params get_default_train_params() { params.use_scratch = true; params.use_checkpointing = true; + params.opt_past = 0; + params.opt_delta = 1e-5f; + params.opt_max_no_improvement = 0; + // only adam params.warmup = 100; params.cos_decay_steps = 1000; params.cos_decay_restart = 1.1f; params.cos_decay_alpha = 0.0f; + params.enable_restart = false; params.lbfgs_n_iter = 16; params.adam_n_iter = 16; params.adam_alpha = 1e-3f; + params.adam_min_alpha = 1e-4f; params.adam_decay = 1e-1f; params.adam_beta1 = 0.9f; params.adam_beta2 = 0.999f; params.adam_gclip = 1.0f; + params.adam_eps_f = 0.0f; params.mem_model_gb = 2; params.mem_compute_gb = 24; @@ -3482,13 +3498,20 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); fprintf(stderr, " --cos-decay-alpha N Only for Adam optimizer. Cosine decay alpha (default %f)\n", params->cos_decay_alpha); - fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); + fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past); + fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. 
(default %f)\n", params->opt_delta); + fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement); + fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); + fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha (default %f)\n", params->adam_min_alpha); fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); + fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); @@ -3659,12 +3682,34 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->cos_decay_alpha = std::stof(argv[i]); - } else if (arg == "--lbfgs-iter") { + } else if (arg == "--enable-restart") { + params->enable_restart = true; + } else if (arg == "--disable-restart") { + params->enable_restart = false; + } else if (arg == "--opt-past") { if (++i >= argc) { invalid_param = true; break; } - params->lbfgs_n_iter = std::stoi(argv[i]); + params->opt_past = std::stoi(argv[i]); + } else if (arg == "--opt-delta") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->opt_delta = std::stof(argv[i]); + } else if (arg == "--opt-max-no-improvement") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->opt_max_no_improvement = std::stoi(argv[i]); + } else if (arg == "--adam-epsf") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_eps_f = std::stof(argv[i]); } else if (arg == "--adam-iter") { if (++i >= argc) { invalid_param = true; @@ -3677,6 +3722,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->adam_alpha = std::stof(argv[i]); + } else if (arg == "--adam-min-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_min_alpha = std::stof(argv[i]); } else if (arg == "--adam-decay") { if (++i >= argc) { invalid_param = true; @@ -3701,6 +3752,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->adam_gclip = std::stof(argv[i]); + } else if (arg == "--lbfgs-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lbfgs_n_iter = std::stoi(argv[i]); } else if (arg == "--mem-model") { if (++i >= argc) { 
invalid_param = true;
@@ -3846,21 +3903,28 @@ int main(int argc, char ** argv) {
 struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
 struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
- opt_params_adam.print_forward_graph = false;
+ opt_params_adam.print_forward_graph = false;
 opt_params_adam.print_backward_graph = false;
- opt_params_adam.n_threads = params.n_threads;
- opt_params_adam.adam.n_iter = params.adam_n_iter;
- opt_params_adam.adam.sched = 1.0f;
- opt_params_adam.adam.alpha = params.adam_alpha;
- opt_params_adam.adam.decay = params.adam_decay;
- opt_params_adam.adam.beta1 = params.adam_beta1;
- opt_params_adam.adam.beta2 = params.adam_beta2;
- opt_params_adam.adam.gclip = params.adam_gclip;
-
- opt_params_lbfgs.print_forward_graph = false;
+ opt_params_adam.n_threads = params.n_threads;
+ opt_params_adam.past = params.opt_past;
+ opt_params_adam.delta = params.opt_delta;
+ opt_params_adam.max_no_improvement = params.opt_max_no_improvement;
+ opt_params_adam.adam.n_iter = params.adam_n_iter;
+ opt_params_adam.adam.sched = 1.0f;
+ opt_params_adam.adam.alpha = params.adam_alpha;
+ opt_params_adam.adam.decay = params.adam_decay;
+ opt_params_adam.adam.beta1 = params.adam_beta1;
+ opt_params_adam.adam.beta2 = params.adam_beta2;
+ opt_params_adam.adam.gclip = params.adam_gclip;
+ opt_params_adam.adam.eps_f = params.adam_eps_f;
+
+ opt_params_lbfgs.print_forward_graph = false;
 opt_params_lbfgs.print_backward_graph = false;
- opt_params_lbfgs.n_threads = params.n_threads;
- opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter;
+ opt_params_lbfgs.n_threads = params.n_threads;
+ opt_params_lbfgs.past = params.opt_past;
+ opt_params_lbfgs.delta = params.opt_delta;
+ opt_params_lbfgs.max_no_improvement = params.opt_max_no_improvement;
+ opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter;

 opt->ctx = model.ctx;
 opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs;
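The three opt_* fields above feed ggml's generic convergence machinery rather than anything Adam-specific. As a rough sketch of the delta test the optimizer applies (pf is the ring buffer of the last `past` loss values that ggml_opt_init allocates when opt_past > 0; this is an illustration, not part of the patch):

    // stop when the relative improvement over the tracked window drops below delta
    if (pf != NULL) {
        const float rate = (pf[opt->iter % params.past] - fx) / fx;
        if (fabsf(rate) < params.delta) {
            return GGML_OPT_OK; // converged
        }
        pf[opt->iter % params.past] = fx; // remember the current loss
    }

opt_max_no_improvement aborts in a similar way after that many consecutive iterations without a new best loss; the zero defaults leave both tests disabled.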
@@ -3996,7 +4060,11 @@ int main(int argc, char ** argv) {
 params.cos_decay_steps,
 params.cos_decay_alpha,
 opt->iter - params.warmup,
- params.cos_decay_restart);
+ params.cos_decay_restart,
+ params.enable_restart);
+
+ float min_sched = params.adam_min_alpha / params.adam_alpha;
+ opt->params.adam.sched = min_sched + opt->params.adam.sched * (1.0f - min_sched);

 printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched);

From ce937bc431f7ac88f5e5b0bab2475bcc673369ca Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 2 Jul 2023 21:36:56 +0200
Subject: [PATCH 020/235] replace memcpy with reshape operation so that the graph is not cut at the input. This makes it possible to store other values into the input tensor and then simply recompute the graph without rebuilding it

--- .../train-text-from-scratch.cpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index f6e146b8091ae..db7a528426f30 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -965,8 +965,8 @@ struct ggml_tensor * forward_batch_wo_cache(
 const int n_rot = hparams.n_rot;
 const int n_ff = get_n_ff(&hparams);
- struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch);
- memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch);
+ GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
+ struct ggml_tensor * tokens = ggml_reshape_1d(ctx0, tokens_input, N*n_batch);
 // inpL shape [n_embd,N*n_batch,1]
 struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
@@ -1168,7 +1168,7 @@ struct ggml_tensor * forward_batch_wo_cache(
 }
 // run the computation
- ggml_build_forward_expand(gf, inpL);
+ // ggml_build_forward_expand(gf, inpL);
 return inpL;
 }
@@ -1193,8 +1193,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
 const int n_rot = hparams.n_rot;
 const int n_ff = get_n_ff(&hparams);
- struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch);
- memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch);
+
+ GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
+ struct ggml_tensor * tokens = ggml_reshape_1d(ctx0, tokens_input, N*n_batch);
 struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
 assert_shape_2d(inpL, n_embd, N*n_batch);
@@ -1336,7 +1337,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
 }
 // run the computation
- ggml_build_forward_expand(gf, inpL);
+ // ggml_build_forward_expand(gf, inpL);
 return inpL;
 }
@@ -1563,8 +1564,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
 use_buf(-1);
- struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch);
- memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch);
+ GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
+ struct ggml_tensor * t00 = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch);
 use_buf(-1);
@@ -2082,8 +2083,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing(
 use_buf(-1);
- struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch);
- memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch);
+ 
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); + struct ggml_tensor * t00 = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch); use_buf(-1); From ff759d957c34ef98c4700a014dd00de2a15d7435 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 21:38:03 +0200 Subject: [PATCH 021/235] remove unused function argument from get_example_targets_batch --- .../train-text-from-scratch.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index db7a528426f30..de71dc99671ed 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2581,7 +2581,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons } } -void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { +void get_example_targets_batch(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(target_logits->n_dims == 3); GGML_ASSERT(target_probs->n_dims == 3); @@ -2596,27 +2596,23 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai ggml_set_f32(target_logits, -1.0f/n_vocab); ggml_set_f32(target_probs, 0.0f); + // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); for (int k=0; kdata; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; - - get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); + get_example_targets_batch(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); GGML_ASSERT(n_past == 0); From e843d6e71cea22eaa9a4288138ef02bc8cc50e7d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 21:38:52 +0200 Subject: [PATCH 022/235] measure and print total training time --- .../train-text-from-scratch/train-text-from-scratch.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index de71dc99671ed..0f330fd4ab7b9 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3975,6 +3975,8 @@ int main(int argc, char ** argv) { printf("%s: begin training\n", __func__); + int64_t t0 = ggml_time_ms(); + for (int ex = 0; ex < params.n_examples; ++ex) { if (ex*n_batch >= (int) train_samples.size()) { shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); @@ -4112,6 +4114,11 @@ int main(int argc, char ** argv) { ggml_free(ctx0); } + int64_t t1 = ggml_time_ms(); + int64_t d = t1-t0; + double dd = (double) d * 1e-3; + printf("%s: total training time=%f seconds\n", __func__, dd); + if (params.n_examples > 0) { save_checkpoint(&model, opt, params.fn_checkpoint_out); } From 
bfc311913991c75cc2d3c2978d9d2273a1370ac6 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 22:15:08 +0200 Subject: [PATCH 023/235] add optimization callback to ggml_opt_resume_g this callback is called before each iteration with custom data and pointer to learning schedule parameter (only used in Adam(W)). can be used for dynamic learning schedule and setting input data for batches before each iteration --- .../train-text-from-scratch.cpp | 14 +--- ggml.c | 71 ++++++++++++++----- ggml.h | 15 ++-- 3 files changed, 69 insertions(+), 31 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 0f330fd4ab7b9..6adbece4cc24c 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -4046,12 +4046,8 @@ int main(int argc, char ** argv) { ggml_build_backward_expand(ctx0, gf, gb, true); } - ggml_graph_compute_helper(work_buffer, gf, params.n_threads); - size_t used_mem_before_opt = ggml_used_mem(ctx0); - float error_before_opt = ggml_get_f32_1d(loss, 0); - opt->params.adam.sched = (opt->iter < params.warmup) ? (float) opt->iter / (float) params.warmup : cosine_decay_restart( @@ -4066,7 +4062,7 @@ int main(int argc, char ** argv) { printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); - ggml_opt_resume_g(ctx0, opt, loss, gf, gb); + ggml_opt_resume_g(ctx0, opt, loss, gf, gb, NULL, NULL); size_t used_mem_after_opt = ggml_used_mem(ctx0); @@ -4074,14 +4070,10 @@ int main(int argc, char ** argv) { model.train_samples += n_batch; model.train_tokens += n_batch * n_tokens; - ggml_graph_compute_helper(work_buffer, gf, params.n_threads); - - float error_after_opt = ggml_get_f32_1d(loss, 0); - if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); - printf("error_before_opt: %.6f\n", error_before_opt); - printf("error_after_opt: %.6f\n", error_after_opt); + printf("error_before_opt: %.6f\n", opt->loss_before); + printf("error_after_opt: %.6f\n", opt->loss_after); printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); } diff --git a/ggml.c b/ggml.c index 07d100bf070d3..e0f91ed5a0d02 100644 --- a/ggml.c +++ b/ggml.c @@ -17281,7 +17281,9 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { GGML_ASSERT(ggml_is_scalar(f)); // these will store the parameters we want to optimize @@ -17307,8 +17309,8 @@ static enum ggml_opt_result ggml_opt_adam( } // constants - const float sched = params.adam.sched; - const float alpha = params.adam.alpha * sched; + float sched = params.adam.sched; + const float alpha = params.adam.alpha; const float decay = params.adam.decay * alpha; const float beta1 = params.adam.beta1; const float beta2 = params.adam.beta2; @@ -17320,6 +17322,10 @@ static enum ggml_opt_result ggml_opt_adam( float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values + if (callback) { + callback(callback_data, &sched); + } + // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); @@ -17332,6 +17338,9 @@ static enum ggml_opt_result ggml_opt_adam( pf[opt->iter % params.past] = opt->adam.fx_prev; } + opt->loss_before = opt->adam.fx_prev; + opt->loss_after = opt->adam.fx_prev; + // initialize if (opt->just_initialized) { opt->adam.n_no_improvement = 0; @@ -17380,11 +17389,12 @@ static enum ggml_opt_result ggml_opt_adam( gnorm = (float) ((ggml_float) gclip / norm); } } - const float beta1h = alpha/(1.0f - powf(beta1, opt->iter)); - const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); + const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); int64_t i = 0; for (int p = 0; p < np; ++p) { const int64_t ne = ggml_nelements(ps[p]); + const float p_decay = decay * sched; for (int64_t j = 0; j < ne; ++j) { float x = ggml_get_f32_1d(ps[p], j); float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm; @@ -17393,13 +17403,13 @@ static enum ggml_opt_result ggml_opt_adam( float mh = m[i]*beta1h; float vh = v[i]*beta2h; vh = sqrtf(vh) + eps; - x = x*(1.0f - decay) - mh/vh; + x = x*(1.0f - p_decay) - mh/vh; ggml_set_f32_1d(ps[p], j, x); ++i; } } } - // { + { // // update the gradient // ggml_opt_get_grad(np, ps, g1); @@ -17436,7 +17446,11 @@ static enum ggml_opt_result ggml_opt_adam( // // update the parameters // ggml_opt_set_params(np, ps, x); - // } + } + + if (callback) { + callback(callback_data, &sched); + } ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); @@ -17444,6 +17458,8 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); + opt->loss_after = fx; + // check convergence if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { @@ -17525,7 +17541,9 @@ static enum ggml_opt_result linesearch_backtracking( struct ggml_cgraph * gf, struct ggml_cgraph * gb, const int np, - struct ggml_tensor * ps[]) { + struct ggml_tensor * ps[], + ggml_opt_callback callback, + void * callback_data) { int count = 0; float width = 0.0f; @@ -17554,6 +17572,12 @@ static enum ggml_opt_result linesearch_backtracking( dgtest = params->lbfgs.ftol*dginit; while (true) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, &sched); + } + ggml_vec_cpy_f32(nx, x, xp); ggml_vec_mad_f32(nx, x, d, *step); @@ -17624,7 +17648,9 @@ static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { @@ -17677,6 +17703,12 @@ static enum ggml_opt_result ggml_opt_lbfgs( float * lm_s = opt->lbfgs.lms->data; float * lm_y = opt->lbfgs.lmy->data; + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, &sched); + } + // evaluate the function value and its gradient { ggml_opt_set_params(np, ps, x); @@ -17689,6 +17721,9 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_opt_get_grad(np, ps, g); fx = ggml_get_f32_1d(f, 0); + + opt->loss_before = 
fx; + opt->loss_after = fx; } // search direction = -gradient @@ -17743,7 +17778,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); + ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps, callback, callback_data); if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -17753,6 +17788,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( return ls; } + opt->loss_after = fx; + ggml_vec_norm_f32(nx, &xnorm, x); ggml_vec_norm_f32(nx, &gnorm, g); @@ -17810,7 +17847,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // ys = y^t \cdot s -> 1 / \rho. // yy = y^t \cdot y. // - ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]); + ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]); ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); lm_ys[end[0]] = ys; @@ -18020,7 +18057,7 @@ enum ggml_opt_result ggml_opt_resume( *gf = ggml_build_forward (f); *gb = ggml_build_backward(ctx, gf, true); - return ggml_opt_resume_g(ctx, opt, f, gf, gb); + return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); } enum ggml_opt_result ggml_opt_resume_g( @@ -18028,7 +18065,9 @@ enum ggml_opt_result ggml_opt_resume_g( struct ggml_opt_context * opt, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { // build forward + backward compute graphs enum ggml_opt_result result = GGML_OPT_OK; @@ -18036,11 +18075,11 @@ enum ggml_opt_result ggml_opt_resume_g( switch (opt->params.type) { case GGML_OPT_ADAM: { - result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb); + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; case GGML_OPT_LBFGS: { - result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb); + result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; } diff --git a/ggml.h b/ggml.h index 8f51f5d222099..fadc343eef41c 100644 --- a/ggml.h +++ b/ggml.h @@ -1469,6 +1469,8 @@ extern "C" { GGML_LINESEARCH_INVALID_PARAMETERS, }; + typedef void (*ggml_opt_callback)(void * data, float * sched); + // optimization parameters // // see ggml.c (ggml_opt_default_params) for default values @@ -1538,6 +1540,9 @@ extern "C" { bool just_initialized; + float loss_before; + float loss_after; + struct { struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment @@ -1577,10 +1582,10 @@ extern "C" { // initialize optimizer context GGML_API void ggml_opt_init( - struct ggml_context * ctx, + struct ggml_context * ctx, struct ggml_opt_context * opt, - struct ggml_opt_params params, - int64_t nx); + struct ggml_opt_params params, + int64_t nx); // continue optimizing the function defined by the tensor f GGML_API enum ggml_opt_result ggml_opt_resume( @@ -1594,7 +1599,9 @@ extern "C" { struct ggml_opt_context * opt, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb); + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data); // // quantization From d7aa4d9576cbcdd24578e9dc9be81777fd1611ec Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 22:18:50 +0200 Subject: [PATCH 024/235] use optimization callback in training allows dynamic learning schedule and different batch data for each iteration without relying on low n_iter and high n_examples parameters reduces runtime by 
avoiding restart of optimization function and improves training convergence by providing a different batch for each iteration --- .../train-text-from-scratch.cpp | 81 +++++++++++++++++-- 1 file changed, 75 insertions(+), 6 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 6adbece4cc24c..bde29c5b0714a 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3418,7 +3418,7 @@ struct train_params get_default_train_params() { params.n_threads = 6; params.n_batch = 8; - params.n_examples = 8; + params.n_examples = 1; params.n_predict = 1024; params.print_info_interval = 1; @@ -3441,8 +3441,8 @@ struct train_params get_default_train_params() { params.cos_decay_alpha = 0.0f; params.enable_restart = false; - params.lbfgs_n_iter = 16; - params.adam_n_iter = 16; + params.lbfgs_n_iter = 256; + params.adam_n_iter = 256; params.adam_alpha = 1e-3f; params.adam_min_alpha = 1e-4f; params.adam_decay = 1e-1f; @@ -3803,6 +3803,61 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { return true; } +struct opt_callback_data { + struct train_params * params; + struct ggml_opt_context * opt; + llama_token * tokens_data; + size_t tokens_size; + int * samples_data; + size_t samples_size; + int shuffle_countdown; + struct ggml_tensor * tokens_input; + struct ggml_tensor * target_logits; + struct ggml_tensor * target_probs; +}; + +void opt_callback(void * vdata, float * sched) { + struct opt_callback_data * data = (struct opt_callback_data *) vdata; + struct train_params * params = data->params; + struct ggml_opt_context * opt = data->opt; + int n_batch = params->n_batch; + + *sched = (opt->iter < params->warmup) + ? 
(float) opt->iter / (float) params->warmup + : cosine_decay_restart( + params->cos_decay_steps, + params->cos_decay_alpha, + opt->iter - params->warmup, + params->cos_decay_restart, + params->enable_restart); + float min_sched = params->adam_min_alpha / params->adam_alpha; + *sched = min_sched + *sched * (1.0f - min_sched); + + int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); + printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); + + if (data->shuffle_countdown < n_batch) { + printf("%s: reshuffle samples\n", __func__); + shuffle_ints(data->samples_data, data->samples_data + data->samples_size); + for (int i = 0; i < (int) data->samples_size; ++i) { + GGML_ASSERT(data->samples_data[i]+params->n_ctx-1 < (int) data->tokens_size); + } + data->shuffle_countdown = data->samples_size; + } + + get_example_targets_batch( + data->samples_data, + data->samples_size, + data->tokens_data, + data->tokens_size, + opt->iter, + data->tokens_input, + data->target_logits, + data->target_probs); + + data->shuffle_countdown -= n_batch; +} + int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); @@ -3975,6 +4030,18 @@ int main(int argc, char ** argv) { printf("%s: begin training\n", __func__); + struct opt_callback_data opt_cb_data; + opt_cb_data.params = ¶ms; + opt_cb_data.opt = opt; + opt_cb_data.tokens_data = train_tokens.data(); + opt_cb_data.tokens_size = train_tokens.size(); + opt_cb_data.samples_data = train_samples.data(); + opt_cb_data.samples_size = train_samples.size(); + opt_cb_data.shuffle_countdown = train_samples.size(); + opt_cb_data.tokens_input = NULL; + opt_cb_data.target_logits = NULL; + opt_cb_data.target_probs = NULL; + int64_t t0 = ggml_time_ms(); for (int ex = 0; ex < params.n_examples; ++ex) { @@ -3998,6 +4065,10 @@ int main(int argc, char ** argv) { struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + opt_cb_data.tokens_input = tokens_input; + opt_cb_data.target_logits = target_logits; + opt_cb_data.target_probs = target_probs; + int n_past = 0; struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 
1 : 0)); @@ -4009,8 +4080,6 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; - get_example_targets_batch(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); - GGML_ASSERT(n_past == 0); struct ggml_tensor * loss = NULL; @@ -4062,7 +4131,7 @@ int main(int argc, char ** argv) { printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); - ggml_opt_resume_g(ctx0, opt, loss, gf, gb, NULL, NULL); + ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data); size_t used_mem_after_opt = ggml_used_mem(ctx0); From e6ff0728e0c311e27d99c47e4a84a650119a5661 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 2 Jul 2023 23:01:38 +0200 Subject: [PATCH 025/235] add minimum number of tensor dimensions to apply weight decay (default 2) this allows to not apply weight decay to bias parameters --- .../train-text-from-scratch.cpp | 10 ++++++++++ ggml.c | 4 +++- ggml.h | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index bde29c5b0714a..aaaf954be9ea7 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3386,6 +3386,7 @@ struct train_params { float adam_alpha; float adam_min_alpha; float adam_decay; + int adam_decay_min_ndim; float adam_beta1; float adam_beta2; float adam_gclip; @@ -3446,6 +3447,7 @@ struct train_params get_default_train_params() { params.adam_alpha = 1e-3f; params.adam_min_alpha = 1e-4f; params.adam_decay = 1e-1f; + params.adam_decay_min_ndim = 2; params.adam_beta1 = 0.9f; params.adam_beta2 = 0.999f; params.adam_gclip = 1.0f; @@ -3505,6 +3507,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha (default %f)\n", params->adam_min_alpha); fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim); fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. 
(default %f)\n", params->adam_gclip);
@@ -3731,6 +3734,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
 break;
 }
 params->adam_decay = std::stof(argv[i]);
+ } else if (arg == "--adam-decay-min-ndim") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params->adam_decay_min_ndim = std::stoi(argv[i]);
 } else if (arg == "--adam-beta1") {
 if (++i >= argc) {
 invalid_param = true;
@@ -3965,6 +3974,7 @@ int main(int argc, char ** argv) {
 opt_params_adam.adam.sched = 1.0f;
 opt_params_adam.adam.alpha = params.adam_alpha;
 opt_params_adam.adam.decay = params.adam_decay;
+ opt_params_adam.adam.decay_min_ndim = params.adam_decay_min_ndim;
 opt_params_adam.adam.beta1 = params.adam_beta1;
 opt_params_adam.adam.beta2 = params.adam_beta2;
 opt_params_adam.adam.gclip = params.adam_gclip;
diff --git a/ggml.c b/ggml.c
index e0f91ed5a0d02..2138cb8bc9e3d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -17316,6 +17316,7 @@ static enum ggml_opt_result ggml_opt_adam(
 const float beta2 = params.adam.beta2;
 const float eps = params.adam.eps;
 const float gclip = params.adam.gclip;
+ const int decay_min_ndim = params.adam.decay_min_ndim;
 float * m = opt->adam.m->data; // first moment
 float * v = opt->adam.v->data; // second moment
@@ -17394,7 +17395,7 @@ static enum ggml_opt_result ggml_opt_adam(
 int64_t i = 0;
 for (int p = 0; p < np; ++p) {
 const int64_t ne = ggml_nelements(ps[p]);
- const float p_decay = decay * sched;
+ const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0) * sched;
 for (int64_t j = 0; j < ne; ++j) {
 float x = ggml_get_f32_1d(ps[p], j);
 float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
@@ -17911,6 +17912,7 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
 .n_iter = 10000,
 .sched = 1.000f,
 .decay = 0.0f,
+ .decay_min_ndim = 2,
 .alpha = 0.001f,
 .beta1 = 0.9f,
 .beta2 = 0.999f,
diff --git a/ggml.h b/ggml.h
index fadc343eef41c..3980c005036bb 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1506,6 +1506,7 @@ extern "C" {
 float sched; // schedule multiplier (fixed, decay or warmup)
 float decay; // weight decay for AdamW, use 0.0f to disable
+ int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
 float alpha; // learning rate
 float beta1;
 float beta2;

From 58024d3e5f316a3d792a096089e937abbdf362f7 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Mon, 3 Jul 2023 17:57:08 +0200
Subject: [PATCH 026/235] rename training parameter cos-decay-alpha to cos-decay-min and clarify that adam-min-alpha also applies to warmup

--- .../train-text-from-scratch.cpp | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index aaaf954be9ea7..4c98b8bafb44d 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3321,23 +3321,23 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
 }
 }

-float cosine_decay(const int decay_steps, const float alpha, int step) {
+float cosine_decay(const int decay_steps, const float minimum, int step) {
 if (step > decay_steps) {
 step = decay_steps;
 }
 const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps));
- const float decay = (1 - alpha)*cosine_decay + alpha;
+ const float decay = (1 - minimum)*cosine_decay + minimum;
 return decay;
 }

-float cosine_decay_restart(int decay_steps, const float alpha, int step, float restart_step_mult, bool enable_restart) {
+float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) {
 if (enable_restart) {
 while (step > decay_steps) {
 step -= decay_steps;
 decay_steps = (int) restart_step_mult * decay_steps;
 }
 }
- return cosine_decay(decay_steps, alpha, step);
+ return cosine_decay(decay_steps, minimum, step);
 }

 struct train_params {
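For a concrete feel of the schedule these two functions produce, a worked example with illustrative numbers (the linear warmup ramp itself lives in the callers, main and opt_callback, not here):

    // warmup = 100, cos_decay_steps = 1000, cos_decay_min = 0.1, restarts disabled:
    // iter   50 -> sched = 50/100 = 0.50                               (warmup ramp)
    // iter  600 -> step = 600 - 100 = 500
    //              sched = 0.1f + (1 - 0.1f)*0.50f*(1.0f + cosf(3.14159265359f*500/1000)) = 0.55
    // iter 1100+ -> step is clamped to decay_steps, sched stays at cos_decay_min = 0.1

With --enable-restart the step counter wraps around instead of clamping, and each restart window is restart_step_mult times longer than the previous one.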
@@ -3374,7 +3374,7 @@ struct train_params {
 int warmup;
 int cos_decay_steps;
 float cos_decay_restart;
- float cos_decay_alpha;
+ float cos_decay_min;
 bool enable_restart;

 int opt_past;
@@ -3439,21 +3439,21 @@ struct train_params get_default_train_params() {
 params.warmup = 100;
 params.cos_decay_steps = 1000;
 params.cos_decay_restart = 1.1f;
- params.cos_decay_alpha = 0.0f;
+ params.cos_decay_min = 0.1f;
 params.enable_restart = false;
- params.lbfgs_n_iter = 256;
- params.adam_n_iter = 256;
- params.adam_alpha = 1e-3f;
- params.adam_min_alpha = 1e-4f;
- params.adam_decay = 1e-1f;
+ params.lbfgs_n_iter = 256;
+ params.adam_n_iter = 256;
+ params.adam_alpha = 1e-3f;
+ params.adam_min_alpha = 0;
+ params.adam_decay = 1e-1f;
 params.adam_decay_min_ndim = 2;
- params.adam_beta1 = 0.9f;
- params.adam_beta2 = 0.999f;
- params.adam_gclip = 1.0f;
- params.adam_eps_f = 0.0f;
+ params.adam_beta1 = 0.9f;
+ params.adam_beta2 = 0.999f;
+ params.adam_gclip = 1.0f;
+ params.adam_eps_f = 0.0f;
- params.mem_model_gb = 2;
+ params.mem_model_gb = 2;
 params.mem_compute_gb = 24;
 params.mem_compute0_gb = 8;
 params.mem_compute1_gb = 1;
@@ -3496,7 +3496,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
 fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup);
 fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps);
 fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart);
- fprintf(stderr, " --cos-decay-alpha N Only for Adam optimizer. Cosine decay alpha (default %f)\n", params->cos_decay_alpha);
+ fprintf(stderr, " --cos-decay-min N Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min);
 fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : "");
 fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : "");
 fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past);
@@ -3505,7 +3505,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
 fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
 fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter);
 fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha);
- fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha, usually 0.1 * alpha (default %f)\n", params->adam_min_alpha);
+ fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha);
 fprintf(stderr, " --adam-decay N AdamW weight decay. 
Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim); fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); @@ -3676,12 +3676,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->cos_decay_restart = std::stof(argv[i]); - } else if (arg == "--cos-decay-alpha") { + } else if (arg == "--cos-decay-min") { if (++i >= argc) { invalid_param = true; break; } - params->cos_decay_alpha = std::stof(argv[i]); + params->cos_decay_min = std::stof(argv[i]); } else if (arg == "--enable-restart") { params->enable_restart = true; } else if (arg == "--disable-restart") { @@ -3835,7 +3835,7 @@ void opt_callback(void * vdata, float * sched) { ? (float) opt->iter / (float) params->warmup : cosine_decay_restart( params->cos_decay_steps, - params->cos_decay_alpha, + params->cos_decay_min, opt->iter - params->warmup, params->cos_decay_restart, params->enable_restart); @@ -4131,7 +4131,7 @@ int main(int argc, char ** argv) { ? (float) opt->iter / (float) params.warmup : cosine_decay_restart( params.cos_decay_steps, - params.cos_decay_alpha, + params.cos_decay_min, opt->iter - params.warmup, params.cos_decay_restart, params.enable_restart); From 17a0898d50a5ce653093ee9a8f6528dbdc2e7e61 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 17:58:09 +0200 Subject: [PATCH 027/235] fix increase of model.train_samples and model.train_tokens now that each optimizer iteration gets its own batch we need to multiply by number of opt iterations --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 4c98b8bafb44d..770b41e7b377f 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -4145,9 +4145,10 @@ int main(int argc, char ** argv) { size_t used_mem_after_opt = ggml_used_mem(ctx0); + int n_iter = params.use_adam ? 
params.adam_n_iter : params.lbfgs_n_iter; model.train_its = opt->iter; - model.train_samples += n_batch; - model.train_tokens += n_batch * n_tokens; + model.train_samples += n_batch * n_iter; + model.train_tokens += n_batch * n_tokens * n_iter; if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); From 24a4b099f37ae2deef2296a0dae4b6fc5f27b266 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 18:24:57 +0200 Subject: [PATCH 028/235] change sampling parameters for prediction after training to defaults of common.h and clarify what is context for prediction and what are generated tokens --- .../train-text-from-scratch.cpp | 50 +++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 770b41e7b377f..2c17d0b99e349 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2799,19 +2799,19 @@ void shuffle_ints(int * begin, int * end) { } struct my_llama_sampler_params { - float temp = 0.0f; // <= 0.0 disabled - int top_k = 20; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - int repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float repeat_penalty = 1.0f; // 1.0 = disabled - float alpha_presence = 0.0f; // 0.0 = disabled - float alpha_frequency = 0.0f; // 0.0 = disabled - int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token + float temp = 0.0f; // <= 0.0 disabled + int top_k = 20; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + int repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float repeat_penalty = 1.0f; // 1.0 = disabled + float presence_penalty = 0.0f; // 0.0 = disabled + float frequency_penalty = 0.0f; // 0.0 = disabled + int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = true; // consider newlines as a repeatable token }; struct my_llama_sampler { @@ -2871,8 +2871,8 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam candidates_p, last_tokens + n_last_tokens - n_last, n_last, - params.alpha_frequency, - params.alpha_presence); + params.frequency_penalty, + params.presence_penalty); if (!params.penalize_nl) { logits[llama_token_nl()] = nl_logit; @@ -4203,12 +4203,22 @@ int main(int argc, char ** argv) { int n_gen = params.n_predict; int sample_ctx = n_tokens - n_tokens/8; - sampler.params.temp = 0.2f; - sampler.params.repeat_penalty = 1.1f; - sampler.params.mirostat = 2; + // use defaults from common.h + sampler.params.top_k = 40; + sampler.params.top_p = 0.95f; + sampler.params.tfs_z = 1.00f; + sampler.params.typical_p = 1.00f; + sampler.params.temp = 0.8f; + sampler.params.repeat_penalty = 1.1f; + sampler.params.repeat_last_n = 64; + sampler.params.frequency_penalty = 0.0f; + sampler.params.presence_penalty = 0.0f; + sampler.params.mirostat 
= 0; + sampler.params.mirostat_tau = 5.00f; + sampler.params.mirostat_eta = 0.10f; init_sampler(&sampler, lctx); - printf("Generating %d tokens.\n", n_gen); + printf("[Prediction context]\n"); struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens); struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); @@ -4223,7 +4233,7 @@ int main(int argc, char ** argv) { print_token(lctx, ggml_get_i32_1d(tokens_input, i)); } - printf("---\n"); + printf("\n[Generating %d tokens]\n", n_gen); for (int i=0; i Date: Mon, 3 Jul 2023 18:35:11 +0200 Subject: [PATCH 029/235] tighten abs error bounds for cross_entropy_loss in test-grad0 --- tests/test-grad0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index fe2ca212f82a0..0bbeff2707a91 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1386,7 +1386,7 @@ int main(int argc, const char ** argv) { struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]); - check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-1f, INFINITY); + check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY); } } From dbbc2633137f15205a80466451a0ebe5ba8baf2f Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 18:45:18 +0200 Subject: [PATCH 030/235] add conditional compilation of using F16 exp in flash attention uncomment `// #define GGML_FLASH_ATTN_EXP_FP16` to enable usage of f16 exp in flash attention --- ggml.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 2138cb8bc9e3d..53f2c425450af 100644 --- a/ggml.c +++ b/ggml.c @@ -124,6 +124,7 @@ typedef void * thread_ret_t; #define GGML_GELU_QUICK_FP16 #define GGML_SILU_FP16 // #define GGML_CROSS_ENTROPY_EXP_FP16 +// #define GGML_FLASH_ATTN_EXP_FP16 #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 @@ -13111,10 +13112,13 @@ static void ggml_compute_forward_flash_attn_f32( if (SS[j] == -INFINITY) { SS[j] = 0.0f; } else { - // const float val = expf(SS[j] - max); +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SS[j] - max); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif sump[j] += (ggml_float)val; SS[j] = val; } @@ -13703,10 +13707,13 @@ static void ggml_compute_forward_flash_attn_back_f32( if (SR[j] == -INFINITY) { SW[j] = 0.0f; } else { - // const float val = expf(SR[j] - max); +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SR[j] - max); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif sump[j] += (ggml_float)val; SW[j] = val; } From 47055c929fa4696a87c0ea10fc818d86359e622f Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 18:45:54 +0200 Subject: [PATCH 031/235] tighten abs error bounds for flash_attn in test-grad0 --- tests/test-grad0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 0bbeff2707a91..aba4b9c20b2a1 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1493,7 +1493,7 @@ int main(int argc, const char ** argv) { struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); - check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f); + check_gradient("flash_attn 
f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); } } } @@ -1534,7 +1534,7 @@ int main(int argc, const char ** argv) { struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); - check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f); + check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); } } } From 0f6a8ab51958d9dc12ab4b311b95b2dd53d4e9ae Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 18:48:57 +0200 Subject: [PATCH 032/235] tighten abs error bounds for sqrt in test-grad0 --- tests/test-grad0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index aba4b9c20b2a1..ef608a01d3a45 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -531,7 +531,7 @@ int main(int argc, const char ** argv) { struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0])); - check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f); + check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f); } } From 87035b96f78170fc3b6eba071efd9075eb750cb3 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 3 Jul 2023 18:56:05 +0200 Subject: [PATCH 033/235] remove out-commented vectorized code of opt_adam the vectorized code might be bit faster for low number of parameters, but it had a big memory usage overhead --- ggml.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/ggml.c b/ggml.c index 53f2c425450af..4ddd154bf473f 100644 --- a/ggml.c +++ b/ggml.c @@ -17417,44 +17417,6 @@ static enum ggml_opt_result ggml_opt_adam( } } } - { - // // update the gradient - // ggml_opt_get_grad(np, ps, g1); - - // // m_t = beta1*m_t-1 + (1 - beta1)*g_t - // ggml_vec_scale_f32(nx, m, beta1); - // ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1); - - // // g2 = g1^2 - // ggml_vec_sqr_f32 (nx, g2, g1); - - // // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2 - // ggml_vec_scale_f32(nx, v, beta2); - // ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2); - - // // m^hat = m_t / (1 - beta1^t) - // // v^hat = v_t / (1 - beta2^t) - // // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1) - // // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1 - // // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps) - // // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps) - // // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay) - // ggml_vec_cpy_f32 (nx, mh, m); - // ggml_vec_cpy_f32 (nx, vh, v); - - // ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter))); - // ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter))); - - // ggml_vec_sqrt_f32 (nx, vh, vh); - // ggml_vec_acc1_f32 (nx, vh, eps); - - // ggml_vec_div_f32 (nx, mh, mh, vh); - // ggml_vec_scale_f32(nx, x, 1.0f - decay); - // ggml_vec_sub_f32 (nx, x, x, mh); - - // // update the parameters - // ggml_opt_set_params(np, ps, x); - } if (callback) { callback(callback_data, &sched); From ecdc16163efa41fc41ac2dfca63cb7af60e2362c Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 28 Jul 2023 23:09:56 +0200 Subject: [PATCH 034/235] ggml : update ggml_rms_norm_back with configurable eps --- ggml.c | 13 ++++++++++--- ggml.h | 4 ++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index 4ddd154bf473f..756000cffcc10 100644 --- a/ggml.c +++ b/ggml.c @@ -5824,7 +5824,8 @@ struct ggml_tensor * ggml_rms_norm_inplace( struct ggml_tensor * 
ggml_rms_norm_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * b, + float eps) { bool is_node = false; if (a->grad) { @@ -5834,6 +5835,8 @@ struct ggml_tensor * ggml_rms_norm_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + ggml_set_op_params(result, &eps, sizeof(eps)); + result->op = GGML_OP_RMS_NORM_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; @@ -10211,7 +10214,8 @@ static void ggml_compute_forward_rms_norm_back_f32( GGML_TENSOR_BINARY_OP_LOCALS; - const float eps = 1e-6f; // TODO: make this a parameter + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -15029,9 +15033,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { + float eps; + memcpy(&eps, tensor->op_params, sizeof(float)); + src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_rms_norm_back(ctx, src0, tensor->grad), + ggml_rms_norm_back(ctx, src0, tensor->grad, eps), inplace); } } break; diff --git a/ggml.h b/ggml.h index 3980c005036bb..9e8ed956eae4e 100644 --- a/ggml.h +++ b/ggml.h @@ -894,11 +894,11 @@ extern "C" { // a - x // b - dy - // TODO: update with configurable eps GGML_API struct ggml_tensor * ggml_rms_norm_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + float eps); // A: n columns, m rows // B: n columns, p rows (i.e. we transpose it internally) From c1a5e116a45227fcd48b14b9db27995b922a7b0d Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 28 Jul 2023 23:10:55 +0200 Subject: [PATCH 035/235] llama training : fix ggml_rms_norm_back calls to pass configurable eps --- .../train-text-from-scratch.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 2c17d0b99e349..70fcdc5decfc5 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1838,7 +1838,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( clr_buf(0); use_buf(0); - t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad, rms_norm_eps)); assert_shape_2d(t30->grad, n_embd, N*n_batch); if (grad_layer_inp) { t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); } @@ -1854,7 +1854,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); use_buf(1); - t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); + t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad, rms_norm_eps))); assert_shape_2d(t21->grad, n_embd, N*n_batch); grad_layer_inp = t21; use_buf(0); t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); @@ -1899,9 +1899,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( } 
clr_buf(0); use_buf(0); - t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); + t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad, rms_norm_eps))); assert_shape_2d(t01->grad, n_embd, N*n_batch); use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); // clr_buf(1); // clr_buf(0); @@ -2396,9 +2396,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( clr_buf(0); use_buf(0); - t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad, rms_norm_eps)); assert_shape_2d(t30->grad, n_embd, N*n_batch); if (grad_layer_inp) { - t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); } clr_buf(1); t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); @@ -2412,7 +2412,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); use_buf(1); - t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); + t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad, rms_norm_eps))); assert_shape_2d(t21->grad, n_embd, N*n_batch); grad_layer_inp = t21; use_buf(0); t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); @@ -2458,9 +2458,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( GGML_ASSERT(avail_begin == 0); clr_buf(0); use_buf(0); - t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); + t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad, rms_norm_eps))); assert_shape_2d(t01->grad, n_embd, N*n_batch); use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); *logits = t35; From 22cb368dd964cb0506da1002e084fdc5ee92b23e Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 28 Jul 2023 23:55:30 +0200 Subject: [PATCH 036/235] remove trailing whitespace --- tests/test-grad0.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index ef608a01d3a45..079eef7e02e37 100644 --- a/tests/test-grad0.c +++ 
b/tests/test-grad0.c
@@ -1348,9 +1348,9 @@ int main(int argc, const char ** argv) {
 float eps = 1e-6f;
 // dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
 // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
- struct ggml_tensor * f = ggml_sum(ctx0,
- ggml_log(ctx0,
- ggml_add1(ctx0,
+ struct ggml_tensor * f = ggml_sum(ctx0,
+ ggml_log(ctx0,
+ ggml_add1(ctx0,
 ggml_scale(ctx0,
 ggml_soft_max(ctx0, x[0]),
 ggml_new_f32(ctx0, 1.0f - eps)),

From 2bf422eafd9fd8a7bd5a065b51975f84cdf3ca2b Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 6 Aug 2023 23:07:57 +0200
Subject: [PATCH 037/235] add train function using automatic gradient checkpointing backward pass and allocator

--- .../train-text-from-scratch.cpp | 286 ++++++++++++++++++ 1 file changed, 286 insertions(+)

diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 70fcdc5decfc5..76e6ace640a66 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "ggml-alloc.h"
 #include "llama.h"
 #include <unordered_map>
 #include <vector>
@@ -1342,6 +1343,291 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
 return inpL;
 }

static size_t hash(void * p) {
 return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
}

static size_t hash_find(void * hash_table[], void * p) {
 size_t h = hash(p);

 // linear probing
 size_t i = h;
 while (hash_table[i] != NULL && hash_table[i] != p) {
 i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
 if (i == h) {
 // visited all hash table entries -> not found
 return GGML_GRAPH_HASHTABLE_SIZE;
 }
 }
 return i;
}

static bool hash_insert(void * hash_table[], void * p) {
 size_t h = hash(p);
 size_t i = hash_find(hash_table, p);

 GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full

 if (hash_table[i] == p) {
 return true;
 }

 // insert
 GGML_ASSERT(hash_table[i] == NULL);
 hash_table[i] = p;
 return false;
}

static bool hash_contains(void * hash_table[], void * p) {
 size_t i = hash_find(hash_table, p);
 return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p);
}

struct hash_map {
 void * keys[GGML_GRAPH_HASHTABLE_SIZE];
 void * vals[GGML_GRAPH_HASHTABLE_SIZE];
};
static const size_t HASH_MAP_SIZE = sizeof(struct hash_map);

struct hash_map * new_hash_map(struct ggml_context * ctx, struct ggml_tensor * * out_buf) {
 struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, HASH_MAP_SIZE);
 if (out_buf) {
 * out_buf = buf;
 }
 struct hash_map * result = (struct hash_map *) ((char *) buf->data);
 *result = (struct hash_map) {
 /*.keys =*/ { NULL },
 /*.vals =*/ { NULL },
 };
 for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) {
 result->keys[i] = NULL;
 result->vals[i] = NULL;
 }
 return result;
};

struct ggml_tensor * ggml_recompute_graph_node(
 struct ggml_context * ctx,
 struct ggml_cgraph * graph,
 struct hash_map * replacements,
 struct ggml_tensor * node) {

 if (node == NULL) {
 return NULL;
 }

 if (node->is_param) {
 return node;
 }

 if (!hash_contains(graph->visited_hash_table, node)) {
 return node;
 }

 size_t i = hash_find(replacements->keys, node);
 GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
 if (replacements->keys[i] == node) {
 return replacements->vals[i];
 }

 struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);

 // insert clone into replacements
 GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite
 replacements->keys[i] = node;
 replacements->vals[i] = clone;

 clone->op = node->op;
 clone->grad = node->grad;
 clone->is_param = node->is_param;
 clone->extra = node->extra;
 for (int k = 0; k < GGML_MAX_SRC; ++k) {
 clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]);
 }

 GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t)));
 GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME);
 memcpy(clone->op_params, node->op_params, sizeof(node->op_params));
 memcpy(clone->name, node->name, sizeof(node->name));

 return clone;
};
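Before these helpers are put to use below, a small illustration of their behavior (not part of the patch; assumes a live ggml_context ctx): the table is open-addressed with linear probing, so hash_insert doubles as a cheap membership test, and hash_map adds a parallel vals[] array so that each forward-graph tensor can be mapped to its recomputation clone exactly once:

    static void * table[GGML_GRAPH_HASHTABLE_SIZE]; // zero-initialized
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    bool existed = hash_insert(table, t);   // false: first insertion
    existed      = hash_insert(table, t);   // true: probing finds the same slot again
    bool found   = hash_contains(table, t); // true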
+struct ggml_tensor * ggml_recompute_graph_node( + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct hash_map * replacements, + struct ggml_tensor * node) { + + if (node == NULL) { + return NULL; + } + + if (node->is_param) { + return node; + } + + if (!hash_contains(graph->visited_hash_table, node)) { + return node; + } + + size_t i = hash_find(replacements->keys, node); + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + if (replacements->keys[i] == p) { + return replacements->vals[i]; + } + + struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); + + // insert clone into replacements + GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite + replacements->keys[i] = node; + replacements->vals[i] = clone; + + clone->op = node->op; + clone->grad = node->grad; + clone->is_param = node->is_param; + clone->extra = node->extra; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); + } + + GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); + GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); + memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); + memcpy(clone->name, node->name, sizeof(node->name)); + + return clone; +}; + +void ggml_build_backward_gradient_checkpointing( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * checkpoints, + int n_checkpoints) { + *gb_tmp = *gf; + ggml_build_backward_expand(ctx, gf, gb_tmp, true); + + if (n_checkpoints <= 0) { + *gb = *gb_tmp; + return; + } + + struct hash_map * replacements = new_hash_map(ctx, NULL); + + // insert checkpoints in replacements + for (int i = 0; i < n_checkpoints; ++i) { + size_t k = hash_find(replacements->keys, node); + GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite + replacements->keys[k] = checkpoints[i]; + replacements->vals[k] = checkpoints[i]; + } + + *gb = *gf; + // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], + // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), + // by recomputing them from checkpoints + for (int i = gf->n_nodes; i < gb_tmp->n_nodes; ++i) { + struct ggml_tensor * node = gb_tmp->nodes[i]; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + // insert new tensors recomputing src, reusing already made replacements, + // remember replacements: remember new tensors with mapping from corresponding gf nodes + // recurse for input tensors, + // unless (i.e. terminating when) input tensors are checkpoints + node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]); + } + // insert rewritten backward node with replacements made into resulting backward graph gb + ggml_build_forward_expand(gb, node); + } +}
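Aside (not part of the patch): a minimal sketch of the intended calling pattern of the function above, as the unified train function below wires it up. It assumes `loss` is the final forward tensor and `checkpoints` collects the tensors to keep alive, e.g. the inputs and each layer output:

    // build the forward graph, then a backward graph in which every
    // non-checkpoint intermediate is recomputed from the checkpoints
    struct ggml_cgraph * gf     = ggml_new_graph(ctx);
    struct ggml_cgraph * gb     = ggml_new_graph(ctx);
    struct ggml_cgraph * gb_tmp = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, loss);
    ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp,
        checkpoints.data(), (int) checkpoints.size());

With one checkpoint per layer, only the layer outputs have to stay resident between the forward and backward pass; everything in between is rebuilt on demand, costing roughly one extra forward pass of compute in exchange for activation memory that grows with n_layer instead of n_layer times the per-layer intermediate count.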
+ +struct ggml_tensor * llama_build_train_graphs( + struct my_llama_model * model, + struct ggml_allocr * alloc, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * logits, + struct ggml_tensor * tokens_input, + struct ggml_tensor * targets, + const int n_tokens, + const int n_batch, + const bool enable_flash_attn, + const bool enable_checkpointing) { + + ggml_set_scratch(ctx, { 0, 0, nullptr, }); + const int n_past = 0; + const int N = n_tokens; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + const int rope_mode = 0; + + GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); + struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch); + struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); assert_shape_2d(t01, n_embd, N*n_batch); + + struct ggml_tensor * cur = t01; + + std::vector<struct ggml_tensor *> checkpoints; + checkpoints.push_back(cur); + + struct ggml_tensor * kv_scale; + if (flash_attn) { + kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head))); + } + + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); assert_shape_2d(t02, n_embd, N*n_batch); + struct ggml_tensor * t03 = ggml_repeat (ctx, layer.attention_norm, t02); assert_shape_2d(t03, n_embd, N*n_batch); + struct ggml_tensor * t04 = ggml_mul (ctx, t02, t03); assert_shape_2d(t04, n_embd, N*n_batch); + struct ggml_tensor * t05 = ggml_mul_mat (ctx, layer.wq, t04); assert_shape_2d(t05, n_embd, N*n_batch); + struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t08 = ggml_mul_mat (ctx, layer.wk, t04); assert_shape_2d(t08, n_embd, N*n_batch); + struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t10 = ggml_rope_inplace (ctx, t09, n_past, n_rot, rope_mode, n_ctx); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t11 = ggml_mul_mat (ctx, t04, layer.wv); assert_shape_2d(t11, N*n_batch, n_embd); + struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + struct ggml_tensor * t16; + if (enable_flash_attn) { + t16 = 
ggml_flash_attn(ctx, t13, t14, t15, true); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + } else { + struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); assert_shape_4d(t16_0, N, N, n_head, n_batch); + struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); assert_shape_4d(t16_1, N, N, n_head, n_batch); + struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); assert_shape_4d(t16_2, N, N, n_head, n_batch); + struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); assert_shape_4d(t16_3, N, N, n_head, n_batch); + t16 = ggml_mul_mat(ctx, t15, t16_3); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + } + struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t18 = ggml_cont (ctx, t17); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); assert_shape_2d(t19, n_embd, N*n_batch); + struct ggml_tensor * t20 = ggml_mul_mat (ctx, layer.wo, t19); assert_shape_2d(t20, n_embd, N*n_batch); + struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); assert_shape_2d(t21, n_embd, N*n_batch); + struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); assert_shape_2d(t22, n_embd, N*n_batch); + struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); assert_shape_2d(t23, n_embd, N*n_batch); + struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); assert_shape_2d(t24, n_embd, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t27 = ggml_silu (ctx, t26); assert_shape_2d(t27, n_ff, N*n_batch); + struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); assert_shape_2d(t28, n_ff, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t30 = ggml_add (ctx, t21, t29); assert_shape_2d(t30, n_embd, N*n_batch); + cur = t30; + checkpoints.push_back(cur); + } + struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = ggml_repeat (ctx, model->norm, t31); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); assert_shape_2d(t33, n_embd, N*n_batch); + struct ggml_tensor * t34 = ggml_mul_mat (ctx, model->output, t33); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); assert_shape_1d(t36, 1); + + checkpoints.push_back(t31); + checkpoints.push_back(t32); + checkpoints.push_back(t33); + checkpoints.push_back(t34); + checkpoints.push_back(t35); + checkpoints.push_back(t36); + + ggml_build_forward_expand(gf, t36); + + if (enable_checkpointing) { + ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); + } else { + *gb = *gf; + ggml_build_backward_expand(ctx, gf, gb, true); + } + + if (alloc) { + // make sure t35 and t36 are not reallocated by inserting new temporary node depending on them + struct ggml_tensor * dep = ggml_scale_inplace(ctx, t35, t36); + int n_nodes_before = gb->n_nodes; + ggml_build_forward_expand(gb, dep); + + int n_nodes_after = 
gb->n_nodes; + GGML_ASSERT(n_nodes_after == n_nodes_before + 1); + + ggml_allocr_reset(alloc); + ggml_allocr_alloc_graph(alloc, gb); + + // remove the additional node that was inserted + gb->nodes[n_nodes_after-1] = NULL; + gb->n_nodes = n_nodes_before; + } + + *logits = t35; + return t36; +} + + // expand the graph nodes without creating leafs. struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { // check if already visited From fc826c8ea81136961a499d48fb3ab2e221bdbeee Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:49:22 +0200 Subject: [PATCH 038/235] in train function replace add_inplace with regular add, because using add_inplace seems to result in different gradients --- .../train-text-from-scratch/train-text-from-scratch.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 76e6ace640a66..51eb96fc9510c 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1264,7 +1264,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( assert_shape_2d(cur, n_embd, N*n_batch); } - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); + // struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); assert_shape_2d(inpFF, n_embd, N*n_batch); // feed-forward network @@ -1304,7 +1305,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( assert_shape_2d(cur, n_embd, N*n_batch); } - cur = ggml_add_inplace(ctx0, cur, inpFF); + // cur = ggml_add_inplace(ctx0, cur, inpFF); + cur = ggml_add(ctx0, cur, inpFF); assert_shape_2d(cur, n_embd, N*n_batch); // input for next layer From d43741540b74fb083aa36ce625409695f39229bf Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:51:20 +0200 Subject: [PATCH 039/235] don't allocate hash_map on context, because the context has no_alloc=True when using memory allocator, resulting in NULL data pointers --- .../train-text-from-scratch.cpp | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 51eb96fc9510c..03ec39d860018 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1392,16 +1392,8 @@ struct hash_map { }; static const size_t HASH_MAP_SIZE = sizeof(struct hash_map); -struct hash_map * new_hash_map(struct ggml_context * ctx, struct ggml_tensor * * out_buf) { - struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, HASH_MAP_SIZE); - if (out_buf) { - * out_buf = buf; - } - struct hash_map * result = (struct hash_map *) ((char *) buf->data); - *result = (struct hash_map) { - /*.keys =*/ { NULL }, - /*.vals =*/ { NULL }, - }; +struct hash_map * new_hash_map() { + struct hash_map * result = new struct hash_map; for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) { result->keys[i] = NULL; result->vals[i] = NULL; @@ -1409,6 +1401,10 @@ struct hash_map * new_hash_map(struct ggml_context * ctx, struct ggml_tensor * * return result; }; +void free_hash_map(struct hash_map * map) { + delete map; +} + struct ggml_tensor * ggml_recompute_graph_node( struct ggml_context * ctx, struct ggml_cgraph * graph, @@ -1471,7 +1467,7 @@ void ggml_build_backward_gradient_checkpointing( return; } - struct 
hash_map * replacements = new_hash_map(ctx, NULL); + struct hash_map * replacements = new_hash_map(); // insert checkpoints in replacements for (int i = 0; i < n_checkpoints; ++i) { @@ -1498,6 +1494,8 @@ void ggml_build_backward_gradient_checkpointing( // insert rewritten backward node with replacements made into resulting backward graph gb ggml_build_forward_expand(gb, node); } + + free_hash_map(replacements); } struct ggml_tensor * llama_build_train_graphs( From cfddc36be220a035ceaab4bb7365b399cc0cf700 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:52:15 +0200 Subject: [PATCH 040/235] correctly clone reshape and permute operations by also cloning tensor->nb values --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 03ec39d860018..92e6315dcb061 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1440,6 +1440,9 @@ struct ggml_tensor * ggml_recompute_graph_node( clone->grad = node->grad; clone->is_param = node->is_param; clone->extra = node->extra; + for (int k = 0; k < GGML_MAX_DIMS; ++k) { + clone->nb[k] = node->nb[k]; + } for (int k = 0; k < GGML_MAX_SRC; ++k) { clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); } From 0dd496c5e21a6baeb377babd769efa8e01981e9e Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:52:48 +0200 Subject: [PATCH 041/235] fix variable name and add missing type cast --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 92e6315dcb061..266f378b9e4ed 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1425,8 +1425,8 @@ struct ggml_tensor * ggml_recompute_graph_node( size_t i = hash_find(replacements->keys, node); GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - if (replacements->keys[i] == p) { - return replacements->vals[i]; + if (replacements->keys[i] == node) { + return (struct ggml_tensor *) replacements->vals[i]; } struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); From 52c92c0a8cc88f45bcde9556b4f6d4481a36bd9d Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:53:36 +0200 Subject: [PATCH 042/235] terminate recursive tensor cloning when reaching tensor without src tensors --- .../train-text-from-scratch.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 266f378b9e4ed..9b73361ca7e60 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1423,6 +1423,17 @@ struct ggml_tensor * ggml_recompute_graph_node( return node; } + int count_children = 0; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + if (node->src[k]) { + ++count_children; + } + } + + if (count_children == 0) { + return node; + } + size_t i = hash_find(replacements->keys, node); GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full if 
(replacements->keys[i] == node) { From 345f516f7c3384e38e610e413f9060dd729049f4 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:55:13 +0200 Subject: [PATCH 043/235] correctly clone view tensors by setting data pointers. without this, the checkpointing would only work when being used together with memory allocator --- .../train-text-from-scratch.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 9b73361ca7e60..410ba69b9fbec 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1405,6 +1405,33 @@ void free_hash_map(struct hash_map * map) { delete map; } +static bool ggml_is_view(struct ggml_tensor * t) { + return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || + t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; +} + +static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { + switch (t->op) { + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + return t->src[0]; + case GGML_OP_CPY: + return t->src[1]; + default: + return NULL; + } +} + +static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { + struct ggml_tensor * parent = t; + do { + parent = get_view_parent(parent); + } while (ggml_is_view(parent)); + return parent; +} + struct ggml_tensor * ggml_recompute_graph_node( struct ggml_context * ctx, struct ggml_cgraph * graph, @@ -1457,6 +1484,11 @@ struct ggml_tensor * ggml_recompute_graph_node( for (int k = 0; k < GGML_MAX_SRC; ++k) { clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); } + if (ggml_is_view(clone)) { + struct ggml_tensor * source = get_view_source(clone); + GGML_ASSERT(source != NULL); + clone->data = source->data; + } GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); memcpy(clone->name, node->name, sizeof(node->name)); From 5a11b75875e2e82501c1bdaf7c4528a3ecc4f4e3 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:55:51 +0200 Subject: [PATCH 044/235] fix variable names --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 410ba69b9fbec..bdc7cdade9a1b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1517,9 +1517,9 @@ void ggml_build_backward_gradient_checkpointing( // insert checkpoints in replacements for (int i = 0; i < n_checkpoints; ++i) { - size_t k = hash_find(replacements->keys, node); + size_t k = hash_find(replacements->keys, checkpoints[i]); GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite + GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite replacements->keys[k] = checkpoints[i]; replacements->vals[k] = checkpoints[i]; }
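Aside (not part of the patches): what the view-source chase from PATCH 043 buys. Reshape/permute/transpose/view tensors own no storage; they alias the buffer of the tensor they ultimately view into, so a cloned view has to take the data pointer of its (possibly recomputed) source. A small sketch using the helpers introduced above; it assumes an initialized ggml_context with allocation enabled:

    #include <cassert>
    #include "ggml.h"

    static void view_source_demo(struct ggml_context * ctx) {
        struct ggml_tensor * t  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 6);
        struct ggml_tensor * v1 = ggml_reshape_2d(ctx, t, 6, 4);      // view of t
        struct ggml_tensor * v2 = ggml_permute(ctx, v1, 1, 0, 2, 3);  // view of v1
        assert(get_view_source(v2) == t); // chases v2 -> v1 -> t
        assert(v2->data == t->data);      // all three alias one buffer
    }

In ggml_recompute_graph_node above, this pointer fix-up is exactly what the ggml_is_view(clone) branch performs after the clone's sources have been rewritten.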
From b2f13101961825ec4ab4b86907d611daff739ffe Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:57:13 +0200 Subject: [PATCH 045/235] swap arguments to commutative ops to be the same as in `forward_batch_wo_cache_flash_attn` --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index bdc7cdade9a1b..d5fde1ca59461 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1590,7 +1590,7 @@ struct ggml_tensor * llama_build_train_graphs( struct my_llama_layer & layer = model->layers[il]; struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); assert_shape_2d(t02, n_embd, N*n_batch); struct ggml_tensor * t03 = ggml_repeat (ctx, layer.attention_norm, t02); assert_shape_2d(t03, n_embd, N*n_batch); - struct ggml_tensor * t04 = ggml_mul (ctx, t02, t03); assert_shape_2d(t04, n_embd, N*n_batch); + struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); assert_shape_2d(t04, n_embd, N*n_batch); struct ggml_tensor * t05 = ggml_mul_mat (ctx, layer.wq, t04); assert_shape_2d(t05, n_embd, N*n_batch); struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); @@ -1625,7 +1625,7 @@ struct ggml_tensor * llama_build_train_graphs( struct ggml_tensor * t27 = ggml_silu (ctx, t26); assert_shape_2d(t27, n_ff, N*n_batch); struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); assert_shape_2d(t28, n_ff, N*n_batch); struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); assert_shape_2d(t29, n_embd, N*n_batch); - struct ggml_tensor * t30 = ggml_add (ctx, t21, t29); assert_shape_2d(t30, n_embd, N*n_batch); + struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); assert_shape_2d(t30, n_embd, N*n_batch); cur = t30; checkpoints.push_back(cur); } From 5884b43a622a88f7f2fddf73f97ae9f50137efdc Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:58:49 +0200 Subject: [PATCH 046/235] add input tensors as checkpoints so that recursive tensor cloning of gradient checkpointing terminates on input tensors --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index d5fde1ca59461..48edf3651145f 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1579,7 +1579,10 @@ struct ggml_tensor * llama_build_train_graphs( struct ggml_tensor * cur = t01; std::vector<struct ggml_tensor *> checkpoints; - checkpoints.push_back(cur); + checkpoints.push_back(tokens_input); + checkpoints.push_back(targets); + checkpoints.push_back(t00); + checkpoints.push_back(t01); struct ggml_tensor * kv_scale;
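Aside (not part of the patches): a rough sense of what per-layer checkpointing saves here. Each layer produces on the order of twenty intermediate tensors, but only the layer output t30 (plus, after PATCH 046, the inputs) is kept as a checkpoint. A back-of-envelope sketch with hypothetical sizes, treating every intermediate as an n_embd x N*n_batch fp32 tensor for simplicity:

    #include <cstdio>

    int main() {
        // hypothetical training configuration, for illustration only
        const long long n_embd = 256, n_tokens = 64, n_batch = 16, n_layer = 16;
        const long long per_layer = 20;                               // rough count of intermediates per layer
        const long long bytes     = n_embd * n_tokens * n_batch * 4;  // one fp32 intermediate

        long long keep_all     = n_layer * per_layer * bytes;   // no checkpointing
        long long checkpointed = (n_layer + per_layer) * bytes; // t30 per layer + one layer recomputed
        printf("all activations: %lld MiB\n", keep_all     / (1024*1024)); // 320 MiB
        printf("checkpointed:    %lld MiB\n", checkpointed / (1024*1024)); //  36 MiB
        return 0;
    }

The real ratio is less favorable than this sketch suggests, since the n_ff-sized and N x N attention intermediates are larger than the n_embd-sized ones, but the scaling argument is the same: resident activations grow with n_layer instead of n_layer times the per-layer count.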
From 9716eb8ef0830e54badb729244175329744d9b99 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 17:59:19 +0200 Subject: [PATCH 047/235] fix variable name and add missing boolean negation --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 48edf3651145f..88a1c3a504108 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1585,8 +1585,8 @@ struct ggml_tensor * llama_build_train_graphs( checkpoints.push_back(t01); struct ggml_tensor * kv_scale; - if (flash_attn) { - kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head))); + if (!enable_flash_attn) { + kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); } for (int il = 0; il < n_layer; ++il) { From 38f4438c32def72e7a0fd42f9caba9df80a5cc32 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:07:16 +0200 Subject: [PATCH 048/235] make sure some tensors are not reallocated by inserting new temporary nodes depending on them: output and parameter gradient tensors need to be available at the end of the graph execution. parameter gradient tensors also need to be available before the graph execution, because they are set to zero before each optimizer iteration. checkpoint tensors are allocated all together to reduce memory allocator fragmentation. afterwards, in addition to the temporary nodes, we also need to reset the temporary leafs --- .../train-text-from-scratch.cpp | 31 ++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 88a1c3a504108..0583250598762 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1656,19 +1656,36 @@ struct ggml_tensor * llama_build_train_graphs( } if (alloc) { - // make sure t35 and t36 are not reallocated by inserting new temporary node depending on them - struct ggml_tensor * dep = ggml_scale_inplace(ctx, t35, t36); + // make sure some tensors are not reallocated by inserting new temporary nodes depending on them + int n_leafs_before = gb->n_leafs; int n_nodes_before = gb->n_nodes; - ggml_build_forward_expand(gb, dep); + struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); + // output tensors + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + // gradient tensors (will be set to zero by ggml_graph_reset) + for (int i = 0; i < gf->n_nodes; ++i) { + if (!gf->grads[i]) continue; + ggml_allocr_alloc(alloc, gf->grads[i]); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, gf->grads[i], one)); + } + for (int i = 0; i < checkpoints.size(); ++i) { + ggml_allocr_alloc(alloc, checkpoints[i]); + } + int n_leafs_after = gb->n_leafs; int n_nodes_after = gb->n_nodes; - GGML_ASSERT(n_nodes_after == n_nodes_before + 1); - ggml_allocr_reset(alloc); ggml_allocr_alloc_graph(alloc, gb); - // remove the additional node that was inserted - gb->nodes[n_nodes_after-1] = NULL; + // remove the additional nodes and leafs + for (int i = n_leafs_before; i < gb->n_leafs; ++i) { + gb->leafs[i] = NULL; + } + for (int i = n_nodes_before; i < gb->n_nodes; ++i) { + gb->nodes[i] = NULL; + } + gb->n_leafs = n_leafs_before; gb->n_nodes = n_nodes_before; } From d6c5b03858aff6d68f1549d64343b566c53b3830 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:08:19 +0200 Subject: [PATCH 049/235] fix ASSERT to work with zero layers --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 0583250598762..28fbd2dc84ea6 100644 --- 
a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2806,7 +2806,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); } // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); - GGML_ASSERT(chk_idx == -2); + GGML_ASSERT(n_check == 0 || chk_idx == -2); GGML_ASSERT(avail_begin == 0); clr_buf(0); use_buf(0); From 4ed096c6b086af80032d4b41138a4cc932bb3426 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:10:02 +0200 Subject: [PATCH 050/235] add training options whether to use allocator and/or unified training function --- .../train-text-from-scratch.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 28fbd2dc84ea6..15f60513f3be2 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3721,6 +3721,8 @@ struct train_params { bool use_flash; bool use_scratch; bool use_checkpointing; + bool use_alloc; + bool use_unified; // only adam int warmup; @@ -3782,6 +3784,8 @@ struct train_params get_default_train_params() { params.use_flash = true; params.use_scratch = true; params.use_checkpointing = true; + params.use_alloc = true; + params.use_unified = true; params.opt_past = 0; params.opt_delta = 1e-5f; @@ -3845,6 +3849,10 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --use-scratch Use scratch buffers. Implies use-flash. (default)\n"); fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); fprintf(stderr, " --use-checkpointing Use gradient checkpointing. Implies use-scratch and use-flash. (default)\n"); + fprintf(stderr, " --no-alloc Don't use allocator\n"); + fprintf(stderr, " --use-alloc Use allocator. Implies use-unified. (default)\n"); + fprintf(stderr, " --no-unified Don't use unified\n"); + fprintf(stderr, " --use-unified Use unified. (default)\n"); fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. 
Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); @@ -4010,6 +4018,14 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_checkpointing = false; } else if (arg == "--use-checkpointing") { params->use_checkpointing = true; + } else if (arg == "--no-alloc") { + params->use_alloc = false; + } else if (arg == "--use-alloc") { + params->use_alloc = true; + } else if (arg == "--no-unified") { + params->use_unified = false; + } else if (arg == "--use-unified") { + params->use_unified = true; } else if (arg == "--warmup") { if (++i >= argc) { invalid_param = true; From 865c4cd3c1ceceab3e7a4b537b03051befbbc6bc Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:12:58 +0200 Subject: [PATCH 051/235] integrate unified training function which may use memory allocator. the unified training function also supports arguments whether to use flash attention and/or gradient checkpointing --- .../train-text-from-scratch.cpp | 43 +++++++++++++++---- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 15f60513f3be2..a4b41e7fb8c44 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -4391,6 +4391,12 @@ int main(int argc, char ** argv) { uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; + ggml_allocr * alloc = NULL; + if (params.use_alloc) { + static const size_t tensor_alignment = 32; + alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); + } + GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector train_samples; train_samples.push_back(0); @@ -4437,33 +4443,48 @@ int main(int argc, char ** argv) { }; struct ggml_context * ctx0 = ggml_init(cparams); + ggml_set_no_alloc(ctx0, false); + + // don't use alloc for input tensors, so we can safely fill them with data struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); //struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + ggml_set_no_alloc(ctx0, (alloc != NULL)); + + if (alloc) { + ggml_allocr_reset(alloc); + } + opt_cb_data.tokens_input = tokens_input; opt_cb_data.target_logits = target_logits; opt_cb_data.target_probs = target_probs; int n_past = 0; - struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); - struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 
1 : 0)); - - memset(gfbuf->data, 0, ggml_nbytes(gfbuf)); - memset(gbbuf->data, 0, ggml_nbytes(gbbuf)); - - struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; - struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gb = ggml_new_graph(ctx0); + struct ggml_cgraph * gb_tmp = (params.use_unified || params.use_alloc) + ? ggml_new_graph(ctx0) + : NULL; GGML_ASSERT(n_past == 0); struct ggml_tensor * loss = NULL; struct ggml_tensor * logits = NULL; - if (params.use_checkpointing) { + if (params.use_alloc || params.use_unified) { + loss = llama_build_train_graphs( + &model, alloc, ctx0, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.use_flash, + params.use_checkpointing + ); + } else if (params.use_checkpointing) { loss = forward_batch_wo_cache_flash_attn_train_grad_checkpointing( &model, ctx0, gf, gb, @@ -4641,6 +4662,10 @@ int main(int argc, char ** argv) { } } + if (alloc) { + ggml_allocr_free(alloc); + } + delete[] compute_addr; delete[] compute_buf_0; delete[] compute_buf_1; From 3e99a8d65369ba2bdcc8eff1e6036fe11966cadc Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:15:09 +0200 Subject: [PATCH 052/235] format name of cloned tensors with " (clone)" suffix --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index a4b41e7fb8c44..7983b3bfab027 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1493,7 +1493,7 @@ struct ggml_tensor * ggml_recompute_graph_node( GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); - memcpy(clone->name, node->name, sizeof(node->name)); + ggml_format_name(clone, "%s (clone)", ggml_get_name(node)); return clone; }; From 75baed230cedf1929e93bcc006160016e2672a70 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:17:14 +0200 Subject: [PATCH 053/235] set names for tensors in unified train function for easier debugging --- .../train-text-from-scratch.cpp | 94 ++++++++++--------- 1 file changed, 52 insertions(+), 42 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7983b3bfab027..07982706330d7 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1572,9 +1572,19 @@ struct ggml_tensor * llama_build_train_graphs( const int n_ff = get_n_ff(&hparams); const int rope_mode = 0; + auto set_name = [](struct ggml_tensor * t, const char * n) { + ggml_set_name(t, n); + if (t->grad) { + ggml_format_name(t->grad, "%s->grad", n); + } + }; + + set_name(tokens_input, "tokens_input"); + set_name(targets, "targets"); + GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch); - struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); assert_shape_2d(t01, n_embd, N*n_batch); + struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); set_name(t00, "t00"); 
assert_shape_1d(t00, N*n_batch); + struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch); struct ggml_tensor * cur = t01; @@ -1591,53 +1601,53 @@ struct ggml_tensor * llama_build_train_graphs( for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; - struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); assert_shape_2d(t02, n_embd, N*n_batch); - struct ggml_tensor * t03 = ggml_repeat (ctx, layer.attention_norm, t02); assert_shape_2d(t03, n_embd, N*n_batch); - struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); assert_shape_2d(t04, n_embd, N*n_batch); - struct ggml_tensor * t05 = ggml_mul_mat (ctx, layer.wq, t04); assert_shape_2d(t05, n_embd, N*n_batch); - struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t08 = ggml_mul_mat (ctx, layer.wk, t04); assert_shape_2d(t08, n_embd, N*n_batch); - struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t10 = ggml_rope_inplace (ctx, t09, n_past, n_rot, rope_mode, n_ctx); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t11 = ggml_mul_mat (ctx, t04, layer.wv); assert_shape_2d(t11, N*n_batch, n_embd); - struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); + struct ggml_tensor * t03 = ggml_repeat (ctx, layer.attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); + struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); + struct ggml_tensor * t05 = ggml_mul_mat (ctx, layer.wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); + struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx); set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t08 = ggml_mul_mat (ctx, layer.wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch); + struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t10 = ggml_rope_inplace (ctx, t09, n_past, n_rot, rope_mode, n_ctx); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t11 = ggml_mul_mat (ctx, t04, layer.wv); set_name(t11, "t11"); assert_shape_2d(t11, 
N*n_batch, n_embd); + struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); struct ggml_tensor * t16; if (enable_flash_attn) { - t16 = ggml_flash_attn(ctx, t13, t14, t15, true); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); } else { - struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); assert_shape_4d(t16_0, N, N, n_head, n_batch); - struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); assert_shape_4d(t16_1, N, N, n_head, n_batch); - struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); assert_shape_4d(t16_2, N, N, n_head, n_batch); - struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); assert_shape_4d(t16_3, N, N, n_head, n_batch); - t16 = ggml_mul_mat(ctx, t15, t16_3); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); + struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); + struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); + struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); + t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); } - struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t18 = ggml_cont (ctx, t17); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); assert_shape_2d(t19, n_embd, N*n_batch); - struct ggml_tensor * t20 = ggml_mul_mat (ctx, layer.wo, t19); assert_shape_2d(t20, n_embd, N*n_batch); - struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); assert_shape_2d(t21, n_embd, N*n_batch); - struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); assert_shape_2d(t22, n_embd, N*n_batch); - struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); assert_shape_2d(t23, n_embd, N*n_batch); - struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); assert_shape_2d(t26, n_ff, N*n_batch); - struct ggml_tensor * t27 = ggml_silu (ctx, t26); assert_shape_2d(t27, n_ff, N*n_batch); - struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); assert_shape_2d(t29, n_embd, N*n_batch); - struct 
ggml_tensor * t30 = ggml_add (ctx, t29, t21); assert_shape_2d(t30, n_embd, N*n_batch); + struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); + struct ggml_tensor * t20 = ggml_mul_mat (ctx, layer.wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); + struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); + struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); + struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); + struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); + struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); cur = t30; checkpoints.push_back(cur); } - struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = ggml_repeat (ctx, model->norm, t31); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); assert_shape_2d(t33, n_embd, N*n_batch); - struct ggml_tensor * t34 = ggml_mul_mat (ctx, model->output, t33); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); assert_shape_1d(t36, 1); + struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = ggml_repeat (ctx, model->norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); + struct ggml_tensor * t34 = ggml_mul_mat (ctx, model->output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); checkpoints.push_back(t31); checkpoints.push_back(t32); From fe788a1c7a7bbaf286b12b99a3df75dad4c7403b Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:24:13 +0200 Subject: [PATCH 054/235] allocate graph on context using 
ggml_new_graph --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 07982706330d7..eb3ac9ac314d7 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -4641,9 +4641,7 @@ int main(int argc, char ** argv) { }; struct ggml_context * ctx0 = ggml_init(cparams); - struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); - memset(gfbuf->data, 0, ggml_nbytes(gfbuf)); - struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; + struct ggml_cgraph * gf = ggml_new_graph(ctx0); int n_past = 0; struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); From c954f41ca43e0c0d6e8f1225d9e598722e3a1dff Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:27:01 +0200 Subject: [PATCH 055/235] remove handwritten training functions --- .../train-text-from-scratch.cpp | 1586 +---------------- 1 file changed, 8 insertions(+), 1578 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index eb3ac9ac314d7..9d94bdfcf6984 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -946,406 +946,6 @@ struct ggml_tensor * forward_batch( return inpL; } -struct ggml_tensor * forward_batch_wo_cache( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_batch) { - - const int n_past = 0; - const int N = n_tokens; - - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * tokens = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); - - // inpL shape [n_embd,N*n_batch,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, 
ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - // Vcur shape [N, n_batch, n_embd/n_head, n_head] - struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); - assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); - - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Q shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // K shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * K = - ggml_permute(ctx0, - Kcur, - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); - - // K * Q - // KQ shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - assert_shape_4d(KQ, N, N, n_head, n_batch); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - assert_shape_4d(KQ_scaled, N, N, n_head, n_batch); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - assert_shape_4d(KQ_masked, N, N, n_head, n_batch); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch); - - // Vcur shape [N, n_batch, n_embd/n_head, n_head] - // V shape [N, n_embd/n_head, n_head, n_batch] - struct ggml_tensor * V = - ggml_permute(ctx0, - Vcur, - 0, 3, 1, 2); - assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); - - // KQV shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - // KQV_merged shape - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - - // projection (no bias) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N*n_batch,1,1] - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // tmp shape [n_ff,N*n_batch,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - // cur shape 
[n_ff,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_add_inplace(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - // inpL shape [n_embd,N*n_batch,1,1] - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // norm - { - - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N*n_batch,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - // inpL shape [n_vocab,N,n_batch,1] - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - // ggml_build_forward_expand(gf, inpL); - - return inpL; -} - -struct ggml_tensor * forward_batch_wo_cache_flash_attn( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_batch) { - - const int n_past = 0; - const int N = n_tokens; - - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * tokens = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); - - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // norm - { - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); - assert_shape_4d(Vcur, N, 
n_batch, n_embd/n_head, n_head); - - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * K = - ggml_permute(ctx0, - Kcur, - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * V = - ggml_permute(ctx0, - Vcur, - 0, 3, 1, 2); - assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); - - bool masked = true; - struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, masked); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // cur = ggml_add_inplace(ctx0, cur, inpFF); - cur = ggml_add(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // norm - { - - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // lm_head - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - // ggml_build_forward_expand(gf, inpL); - - return inpL; -} - - static size_t hash(void * p) { return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; } @@ -1703,1146 +1303,6 @@ struct ggml_tensor * llama_build_train_graphs( return t36; } - -// expand the graph nodes without creating leafs. 
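Editor's note: "leafs" are ggml's leaf tensors, i.e. parameters and inputs with op == GGML_OP_NONE. Unlike ggml_build_forward_expand, the expand() helper below registers every reachable tensor as a graph node and defers leaf classification to graph_set_leafs_grads(), which both removed training functions call once the forward and backward graphs are fully built. A minimal usage sketch, assuming gf->n_nodes and gf->n_leafs were reset to 0 beforehand, as those functions do:

    expand(gf, t36);            // depth-first over t36->src[], appends every unvisited tensor as a node
    graph_set_leafs_grads(gf);  // afterwards, moves nodes with op == GGML_OP_NONE and grad == NULL into gf->leafs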
-struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { - // check if already visited - for (int i = 0; i < g->n_nodes; i++) { - if (g->nodes[i] == t) { - return t; - } - } - - for (int i = 0; i < g->n_leafs; i++) { - if (g->leafs[i] == t) { - return t; - } - } - - for (int i = 0; i < GGML_MAX_SRC; ++i) { - if (t->src[i]) { - expand(g, t->src[i]); - } - } - - GGML_ASSERT(g->n_nodes < GGML_MAX_NODES); - - if (strlen(t->name) == 0) { - snprintf(t->name, sizeof(t->name), "node_%d", g->n_nodes); - } - - g->nodes[g->n_nodes] = t; - g->grads[g->n_nodes] = t->grad; - g->n_nodes++; - return t; -} - -void graph_set_leafs_grads(struct ggml_cgraph * g) { - // moves leaf nodes to g->leafs. - // i.e. g->n_nodes might change. - int n_nodes = 0; - for (int i = 0; i < g->n_nodes; ++i) { - struct ggml_tensor * node = g->nodes[i]; - const bool is_leaf = node->op == GGML_OP_NONE && node->grad == NULL; - if (is_leaf) { - GGML_ASSERT(g->n_leafs < GGML_MAX_NODES); - - if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "leaf_%d", g->n_leafs); - } - - g->leafs[g->n_leafs] = node; - g->n_leafs++; - } else { - GGML_ASSERT(n_nodes < GGML_MAX_NODES); - - if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "node_%d", n_nodes); - } - - g->nodes[n_nodes] = node; - g->grads[n_nodes] = node->grad; - n_nodes++; - } - } - for (int i=n_nodes; i < g->n_nodes; ++i) { - g->nodes[i] = NULL; - g->grads[i] = NULL; - } - g->n_nodes = n_nodes; -} - -struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_tensor * * logits, - struct ggml_tensor * tokens_input, - struct ggml_tensor * targets, - void * compute_buf_0, - void * compute_buf_1, - size_t size_buf_0, - size_t size_buf_1, - const int n_tokens, - const int n_batch) { - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - const int n_past = 0; - const int N = n_tokens; - - gf->n_nodes = 0; - gf->n_leafs = 0; - gf->perf_runs = 0; - gf->perf_cycles = 0; - gf->perf_time_us = 0; - - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - const int rope_mode = 0; - - bool track_max_mem = true; - - int last_buf = -1; - size_t buf_offs[2] = { 0, 0 }; - size_t buf_size[2] = { size_buf_0, - size_buf_1 }; - void * buf_data[2] = { compute_buf_0, - compute_buf_1 }; - size_t buf_maxs[2] = { 0, 0 }; - - auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - if (buf >= 0) { - size_t offs = buf_offs[buf]; - size_t size = buf_size[buf]; - void * data = buf_data[buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); - } - last_buf = buf; - }; - - - auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { - if (buf < 0) return; - if (track_max_mem) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - 
}
-        buf_offs[buf] = 0;
-        if (track_max_mem && last_buf >= 0) {
-            size_t offs = buf_offs[last_buf];
-            size_t size = buf_size[last_buf];
-            void * data = buf_data[last_buf];
-            ggml_set_scratch(ctx0, { offs, size, data, });
-        }
-    };
-
-
-    auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
-        int64_t ne0 = n_embd/n_head;
-        int64_t ne1 = N;
-        int64_t ne2 = n_head;
-        int64_t ne3 = n_batch;
-        size_t nb0 = ggml_element_size(t);
-        size_t nb1 = nb0*ne0;
-        size_t nb2 = nb1*ne1;
-        size_t nb3 = nb2*ne2;
-        size_t offset = 0;
-        return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset);
-    };
-
-    auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
-        int64_t ne0 = n_embd/n_head;
-        int64_t ne1 = N;
-        int64_t ne2 = n_head;
-        int64_t ne3 = n_batch;
-        size_t nb0 = ggml_element_size(t);
-        size_t nb1 = nb0*ne0;
-        size_t nb2 = nb1*ne1;
-        size_t nb3 = nb2*ne2;
-        size_t offset = nb3*ne3;
-        return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset);
-    };
-
-    auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * {
-        int64_t ne0 = N;
-        int64_t ne1 = n_embd/n_head;
-        int64_t ne2 = n_head;
-        int64_t ne3 = n_batch;
-        size_t nb0 = ggml_element_size(t);
-        size_t nb1 = nb0*ne0;
-        size_t nb2 = nb1*ne1;
-        size_t nb3 = nb2*ne2;
-        size_t offset = 2*nb3*ne3;
-        return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset);
-    };
-
-    auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * {
-        if (a == NULL) {
-            return b;
-        } else {
-            return ggml_add_inplace(ctx0, a, b);
-        }
-    };
-
-    use_buf(-1);
-
-    model->tok_embeddings->grad = NULL;
-    model->norm->grad = NULL;
-    model->output->grad = NULL;
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        layer.attention_norm->grad = NULL;
-        layer.wq->grad = NULL;
-        layer.wk->grad = NULL;
-        layer.wv->grad = NULL;
-        layer.wo->grad = NULL;
-        layer.ffn_norm->grad = NULL;
-        layer.w1->grad = NULL;
-        layer.w2->grad = NULL;
-        layer.w3->grad = NULL;
-    }
-
-    clr_buf(0);
-    clr_buf(1);
-
-    use_buf(-1);
-
-    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
-    struct ggml_tensor * t00 = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch);
-
-    use_buf(-1);
-
-    struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch);
-
-    // need to remember these for the backward pass
-    std::vector<struct ggml_tensor *> t02L; t02L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t03L; t03L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t04L; t04L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t05L; t05L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t06L; t06L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t07L; t07L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t08L; t08L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t09L; t09L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t10L; t10L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t11L; t11L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t12L; t12L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t13L; t13L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t14L; t14L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t15L; t15L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t16L; t16L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t17L; t17L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t18L; t18L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t19L; t19L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t20L; t20L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t21L; t21L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t22L; t22L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t23L; t23L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t24L; t24L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t25L; t25L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t26L; t26L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t27L; t27L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t28L; t28L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t29L; t29L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t30L; t30L.resize(n_layer, NULL);
-
-    struct ggml_tensor * cur = t01;
-
-    for (int il = 0; il < n_layer; ++il) {
-        clr_buf(0);
-        struct my_llama_layer & layer = model->layers[il];
-        // tensors with values necessary for the backward pass are in persistent buf(-1)
-        // other tensors with buf(0) and buf(1) are only temporarily needed; their memory is reused after the layer is completed.
-        use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm     (ctx0, cur, rms_norm_eps));                    assert_shape_2d(t02, n_embd, N*n_batch);
-        use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat       (ctx0, layer.attention_norm, t02));            assert_shape_2d(t03, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul          (ctx0, t02, t03));                             assert_shape_2d(t04, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat      (ctx0, layer.wq, t04));                        assert_shape_2d(t05, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d   (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat      (ctx0, layer.wk, t04));                        assert_shape_2d(t08, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d   (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat      (ctx0, t04, layer.wv));                        assert_shape_2d(t11, N*n_batch, n_embd);
-        use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d   (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
-        use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute      (ctx0, t07, 0, 2, 1, 3));                      assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
-        use_buf(-1); struct ggml_tensor * t14 = expand(gf, ggml_permute      (ctx0, t10, 0, 2, 1, 3));                      assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch);
-        use_buf(-1); struct ggml_tensor * t15 = expand(gf, ggml_permute      (ctx0, t12, 0, 3, 1, 2));                      assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
-        use_buf(-1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn   (ctx0, t13, t14, t15, true));                  assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
-        use_buf( 0); struct ggml_tensor * t17 = expand(gf, ggml_permute      (ctx0, t16, 0, 2, 1, 3));                      assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t18 = expand(gf, ggml_cont         (ctx0, t17));                                  assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch);
-        use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d   (ctx0, t18, n_embd, N*n_batch));               assert_shape_2d(t19, n_embd, N*n_batch);
-        use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat      (ctx0, layer.wo, t19));                        assert_shape_2d(t20, n_embd, 
N*n_batch); - use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf( 0); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); - t02L[il] = t02; - t03L[il] = t03; - t04L[il] = t04; - t05L[il] = t05; - t06L[il] = t06; - t07L[il] = t07; - t08L[il] = t08; - t09L[il] = t09; - t10L[il] = t10; - t11L[il] = t11; - t12L[il] = t12; - t13L[il] = t13; - t14L[il] = t14; - t15L[il] = t15; - t16L[il] = t16; - t17L[il] = t17; - t18L[il] = t18; - t19L[il] = t19; - t20L[il] = t20; - t21L[il] = t21; - t22L[il] = t22; - t23L[il] = t23; - t24L[il] = t24; - t25L[il] = t25; - t26L[il] = t26; - t27L[il] = t27; - t28L[il] = t28; - t29L[il] = t29; - t30L[il] = t30; - - cur = t30; - } - clr_buf(0); - use_buf(0); - struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); - use_buf(-1); - struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); - - { - /* - tok_embeddings | grad_tok_embeddings = ggml_get_rows_back(grad_t01, t00) - L0_att_norm | grad_L0_att_norm = ggml_repeat_back(grad_t03L0, L0_att_norm.shape) - L0_wq | grad_L0_wq = ggml_out_prod(t04L0, grad_t05L0) - L0_wk | grad_L0_wk = ggml_out_prod(t04L0, grad_t08L0) - L0_wv | grad_L0_wv = ggml_out_prod(t04L0, ggml_transpose(grad_t11L0)) - L0_wo | grad_L0_wo = ggml_out_prod(t19L0, grad_t20L0) - L0_ffn_norm | grad_L0_ffn_norm = ggml_repeat_back(grad_t23L0, L0_ffn_norm.shape) - L0_w1 | grad_L0_w1 = ggml_out_prod(t24L0, grad_t26L0) - L0_w2 | grad_L0_w2 = ggml_out_prod(t28L0, grad_t29L0) - L0_w3 | grad_L0_w3 = ggml_out_prod(t24L0, grad_t25L0) - L1_att_norm | grad_L1_att_norm = ggml_repeat_back(grad_t03L1, L1_att_norm.shape) - L1_wq | grad_L1_wq = ggml_out_prod(t04L1, grad_t05L1) - L1_wk | grad_L1_wk = ggml_out_prod(t04L1, grad_t08L1) - L1_wv | grad_L1_wv = ggml_out_prod(t04L1, ggml_transpose(grad_t11L1)) - L1_wo | grad_L1_wo = ggml_out_prod(t19L1, 
grad_t20L1) - L1_ffn_norm | grad_L1_ffn_norm = ggml_repeat_back(grad_t23L1, L1_ffn_norm.shape) - L1_w1 | grad_L1_w1 = ggml_out_prod(t24L1, grad_t26L1) - L1_w2 | grad_L1_w2 = ggml_out_prod(t28L1, grad_t29L1) - L1_w3 | grad_L1_w3 = ggml_out_prod(t24L1, grad_t25L1) - norm | grad_norm = ggml_repeat_back(grad_t32, norm.shape) - output | grad_output = ggml_out_prod(t33, grad_t34) - | - t01 = ggml_get_rows(tok_embeddings, t00) | grad_t01 = grad_t21L0 + ggml_rms_norm_back(t01, grad_t02L0) - for layer: | - t02L0*= ggml_rms_norm (t01) | grad_t02L0 = ggml_mul(grad_t04L0, t03L0) - t03L0 = ggml_repeat (L0_att_norm, t02L0_shape) | grad_t03L0 = ggml_mul(grad_t04L0, t02L0) - t04L0*= ggml_mul (t02L0, t03L0) | grad_t04L0 = ggml_out_prod(L0_wv, grad_t11L0) + ggml_out_prod(L0_wk, ggml_transpose(grad_t08L0)) + ggml_out_prod(L0_wq, ggml_transpose(grad_t05L0)) - t05L0 = ggml_mul_mat (L0_wq, t04L0) | grad_t05L0 = ggml_reshape(grad_t06L0, t05L0_shape) - t06L0 = ggml_reshape_4d (t05L0, n_embd/n_head, n_head, N, n_batch) | grad_t06L0 = ggml_rope_back(grad_t07L0) - t07L0 = ggml_rope_inplace (t06L0) | grad_t07L0 = ggml_permute_back(grad_t13L0, 0, 2, 1, 3) = ggml_permute(grad_t13L0, 0, 2, 1, 3) - t08L0 = ggml_mul_mat (L0_wk, t04L0) | grad_t08L0 = ggml_reshape(grad_t09L0, t08L0_shape) - t09L0 = ggml_reshape_4d (t08L0, n_embd/n_head, n_head, N, n_batch) | grad_t09L0 = ggml_rope_back(grad_t10L0) - t10L0 = ggml_rope_inplace (t09L0) | grad_t10L0 = ggml_permute_back(grad_t14L0, 0, 2, 1, 3) = ggml_permute(grad_t14L0, 0, 2, 1, 3) - t11L0 = ggml_mul_mat (t04L0, L0_wv) | grad_t11L0 = ggml_reshape(grad_t12L0, t11L0_shape) - t12L0 = ggml_reshape_4d (t11L0, N, n_batch, n_embd/n_head, n_head) | grad_t12L0 = ggml_permute_back(grad_t15L0, 0, 3, 1, 2) = ggml_permute(grad_t15L0, 0, 2, 3, 1) - t13L0*= ggml_permute (t07L0, 0, 2, 1, 3) | grad_t13L0 = view__q(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t14L0*= ggml_permute (t10L0, 0, 2, 1, 3) | grad_t14L0 = view__k(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t15L0*= ggml_permute (t12L0, 0, 3, 1, 2) | grad_t15L0 = view__v(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t16L0 = ggml_flash_attn (t13L0, t14L0, t15L0) | grad_t16L0 = ggml_permute_back(grad_t17L0, 0, 2, 1, 3) = ggml_permute(grad_t17L0, 0, 2, 1, 3) - t17L0 = ggml_permute (t16L0, 0, 2, 1, 3) | grad_t17L0 = grad_t18L0 - t18L0 = ggml_cont (t17L0) | grad_t18L0 = ggml_reshape(grad_t19L0, t18L0_shape) - t19L0*= ggml_reshape_2d (t18L0, n_embd, N*n_batch) | grad_t19L0 = ggml_out_prod(L0_wo, ggml_transpose(grad_t20L0)) - t20L0 = ggml_mul_mat (L0_wo, t19L0) | grad_t20L0 = grad_t21L0 - t21L0*= ggml_add (t20L0, t01) | grad_t21L0 = grad_t30L0 + ggml_rms_norm_back(t21L0, grad_t22L0) - t22L0*= ggml_rms_norm (t21L0) | grad_t22L0 = ggml_mul(grad_t24L0, t23L0) - t23L0 = ggml_repeat (L0_ffn_norm, t22L0_shape) | grad_t23L0 = ggml_mul(grad_t24L0, t22L0) - t24L0*= ggml_mul (t23L0, t22L0) | grad_t24L0 = ggml_out_prod(L0_w1, ggml_transpose(grad_t26L0)) + ggml_out_prod(L0_w3, ggml_transpose(grad_t25L0)) - t25L0*= ggml_mul_mat (L0_w3, t24L0) | grad_t25L0 = ggml_mul(grad_t28L0, t27L0) - t26L0*= ggml_mul_mat (L0_w1, t24L0) | grad_t26L0 = ggml_silu_back(t26L0, grad_t27L0) - t27L0*= ggml_silu (t26L0) | grad_t27L0 = ggml_mul(grad_t28L0, t25L0) - t28L0*= ggml_mul (t27L0, t25L0) | grad_t28L0 = ggml_out_prod(L0_w2, ggml_transpose(grad_t29L0)) - t29L0 = ggml_mul_mat (L0_w2, t28L0) | grad_t29L0 = grad_t30L0 - t30L0*= ggml_add (t21L0, t29L0) | grad_t30L0 = ggml_rms_norm_back(t30L0, grad_t02L1) + grad_t21L1 - ^ - t02L1*= ggml_rms_norm 
(t30L0) | grad_t02L1 = ggml_mul(grad_t04L1, t03L1) - t03L1 = ggml_repeat (L1_att_norm, t02L1_shape) | grad_t03L1 = ggml_mul(grad_t04L1, t02L1) - t04L1*= ggml_mul (t02L1, t03L1) | grad_t04L1 = ggml_out_prod(L1_wv, grad_t11L1) + ggml_out_prod(L1_wk, ggml_transpose(grad_t08L1)) + ggml_out_prod(L1_wq, ggml_transpose(grad_t05L1)) - t05L1 = ggml_mul_mat (L1_wq, t04L1) | grad_t05L1 = ggml_reshape(grad_t06L1, t05L1_shape) - t06L1 = ggml_reshape_4d (t05L1, n_embd/n_head, n_head, N, n_batch) | grad_t06L1 = ggml_rope_back(grad_t07L1) - t07L1 = ggml_rope_inplace (t06L1) | grad_t07L1 = ggml_permute_back(grad_t13L1, 0, 2, 1, 3) = ggml_permute(grad_t13L1, 0, 2, 1, 3) - t08L1 = ggml_mul_mat (L1_wk, t04L1) | grad_t08L1 = ggml_reshape(grad_t09L1, t08L1_shape) - t09L1 = ggml_reshape_4d (t08L1, n_embd/n_head, n_head, N, n_batch) | grad_t09L1 = ggml_rope_back(grad_t10L1) - t10L1 = ggml_rope_inplace (t09L1) | grad_t10L1 = ggml_permute_back(grad_t14L1, 0, 2, 1, 3) = ggml_permute(grad_t14L1, 0, 2, 1, 3) - t11L1 = ggml_mul_mat (t04L1, L1_wv) | grad_t11L1 = ggml_reshape(grad_t12L1, t11L1_shape) - t12L1 = ggml_reshape_4d (t11L1, N, n_batch, n_embd/n_head, n_head) | grad_t12L1 = ggml_permute_back(grad_t15L1, 0, 3, 1, 2) = ggml_permute(grad_t15L1, 0, 2, 3, 1) - t13L1*= ggml_permute (t07L1, 0, 2, 1, 3) | grad_t13L1 = view__q(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t14L1*= ggml_permute (t10L1, 0, 2, 1, 3) | grad_t14L1 = view__k(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t15L1*= ggml_permute (t12L1, 0, 3, 1, 2) | grad_t15L1 = view__v(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t16L1 = ggml_flash_attn (t13L1, t14L1, t15L1) | grad_t16L1 = ggml_permute_back(grad_t17L1, 0, 2, 1, 3) = ggml_permute(grad_t17L1, 0, 2, 1, 3) - t17L1 = ggml_permute (t16L1, 0, 2, 1, 3) | grad_t17L1 = grad_t18L1 - t18L1 = ggml_cont (t17L1) | grad_t18L1 = ggml_reshape(grad_t19L1, t18L1_shape) - t19L1*= ggml_reshape_2d (t18L1, n_embd, N*n_batch) | grad_t19L1 = ggml_out_prod(L1_wo, ggml_transpose(grad_t20L1)) - t20L1 = ggml_mul_mat (L1_wo, t19L1) | grad_t20L1 = grad_t21L1 - t21L1*= ggml_add (t20L1, t30L0) | grad_t21L1 = grad_t30L1 + ggml_rms_norm_back(t21L1, grad_t22L1) - t22L1*= ggml_rms_norm (t21L1) | grad_t22L1 = ggml_mul(grad_t24L1, t23L1) - t23L1 = ggml_repeat (L1_ffn_norm, t22L1_shape) | grad_t23L1 = ggml_mul(grad_t24L1, t22L1) - t24L1*= ggml_mul (t23L1, t22L1) | grad_t24L1 = ggml_out_prod(L1_w1, ggml_transpose(grad_t26L1)) + ggml_out_prod(L1_w3, ggml_transpose(grad_t25L1)) - t25L1*= ggml_mul_mat (L1_w3, t24L1) | grad_t25L1 = ggml_mul(grad_t28L1, t27L1) - t26L1*= ggml_mul_mat (L1_w1, t24L1) | grad_t26L1 = ggml_silu_back(t26L1, grad_t27L1) - t27L1*= ggml_silu (t26L1) | grad_t27L1 = ggml_mul(grad_t28L1, t25L1) - t28L1*= ggml_mul (t27L1, t25L1) | grad_t28L1 = ggml_out_prod(L1_w2, ggml_transpose(grad_t29L1)) - t29L1 = ggml_mul_mat (L1_w2, t28L1) | grad_t29L1 = grad_t30L1 - t30L1*= ggml_add (t21L1, t29L1) | grad_t30L1 = ggml_rms_norm_back(t30L1, grad_t31) - ^ - t31 = ggml_rms_norm (t30L1) | grad_t31 = ggml_mul(grad_t33, t32) - t32 = ggml_repeat (norm, t31.shape) | grad_t32 = ggml_mul(grad_t33, t31) - t33 = ggml_mul (t32, t31) | grad_t33 = ggml_out_prod(output, ggml_transpose(grad_t34)) - t34 = ggml_mul_mat (output, t33) | grad_t34 = ggml_reshape(grad_t35, t34.shape) - t35 = ggml_reshape_3d (t34, n_vocab, N, n_batch) | grad_t35 = ggml_cross_entropy_loss_back(t35, targets, grad_t36) - t36 = ggml_cross_entropy_loss(t35, targets) | grad_t36 = 1 (optimizer) - tensors marked with * need to be stored until grad 
computation - tensors during grad computation are all temporary - */ - } - - *gb = *gf; - - // t36->grad gets set to one by optimizer, so we need the tensor. - // initialize it with 1.0f to make sure. - use_buf(-1); - t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); - - use_buf(0); - t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); - t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); - t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); - t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); - - use_buf(-1); - - model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); - model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); - - clr_buf(1); - use_buf(1); - t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); - - struct ggml_tensor * back_layer_inp = t31; - struct ggml_tensor * grad_layer_inp = NULL; - - for (int k = 0; k < n_layer; ++k) { - int il = n_layer-1-k; - struct my_llama_layer & layer = model->layers[il]; - - struct ggml_tensor * t02 = t02L[il]; - struct ggml_tensor * t03 = t03L[il]; - struct ggml_tensor * t04 = t04L[il]; - struct ggml_tensor * t05 = t05L[il]; - struct ggml_tensor * t06 = t06L[il]; - struct ggml_tensor * t07 = t07L[il]; - struct ggml_tensor * t08 = t08L[il]; - struct ggml_tensor * t09 = t09L[il]; - struct ggml_tensor * t10 = t10L[il]; - struct ggml_tensor * t11 = t11L[il]; - struct ggml_tensor * t12 = t12L[il]; - struct ggml_tensor * t13 = t13L[il]; - struct ggml_tensor * t14 = t14L[il]; - struct ggml_tensor * t15 = t15L[il]; - struct ggml_tensor * t16 = t16L[il]; - struct ggml_tensor * t17 = t17L[il]; - struct ggml_tensor * t18 = t18L[il]; - struct ggml_tensor * t19 = t19L[il]; - struct ggml_tensor * t20 = t20L[il]; - struct ggml_tensor * t21 = t21L[il]; - struct ggml_tensor * t22 = t22L[il]; - struct ggml_tensor * t23 = t23L[il]; - struct ggml_tensor * t24 = t24L[il]; - struct ggml_tensor * t25 = t25L[il]; - struct ggml_tensor * t26 = t26L[il]; - struct ggml_tensor * t27 = t27L[il]; - struct ggml_tensor * t28 = t28L[il]; - struct ggml_tensor * t29 = t29L[il]; - struct ggml_tensor * t30 = t30L[il]; - - clr_buf(0); - use_buf(0); - t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad, rms_norm_eps)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - if (grad_layer_inp) { - t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - } - clr_buf(1); - t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); - t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); - t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); - t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); - t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); - t24->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.w1, 
ggml_transpose(ctx0, t26->grad)), - ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); - t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); - t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); - use_buf(1); - t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad, rms_norm_eps))); assert_shape_2d(t21->grad, n_embd, N*n_batch); - grad_layer_inp = t21; - use_buf(0); - t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); - t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); - t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); - t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); - t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); - t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); - t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); - t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); - t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); - t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); - t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); - t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); - t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); - t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); - t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); - t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); - t04->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.wv, t11->grad), - ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), - ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); - t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); - use_buf(1); - t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); - back_layer_inp = t02; - // use_buf(0); - - use_buf(-1); - layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, 
layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); - layer.wq->grad = expand(gb, add_or_set(layer.wq->grad, ggml_out_prod(ctx0, t04, t05->grad))); assert_shape_2d(layer.wq->grad, n_embd, n_embd); - layer.wk->grad = expand(gb, add_or_set(layer.wk->grad, ggml_out_prod(ctx0, t04, t08->grad))); assert_shape_2d(layer.wk->grad, n_embd, n_embd); - layer.wv->grad = expand(gb, add_or_set(layer.wv->grad, ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd); - layer.wo->grad = expand(gb, add_or_set(layer.wo->grad, ggml_out_prod(ctx0, t19, t20->grad))); assert_shape_2d(layer.wo->grad, n_embd, n_embd); - layer.ffn_norm->grad = expand(gb, add_or_set(layer.ffn_norm->grad, ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm))); assert_shape_1d(layer.ffn_norm->grad, n_embd); - layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); - layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); - layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); - // use_buf(0); - } - clr_buf(0); - use_buf(0); - t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad, rms_norm_eps))); assert_shape_2d(t01->grad, n_embd, N*n_batch); - use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); - // clr_buf(1); - // clr_buf(0); - - *logits = t35; - - clr_buf(0); - clr_buf(1); - - if (track_max_mem) { - printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); - printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); - } - - // now that all grads are created, set the graph leafs and grads - graph_set_leafs_grads(gf); - graph_set_leafs_grads(gb); - - return t36; -} - -struct ggml_tensor * forward_batch_wo_cache_flash_attn_train_grad_checkpointing( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_tensor * * logits, - struct ggml_tensor * tokens_input, - struct ggml_tensor * targets, - void * compute_buf_0, - void * compute_buf_1, - void * compute_buf_2, - size_t size_buf_0, - size_t size_buf_1, - size_t size_buf_2, - const int n_tokens, - const int n_batch) { - - // implements gradient-checkpointing as explained in readme of https://github.com/cybertronai/gradient-checkpointing - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - - const int n_past = 0; - const int N = n_tokens; - - gf->n_nodes = 0; - gf->n_leafs = 0; - gf->perf_runs = 0; - gf->perf_cycles = 0; - gf->perf_time_us = 0; - - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - const int rope_mode = 0; - - bool track_max_mem = true; - - int last_buf = -1; - size_t buf_offs[3] = { 0, 0, 0 }; - size_t buf_size[3] = { size_buf_0, - size_buf_1, - size_buf_2 }; - void * buf_data[3] = { compute_buf_0, - compute_buf_1, - compute_buf_2 }; - size_t buf_maxs[3] = { 0, 0, 0 }; - - auto use_buf = [ctx0, &last_buf, 
&buf_offs, &buf_size, &buf_data, &buf_maxs] (int buf) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - if (buf >= 0) { - size_t offs = buf_offs[buf]; - size_t size = buf_size[buf]; - void * data = buf_data[buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); - } - last_buf = buf; - }; - - - auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { - if (buf < 0) return; - if (track_max_mem) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - } - buf_offs[buf] = 0; - if (track_max_mem && last_buf >= 0) { - size_t offs = buf_offs[last_buf]; - size_t size = buf_size[last_buf]; - void * data = buf_data[last_buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); - } - }; - - - auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = n_embd/n_head; - int64_t ne1 = N; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = 0; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = n_embd/n_head; - int64_t ne1 = N; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = nb3*ne3; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = N; - int64_t ne1 = n_embd/n_head; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = 2*nb3*ne3; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * { - if (a == NULL) { - return b; - } else { - return ggml_add_inplace(ctx0, a, b); - } - }; - - use_buf(-1); - - model->tok_embeddings->grad = NULL; - model->norm->grad = NULL; - model->output->grad = NULL; - - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - layer.attention_norm->grad = NULL; - layer.wq->grad = NULL; - layer.wk->grad = NULL; - layer.wv->grad = NULL; - layer.wo->grad = NULL; - layer.ffn_norm->grad = NULL; - layer.w1->grad = NULL; - layer.w2->grad = NULL; - layer.w3->grad = NULL; - } - - clr_buf(0); - clr_buf(1); - clr_buf(2); - - use_buf(-1); - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - struct ggml_tensor * t00 = ggml_reshape_1d(ctx0, tokens_input, N*n_batch); assert_shape_1d(t00, N*n_batch); - - use_buf(-1); - - struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); - - - { - // given: n, u, v - // objective: minimize(a*u+b*v) where a*b=n, a>0, b>0 - // b=n/a - // minimize(a*u+v*n/a) - // diff(a*u+v*n/a, a) = u - (v*n/a)/a - // diff(a*u+v*n/a, a) == 0 - // u - (v*n/a)/a 
== 0
-        // u == v*n/(a*a)
-        // u*a*a = v*n
-        // a*a = v*n/u
-        // a = sqrt(n*v/u)
-    }
-
-    float memcost_checkpoint  = n_embd; // (..)*N*n_batch
-    float memcost_snd_fwd_pass = 14*n_embd+4*n_ff; // (..)*N*n_batch
-
-    int n_checkstep = (int)(sqrtf(n_layer*memcost_checkpoint/memcost_snd_fwd_pass) + 0.5f);
-    if (n_checkstep < 1) {
-        n_checkstep = 1;
-    }
-    std::vector<int> checkpoints;
-    for (int chk = n_checkstep-1; chk+1 < n_layer; chk += n_checkstep) {
-        checkpoints.push_back(chk);
-    }
-    int n_check = checkpoints.size();
-    // printf("%s: n_check = %d n_checkstep = %d\n", __func__, n_check, n_checkstep);
-
-    // for (int i = 0; i < n_check; ++i) {
-    //     printf("%s: checkpoint #%d = %d\n", __func__, i, checkpoints[i]);
-    // }
-
-    // example for 16 layers and memcost_checkpoint=memcost_snd_fwd_pass:
-    // inp ~ implicit zeroth checkpoint == input
-    // L00 f 4b [
-    // L01 f 4b 4th second forward pass
-    // L02 f 4b
-    // L03 fc4b ] first checkpoint
-    // L04 f 3b [
-    // L05 f 3b 3rd second forward pass
-    // L06 f 3b
-    // L07 fc3b ] second checkpoint
-    // L08 f 2b [
-    // L09 f 2b 2nd second forward pass
-    // L10 f 2b
-    // L11 fc2b ] third checkpoint
-    // L12 f 1b [
-    // L13 f 1b 1st second forward pass
-    // L14 f 1b
-    // L15 f 1b ]
-
-    // need to remember these for the backward pass
-    std::vector<struct ggml_tensor *> t02L; t02L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t03L; t03L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t04L; t04L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t05L; t05L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t06L; t06L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t07L; t07L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t08L; t08L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t09L; t09L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t10L; t10L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t11L; t11L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t12L; t12L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t13L; t13L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t14L; t14L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t15L; t15L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t16L; t16L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t17L; t17L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t18L; t18L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t19L; t19L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t20L; t20L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t21L; t21L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t22L; t22L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t23L; t23L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t24L; t24L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t25L; t25L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t26L; t26L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t27L; t27L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t28L; t28L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t29L; t29L.resize(n_layer, NULL);
-    std::vector<struct ggml_tensor *> t30L; t30L.resize(n_layer, NULL);
-
-    struct ggml_tensor * cur = t01;
-
-    int chk_idx = 0;
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        // tensors with values necessary for the backward pass are in persistent buf(-1)
-        // other tensors with buf(0), buf(1), etc are only temporarily needed; their memory is reused
-        bool is_checkpoint = (chk_idx < n_check && il == checkpoints[chk_idx]);
-        if (is_checkpoint) {
-            // printf("%s: layer %d is_checkpoint\n", __func__, il);
-            chk_idx += 1;
-        }
-        const int prs = 0; // in the first forward pass even persistent tensors are only temporary
-        const int tmp = 0; // temporary
-        // nxt is required to compute the next layer.
-        // for checkpoints we need to remember this for usage in the backward pass,
-        // otherwise it is temporary until the next of this kind
-        const int nxt = is_checkpoint ? 
-1 : 1; - clr_buf(0); - use_buf(prs); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(prs); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(prs); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); - use_buf(tmp); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor 
* t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf(tmp); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - clr_buf( 1); - use_buf(nxt); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); - - // only t30L is remembered for checkpointing in first forward pass - if (is_checkpoint) { - t30L[il] = t30; - } - cur = t30; - } - clr_buf(0); - use_buf(0); - struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); - use_buf(-1); - struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); - - *gb = *gf; - - // t36->grad gets set to one by optimizer, so we need the tensor. - // initialize it with 1.0f to make sure. - use_buf(-1); - t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); - - use_buf(0); - t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); - t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); - t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); - t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); - - use_buf(-1); - - model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); - model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); - - clr_buf(1); - use_buf(1); - t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); - - struct ggml_tensor * back_layer_inp = t31; - struct ggml_tensor * grad_layer_inp = NULL; - - // printf("%s: n_check = %u\n", __func__, n_check); - chk_idx = n_check-1; - int avail_begin = n_layer; - int avail_end = n_layer; - // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); - for (int k = 0; k < n_layer; ++k) { - // second forward pass for checkpointing - int il = n_layer-1-k; - if (il < avail_begin) { - // make sure, that txxL[il] is available - // forward pass from last checkpoint - GGML_ASSERT(chk_idx >= -1); - int begin = (chk_idx == -1) - ? 0 - : checkpoints[chk_idx] + 1; // checkpoint[chk_idx] contains t30 for computing following layers -> +1 - int end = (chk_idx+1 < n_check) - ? 
(checkpoints[chk_idx+1] + 1) - : n_layer; - GGML_ASSERT(begin <= il); - GGML_ASSERT(il < end); - cur = (chk_idx == -1) ? t01 : t30L[checkpoints[chk_idx]]; - clr_buf(2); - // printf("%s: second forward pass chk_idx=%d begin=%d end=%d\n", __func__, chk_idx, begin, end); - for (int i = begin; i < end; ++i) { - struct my_llama_layer & layer = model->layers[i]; - const int prs = 2; // persistent until next checkpoint - const int tmp = 0; // temporary for this layer - const bool is_checkpoint = (i == end-1); - clr_buf(0); - use_buf(prs); struct ggml_tensor * t02 = expand(gb, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t03 = expand(gb, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t04 = expand(gb, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t05 = expand(gb, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t06 = expand(gb, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t07 = expand(gb, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t08 = expand(gb, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t09 = expand(gb, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t10 = expand(gb, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t11 = expand(gb, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(prs); struct ggml_tensor * t12 = expand(gb, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(prs); struct ggml_tensor * t13 = expand(gb, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t14 = expand(gb, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t15 = expand(gb, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); - use_buf(prs); struct ggml_tensor * t16 = expand(gb, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); - use_buf(tmp); struct ggml_tensor * t17 = expand(gb, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t18 = expand(gb, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(prs); struct ggml_tensor * t19 = expand(gb, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t20 = expand(gb, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t21 = expand(gb, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t22 = expand(gb, 
ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf(tmp); struct ggml_tensor * t23 = expand(gb, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t24 = expand(gb, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(prs); struct ggml_tensor * t25 = expand(gb, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t26 = expand(gb, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t27 = expand(gb, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(prs); struct ggml_tensor * t28 = expand(gb, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf(tmp); struct ggml_tensor * t29 = expand(gb, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - if (t30L[i] == NULL) { - use_buf(prs); struct ggml_tensor * t30 = expand(gb, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); - t30L[i] = t30; - cur = t30; - } - t02L[i] = t02; - t03L[i] = t03; - t04L[i] = t04; - t05L[i] = t05; - t06L[i] = t06; - t07L[i] = t07; - t08L[i] = t08; - t09L[i] = t09; - t10L[i] = t10; - t11L[i] = t11; - t12L[i] = t12; - t13L[i] = t13; - t14L[i] = t14; - t15L[i] = t15; - t16L[i] = t16; - t17L[i] = t17; - t18L[i] = t18; - t19L[i] = t19; - t20L[i] = t20; - t21L[i] = t21; - t22L[i] = t22; - t23L[i] = t23; - t24L[i] = t24; - t25L[i] = t25; - t26L[i] = t26; - t27L[i] = t27; - t28L[i] = t28; - t29L[i] = t29; - } - --chk_idx; - avail_begin = begin; - avail_end = end; - // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); - } - // printf("%s: backward pass il=%d\n", __func__, il); - - struct my_llama_layer & layer = model->layers[il]; - - struct ggml_tensor * t02 = t02L[il]; - struct ggml_tensor * t03 = t03L[il]; - struct ggml_tensor * t04 = t04L[il]; - struct ggml_tensor * t05 = t05L[il]; - struct ggml_tensor * t06 = t06L[il]; - struct ggml_tensor * t07 = t07L[il]; - struct ggml_tensor * t08 = t08L[il]; - struct ggml_tensor * t09 = t09L[il]; - struct ggml_tensor * t10 = t10L[il]; - struct ggml_tensor * t11 = t11L[il]; - struct ggml_tensor * t12 = t12L[il]; - struct ggml_tensor * t13 = t13L[il]; - struct ggml_tensor * t14 = t14L[il]; - struct ggml_tensor * t15 = t15L[il]; - struct ggml_tensor * t16 = t16L[il]; - struct ggml_tensor * t17 = t17L[il]; - struct ggml_tensor * t18 = t18L[il]; - struct ggml_tensor * t19 = t19L[il]; - struct ggml_tensor * t20 = t20L[il]; - struct ggml_tensor * t21 = t21L[il]; - struct ggml_tensor * t22 = t22L[il]; - struct ggml_tensor * t23 = t23L[il]; - struct ggml_tensor * t24 = t24L[il]; - struct ggml_tensor * t25 = t25L[il]; - struct ggml_tensor * t26 = t26L[il]; - struct ggml_tensor * t27 = t27L[il]; - struct ggml_tensor * t28 = t28L[il]; - struct ggml_tensor * t29 = t29L[il]; - struct ggml_tensor * t30 = t30L[il]; - - clr_buf(0); - use_buf(0); - t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad, rms_norm_eps)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - if (grad_layer_inp) { - t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - } - clr_buf(1); - t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); - t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); 
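// Editor's note: the ggml_out_prod calls in this backward pass all follow one pattern,
// inferred from the surrounding lines rather than stated by the patch: for
// y = ggml_mul_mat(ctx0, W, x), the removed code builds
//     x->grad = ggml_out_prod(ctx0, W, ggml_transpose(ctx0, y->grad)); // gradient w.r.t. the input x
//     W->grad = ggml_out_prod(ctx0, x, y->grad);                       // gradient w.r.t. the weight W
// so ggml_out_prod serves as the outer-product backward of ggml_mul_mat; t28->grad above and
// layer.w2->grad further down form one such pair for t29 = ggml_mul_mat(ctx0, layer.w2, t28).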
assert_shape_2d(t28->grad, n_ff, N*n_batch); - t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); - t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); - t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); - t24->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)), - ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); - t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); - t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); - use_buf(1); - t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad, rms_norm_eps))); assert_shape_2d(t21->grad, n_embd, N*n_batch); - grad_layer_inp = t21; - use_buf(0); - t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); - t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); - t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); - t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); - t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); - t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); - t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); - t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); - t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); - t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); - t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); - t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); - t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); - t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); - t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); - t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); - t04->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.wv, t11->grad), - ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), - ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); - t03->grad = 
expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); - use_buf(1); - t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); - back_layer_inp = t02; - - use_buf(-1); - layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); - layer.wq->grad = expand(gb, add_or_set(layer.wq->grad, ggml_out_prod(ctx0, t04, t05->grad))); assert_shape_2d(layer.wq->grad, n_embd, n_embd); - layer.wk->grad = expand(gb, add_or_set(layer.wk->grad, ggml_out_prod(ctx0, t04, t08->grad))); assert_shape_2d(layer.wk->grad, n_embd, n_embd); - layer.wv->grad = expand(gb, add_or_set(layer.wv->grad, ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd); - layer.wo->grad = expand(gb, add_or_set(layer.wo->grad, ggml_out_prod(ctx0, t19, t20->grad))); assert_shape_2d(layer.wo->grad, n_embd, n_embd); - layer.ffn_norm->grad = expand(gb, add_or_set(layer.ffn_norm->grad, ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm))); assert_shape_1d(layer.ffn_norm->grad, n_embd); - layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); - layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); - layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); - } - // printf("%s: chk_idx=%d avail_begin=%d avail_end=%d\n", __func__, chk_idx, avail_begin, avail_end); - GGML_ASSERT(n_check == 0 || chk_idx == -2); - GGML_ASSERT(avail_begin == 0); - clr_buf(0); - use_buf(0); - t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad, rms_norm_eps))); assert_shape_2d(t01->grad, n_embd, N*n_batch); - use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); - - *logits = t35; - - clr_buf(0); - clr_buf(1); - clr_buf(2); - - if (track_max_mem) { - printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); - printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); - printf("%s: max size compute buf2: %zu\n", __func__, buf_maxs[2]); - } - - // now that all grads are created, set the graph leafs and grads - graph_set_leafs_grads(gf); - graph_set_leafs_grads(gb); - - return t36; -} - void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); *ptr = value; @@ -4485,44 +2945,14 @@ int main(int argc, char ** argv) { struct ggml_tensor * loss = NULL; struct ggml_tensor * logits = NULL; - if (params.use_alloc || params.use_unified) { - loss = llama_build_train_graphs( - &model, alloc, ctx0, - gf, gb, gb_tmp, - &logits, tokens_input, target_probs, - n_tokens, n_batch, - params.use_flash, - params.use_checkpointing - ); - } else if (params.use_checkpointing) { - loss = forward_batch_wo_cache_flash_attn_train_grad_checkpointing( - &model, ctx0, - gf, gb, - &logits, tokens_input, target_probs, - compute_buf_0, compute_buf_1, compute_buf_2, - size_buf_0, size_buf_1, 
size_buf_2, - n_tokens, n_batch); - } else if (params.use_scratch) { - loss = forward_batch_wo_cache_flash_attn_train( - &model, ctx0, - gf, gb, - &logits, tokens_input, target_probs, - compute_buf_0, compute_buf_1, - size_buf_0, size_buf_1, - n_tokens, n_batch); - } else if (params.use_flash) { - logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch); - loss = cross_entropy_loss(ctx0, logits, target_probs); - ggml_build_forward_expand(gf, loss); - *gb = *gf; - ggml_build_backward_expand(ctx0, gf, gb, true); - } else { - logits = forward_batch_wo_cache(&model, ctx0, gf, tokens_input, n_tokens, n_batch); - loss = cross_entropy_loss(ctx0, logits, target_probs); - ggml_build_forward_expand(gf, loss); - *gb = *gf; - ggml_build_backward_expand(ctx0, gf, gb, true); - } + loss = llama_build_train_graphs( + &model, alloc, ctx0, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.use_flash, + params.use_checkpointing + ); size_t used_mem_before_opt = ggml_used_mem(ctx0); From 271e4d64b56fc4cb10535f473b5ee65ebe063441 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:31:59 +0200 Subject: [PATCH 056/235] remove unused training parameters "use_scratch" and "use_unified" --- .../train-text-from-scratch.cpp | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 9d94bdfcf6984..b6d6db4b80df1 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2189,10 +2189,8 @@ struct train_params { bool samples_start_after_nl; bool use_adam; bool use_flash; - bool use_scratch; bool use_checkpointing; bool use_alloc; - bool use_unified; // only adam int warmup; @@ -2252,10 +2250,8 @@ struct train_params get_default_train_params() { params.samples_start_after_nl = false; params.use_adam = true; params.use_flash = true; - params.use_scratch = true; params.use_checkpointing = true; params.use_alloc = true; - params.use_unified = true; params.opt_past = 0; params.opt_delta = 1e-5f; @@ -2313,16 +2309,12 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off"); fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); - fprintf(stderr, " --no-flash Don't use flash attention. Implies no-scratch and no-checkpointing.\n"); + fprintf(stderr, " --no-flash Don't use flash attention \n"); fprintf(stderr, " --use-flash Use flash attention (default)\n"); - fprintf(stderr, " --no-scratch Don't use scratch buffers. Implies no-checkpointing.\n"); - fprintf(stderr, " --use-scratch Use scratch buffers. Implies use-flash. (default)\n"); fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); - fprintf(stderr, " --use-checkpointing Use gradient checkpointing. Implies use-scratch and use-flash. (default)\n"); + fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n"); fprintf(stderr, " --no-alloc Don't use allocator\n"); - fprintf(stderr, " --use-alloc Use allocator. Implies use-unified. (default)\n"); - fprintf(stderr, " --no-unified Don't use unified\n"); - fprintf(stderr, " --use-unified Use unified. 
(default)\n"); + fprintf(stderr, " --use-alloc Use allocator (default)\n"); fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); @@ -2480,10 +2472,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_flash = false; } else if (arg == "--use-flash") { params->use_flash = true; - } else if (arg == "--no-scratch") { - params->use_scratch = false; - } else if (arg == "--use-scratch") { - params->use_scratch = true; } else if (arg == "--no-checkpointing") { params->use_checkpointing = false; } else if (arg == "--use-checkpointing") { @@ -2492,10 +2480,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_alloc = false; } else if (arg == "--use-alloc") { params->use_alloc = true; - } else if (arg == "--no-unified") { - params->use_unified = false; - } else if (arg == "--use-unified") { - params->use_unified = true; } else if (arg == "--warmup") { if (++i >= argc) { invalid_param = true; @@ -2936,7 +2920,7 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_cgraph * gb = ggml_new_graph(ctx0); - struct ggml_cgraph * gb_tmp = (params.use_unified || params.use_alloc) + struct ggml_cgraph * gb_tmp = params.use_alloc ? ggml_new_graph(ctx0) : NULL; From 6f161c784b94c0ab6ae84f96257983378abe1a70 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:33:27 +0200 Subject: [PATCH 057/235] remove trailing whitespace --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index b6d6db4b80df1..5d4a1c2c25af1 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2934,7 +2934,7 @@ int main(int argc, char ** argv) { gf, gb, gb_tmp, &logits, tokens_input, target_probs, n_tokens, n_batch, - params.use_flash, + params.use_flash, params.use_checkpointing ); From 3794dceb7f164e6688f4895448cbfdee04164d20 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 18:44:42 +0200 Subject: [PATCH 058/235] remove unused train params: mem_compute1_gb & mem_compute2_gb mem_compute_gb is used for compute when automatic memory allocator is not enabled, otherwise it can be very small to only hold the tensor definitions mem_compute0_gb is used for automatic memory allocator (as long as measurement of max required size is not implemented) --- .../train-text-from-scratch.cpp | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 5d4a1c2c25af1..e7b43bf9ac036 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2217,9 +2217,6 @@ struct train_params { int mem_model_gb; int mem_compute_gb; int mem_compute0_gb; - int mem_compute1_gb; - int mem_compute2_gb; - int mem_compute3_gb; }; struct train_params 
get_default_train_params() { @@ -2278,8 +2275,6 @@ struct train_params get_default_train_params() { params.mem_model_gb = 2; params.mem_compute_gb = 24; params.mem_compute0_gb = 8; - params.mem_compute1_gb = 1; - params.mem_compute2_gb = 2; return params; } @@ -2336,9 +2331,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); - fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); - fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); - fprintf(stderr, " --mem-compute2 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute2_gb); + fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb); fprintf(stderr, "\n"); } @@ -2604,18 +2597,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->mem_compute0_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute1") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute1_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute2") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute2_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -2839,11 +2820,7 @@ int main(int argc, char ** argv) { uint8_t * compute_addr = new uint8_t[compute_size]; size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); - size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); - size_t size_buf_2 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute2_gb); uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; - uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; - uint8_t * compute_buf_2 = new uint8_t[size_buf_2]; ggml_allocr * alloc = NULL; if (params.use_alloc) { @@ -3090,7 +3067,6 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; - delete[] compute_buf_1; ggml_free(model.ctx); llama_free(lctx); llama_free_model(lmodel); From 6e280b24dcf5d751e12b8a0b3ced7dc41c589e2d Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 19:02:12 +0200 Subject: [PATCH 059/235] remove unused forward_batch function --- .../train-text-from-scratch.cpp | 290 ------------------ 1 file changed, 290 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index e7b43bf9ac036..94a2a766d1c8b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -656,296 +656,6 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 GGML_ASSERT(tensor->ne[3] == ne3); } -struct ggml_tensor * forward_batch( - struct my_llama_model * model, - struct my_llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int 
n_past, - const int n_batch) { - - const int N = n_tokens; - - struct my_llama_kv_cache& kv_self = *cache; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); - memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); - - struct ggml_tensor * kc = kv_self.k; - struct ggml_tensor * vc = kv_self.v; - - // inpL shape [n_embd,N*n_batch,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, n_ctx); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - // wv shape [n_embd, n_embd, 1, 1] - // Vcur shape [N, n_embd, n_batch, 1] - struct ggml_tensor * Vcur = ggml_cont(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_mul_mat(ctx0, - model->layers[il].wv, - cur), - n_embd, N, n_batch), - 1, 0, 2, 3)); - assert_shape_3d(Vcur, N, n_embd, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] - // k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il] - // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il] - - /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - - // important: storing RoPE-ed version of K in the KV cache! 
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } //*/ - - kc = ggml_set_2d_inplace(ctx0, kc, - ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch), - ggml_element_size(kc)*n_embd*n_ctx, - (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past)); - vc = ggml_set_2d_inplace(ctx0, vc, - ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch), - ggml_element_size(vc)*n_ctx*n_embd, - ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx)); - - assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer); - assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer); - } - - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Q shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // K shape [n_embd/n_head, n_past + N, n_head, n_batch] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_4d(ctx0, - ggml_view_3d(ctx0, - kc, - n_embd, - (n_past + N), - n_batch, - n_embd*ggml_element_size(kc), - n_ctx*n_embd*ggml_element_size(kc), - il*n_batch*n_ctx*n_embd*ggml_element_size(kc)), - n_embd/n_head, n_head, n_past + N, n_batch), - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch); - - // K * Q - // KQ shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - assert_shape_4d(KQ, n_past + N, N, n_head, n_batch); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); - - // split cached V into n_head heads - // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] - // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il] - struct ggml_tensor * V = - ggml_view_4d(ctx0, vc, - n_past + N, n_embd/n_head, n_head, n_batch, - ggml_element_size(vc)*n_ctx, - ggml_element_size(vc)*n_ctx*n_embd/n_head, - ggml_element_size(vc)*n_ctx*n_embd, - il*n_batch*n_ctx*n_embd*ggml_element_size(vc)); - assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch); - - // KQV shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - // KQV_merged shape - - // cur = KQV_merged.contiguous().view(n_embd, N) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - // cur = ggml_cpy(ctx0, - // KQV_merged, - // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, 
N)); - - // projection (no bias) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N*n_batch,1,1] - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // tmp shape [n_ff,N*n_batch,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_add_inplace(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - // inpL shape [n_embd,N*n_batch,1,1] - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // norm - { - - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N*n_batch,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - // inpL shape [n_vocab,N,n_batch,1] - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; -} - static size_t hash(void * p) { return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; } From faf3e21eaf8ac4667a233fc7d03e8cb29477183a Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 20:50:09 +0200 Subject: [PATCH 060/235] add debug asserts in ggml_allocr_alloc to some common pitfalls when using this function directly --- ggml-alloc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml-alloc.c b/ggml-alloc.c index 5e1be61ff6cef..ddf973daec7e4 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -104,6 +104,10 @@ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct g } void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { +#ifdef GGML_ALLOCATOR_DEBUG + GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources + GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated +#endif size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size = aligned_offset(NULL, size, alloc->alignment); From 098654c27760f837bddf6b114d0d3e53788e7043 Mon Sep 17 00:00:00 2001 From: 
xaedes Date: Mon, 14 Aug 2023 20:56:56 +0200 Subject: [PATCH 061/235] only use ggml_allocr_alloc when tensor has NULL data and is no view --- .../train-text-from-scratch/train-text-from-scratch.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 94a2a766d1c8b..a30291a1c7169 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -986,11 +986,15 @@ struct ggml_tensor * llama_build_train_graphs( // gradient tensors (will be set to zero by ggml_graph_reset) for (int i = 0; i < gf->n_nodes; ++i) { if (!gf->grads[i]) continue; - ggml_allocr_alloc(alloc, gf->grads[i]); + if (gf->grads[i]->data == NULL && !ggml_is_view(gf->grads[i])) { + ggml_allocr_alloc(alloc, gf->grads[i]); + } ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, gf->grads[i], one)); } for (int i = 0; i < checkpoints.size(); ++i) { - ggml_allocr_alloc(alloc, checkpoints[i]); + if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { + ggml_allocr_alloc(alloc, checkpoints[i]); + } } int n_leafs_after = gb->n_leafs; From 3e6468b0976dbfe329b9f9511e770f8ad4092fe1 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 20:56:03 +0200 Subject: [PATCH 062/235] fix test when to create temporary backward graph temporary backward graph is only necessary when using checkpointing --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index a30291a1c7169..11754ffd9442b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -2611,7 +2611,7 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_cgraph * gb = ggml_new_graph(ctx0); - struct ggml_cgraph * gb_tmp = params.use_alloc + struct ggml_cgraph * gb_tmp = params.use_checkpointing ? ggml_new_graph(ctx0) : NULL; From 56228461c83010c1ee848c8e6a88bc8e23a576ca Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 21:12:02 +0200 Subject: [PATCH 063/235] fix memory "leak" in optimizers each iteration a new cplan with new memory for work data was allocated. now cplan creation only happens at the start of optimization, with each iteration reusing the cplan and its work data. 
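The pattern this patch introduces, as a standalone sketch: plan the backward graph once, attach a single work buffer, and reuse both across optimizer iterations. Only `ggml_graph_plan`, `ggml_graph_compute`, and the `ggml_cplan` fields come from ggml; the function and its arguments are illustrative, and the sketch assumes the graph topology (and hence `cplan.work_size`) stays fixed for the whole optimization run, which is what makes the reuse in this patch valid.

```cpp
#include "ggml.h"
#include <cstdint>
#include <vector>

// hedged sketch of the cplan-reuse pattern from this patch
void run_iterations(struct ggml_cgraph * gb, int n_threads, int n_iter) {
    // plan once: work_size depends only on the graph and the thread count
    struct ggml_cplan cplan = ggml_graph_plan(gb, n_threads);

    // allocate the work buffer once (the patch carves it out of the ggml
    // context with ggml_new_object; a plain vector serves for the sketch)
    std::vector<uint8_t> work_data(cplan.work_size);
    cplan.work_data = work_data.data();

    for (int iter = 0; iter < n_iter; ++iter) {
        // before this patch, ggml_graph_compute_with_ctx() re-planned and
        // re-allocated the work data here on every iteration
        ggml_graph_compute(gb, &cplan);
    }
}
```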
--- ggml.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index 47f912683cd50..142d5965a1c8b 100644 --- a/ggml.c +++ b/ggml.c @@ -17368,7 +17368,10 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + ggml_graph_compute(gb, &cplan); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17455,7 +17458,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); + ggml_graph_compute(gb, &cplan); const float fx = ggml_get_f32_1d(f, 0); opt->loss_after = fx; @@ -17528,7 +17531,6 @@ struct ggml_lbfgs_iteration_data { }; static enum ggml_opt_result linesearch_backtracking( - struct ggml_context * ctx, const struct ggml_opt_params * params, int nx, float * x, @@ -17540,6 +17542,7 @@ static enum ggml_opt_result linesearch_backtracking( struct ggml_tensor * f, struct ggml_cgraph * gf, struct ggml_cgraph * gb, + struct ggml_cplan * cplan, const int np, struct ggml_tensor * ps[], ggml_opt_callback callback, @@ -17588,7 +17591,7 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params->n_threads); + ggml_graph_compute(gb, cplan); ggml_opt_get_grad(np, ps, g); @@ -17682,6 +17685,10 @@ static enum ggml_opt_result ggml_opt_lbfgs( opt->iter = iter; } + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + float * x = opt->lbfgs.x->data; // current parameters + float * xp = opt->lbfgs.xp->data; // previous parameters + float * g = opt->lbfgs.g->data; // current gradient @@ -17716,7 +17723,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); + ggml_graph_compute(gb, &cplan); ggml_opt_get_grad(np, ps, g); @@ -17778,7 +17785,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps, callback, callback_data); + ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data); if (ls < 0) { // linesearch failed - go back to the previous point and return From 3b5515bbe0e2224425986ba24f1f5d84aa38dce9 Mon Sep 17 00:00:00 2001 From: xaedes Date: Mon, 14 Aug 2023 22:09:36 +0200 Subject: [PATCH 064/235] reverse order of for loop in ggml_build_backward_expand to save memory when using gradient checkpointing and allocator. with this loop order, gradient checkpointing with allocator on a 16 layer model saves 13% memory; on a 2 layer model it saves 2% memory.
the computation results are the same --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 142d5965a1c8b..79098a2fccb38 100644 --- a/ggml.c +++ b/ggml.c @@ -15809,7 +15809,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * } } - for (int i = gf->n_nodes - 1; i >= 0; i--) { + for (int i = 0; i < gf->n_nodes; i++) { struct ggml_tensor * node = gf->nodes[i]; if (node->is_param) { From 316b0707f48aeeca900d8505cbe7064b81af7881 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 6 Aug 2023 17:28:22 +0200 Subject: [PATCH 065/235] add API functions to access llama model tensors --- llama.cpp | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ llama.h | 26 ++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/llama.cpp b/llama.cpp index 839739870eb3e..c5112e5bab1f0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4147,6 +4147,10 @@ int llama_n_embd_from_model(const struct llama_model * model) { return model->hparams.n_embd; } +int llama_n_layer_from_model(const struct llama_model * model) { + return model->hparams.n_layer; +} + int llama_n_vocab(const struct llama_context * ctx) { return ctx->model.vocab.id_to_token.size(); } @@ -4159,6 +4163,10 @@ int llama_n_embd(const struct llama_context * ctx) { return ctx->model.hparams.n_embd; } +int llama_n_layer(const struct llama_context * ctx) { + return ctx->model.hparams.n_layer; +} + int llama_get_vocab_from_model( const struct llama_model * model, const char * * strings, @@ -4180,6 +4188,70 @@ int llama_get_vocab( return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity); } +struct llama_layer * llama_get_layer_from_model( + const struct llama_model * model, + int layer_idx) { + if (layer_idx < 0 || layer_idx >= model->hparams.n_layer) { + return NULL; + } else { + return &model->layers[layer_idx]; + } +} + +struct llama_layer * llama_get_layer( + const struct llama_context * ctx, + int layer_idx) { + return llama_get_layer_from_model(&ctx->model, layer_idx); +} + +struct ggml_tensor * llama_get_model_tok_embeddings(const struct llama_model * model) { + return model->tok_embeddings; +} + +struct ggml_tensor * llama_get_model_norm(const struct llama_model * model) { + return model->norm; +} + +struct ggml_tensor * llama_get_model_output(const struct llama_model * model) { + return model->output; +} + +struct ggml_tensor * llama_get_layer_attention_norm(const struct llama_layer * layer) { + return layer->attention_norm; +} + +struct ggml_tensor * llama_get_layer_wq(const struct llama_layer * layer) { + return layer->wq; +} + +struct ggml_tensor * llama_get_layer_wk(const struct llama_layer * layer) { + return layer->wk; +} + +struct ggml_tensor * llama_get_layer_wv(const struct llama_layer * layer) { + return layer->wv; +} + +struct ggml_tensor * llama_get_layer_wo(const struct llama_layer * layer) { + return layer->wo; +} + +struct ggml_tensor * llama_get_layer_ffn_norm(const struct llama_layer * layer) { + return layer->ffn_norm; +} + +struct ggml_tensor * llama_get_layer_w1(const struct llama_layer * layer) { + return layer->w1; +} + +struct ggml_tensor * llama_get_layer_w2(const struct llama_layer * layer) { + return layer->w2; +} + +struct ggml_tensor * llama_get_layer_w3(const struct llama_layer * layer) { + return layer->w3; +} + float * llama_get_logits(struct llama_context * ctx) { return ctx->logits.data(); } diff --git a/llama.h b/llama.h index fa1977f2d9492..647f9abdc0fc4 100644 --- a/llama.h +++ b/llama.h @@ -69,6 +69,7 @@ extern 
"C" { struct llama_model; struct llama_context; + struct llama_layer; typedef int llama_token; @@ -329,10 +330,12 @@ extern "C" { LLAMA_API int llama_n_vocab(const struct llama_context * ctx); LLAMA_API int llama_n_ctx (const struct llama_context * ctx); LLAMA_API int llama_n_embd (const struct llama_context * ctx); + LLAMA_API int llama_n_layer(const struct llama_context * ctx); LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); + LLAMA_API int llama_n_layer_from_model(const struct llama_model * model); // Get the vocabulary as output parameters. // Returns number of results. @@ -348,6 +351,29 @@ extern "C" { float * scores, int capacity); + // Get a llama layer + LLAMA_API struct llama_layer * llama_get_layer( + const struct llama_context * ctx, + int layer); + + LLAMA_API struct llama_layer * llama_get_layer_from_model( + const struct llama_model * model, + int layer); + + LLAMA_API struct ggml_tensor * llama_get_model_tok_embeddings(const struct llama_model * model); + LLAMA_API struct ggml_tensor * llama_get_model_norm (const struct llama_model * model); + LLAMA_API struct ggml_tensor * llama_get_model_output (const struct llama_model * model); + + LLAMA_API struct ggml_tensor * llama_get_layer_attention_norm(const struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_wq (const struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_wk (const struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_wv (const struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_wo (const struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_ffn_norm (const struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_w1 (const struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_w2 (const struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_w3 (const struct llama_layer * layer); + // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row // Can be mutated in order to change the probabilities of the next token From 5e059ace25606e47bbba7f887300cea82d4bf4c2 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 15 Aug 2023 13:54:28 +0200 Subject: [PATCH 066/235] add stub example for finetuning, based on train-text-from-scratch --- examples/CMakeLists.txt | 1 + examples/finetune/CMakeLists.txt | 5 + examples/finetune/README.md | 22 + examples/finetune/finetune.cpp | 2788 ++++++++++++++++++++++++++++++ 4 files changed, 2816 insertions(+) create mode 100644 examples/finetune/CMakeLists.txt create mode 100644 examples/finetune/README.md create mode 100644 examples/finetune/finetune.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a7b26776ad355..bc5e3905e8bfe 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -42,6 +42,7 @@ else() add_subdirectory(benchmark) add_subdirectory(baby-llama) add_subdirectory(train-text-from-scratch) + add_subdirectory(finetune) add_subdirectory(simple) add_subdirectory(embd-input) if (LLAMA_METAL) diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt new file mode 100644 index 0000000000000..2b52d21cfb381 --- /dev/null +++ b/examples/finetune/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET finetune) 
+add_executable(${TARGET} finetune.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/finetune/README.md b/examples/finetune/README.md new file mode 100644 index 0000000000000..726ec47c0ce4f --- /dev/null +++ b/examples/finetune/README.md @@ -0,0 +1,22 @@ +# train-text-from-scratch + +Basic usage instructions: + +```bash +# get training data +wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt + +# train +./bin/train-text-from-scratch \ + --vocab-model ../models/ggml-vocab.bin \ + --ctx 64 --embd 256 --head 8 --layer 16 \ + --checkpoint-in chk-shakespeare-256x16.bin \ + --checkpoint-out chk-shakespeare-256x16.bin \ + --model-out ggml-shakespeare-256x16-f32.bin \ + --train-data "shakespeare.txt" \ + -t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \ + --print-details-interval 0 --predict 16 --use-flash + +# predict +./bin/main -m ggml-shakespeare-256x16-f32.bin +``` diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp new file mode 100644 index 0000000000000..11754ffd9442b --- /dev/null +++ b/examples/finetune/finetune.cpp @@ -0,0 +1,2788 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include "llama.h" +#include <unordered_map> +#include <vector> +#include <cassert> +#include <climits> +#include <cstring> +#include <cstdarg> +#include <ctime> +#include <random> +#include <stdexcept> +#include <algorithm> +#include <string> + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; + +struct random_normal_distribution { + std::mt19937 gen; + std::normal_distribution<float> rd; + float min; + float max; +}; + +struct random_uniform_distribution { + std::mt19937 gen; + std::uniform_real_distribution<float> rd; +}; + +void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { + rnd->gen = std::mt19937(seed); + rnd->rd = std::normal_distribution<float>{mean, std}; + rnd->min = min; + rnd->max = max; +} + +void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { + rnd->gen = std::mt19937(seed); + rnd->rd = std::uniform_real_distribution<float>{min, max}; +} + +int clamp(const int v, const int min, const int max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +float fclamp(const float v, const float min, const float max) { + return ((v < min) ? (min) : (v > max) ?
(max) : v); +} + +float frand() { + return (float)rand()/(float)RAND_MAX; +} + +float frand_normal(struct random_normal_distribution * rnd) { + return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); +} + +float frand_uniform(struct random_uniform_distribution * rnd) { + return rnd->rd(rnd->gen); +} + +void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { + float scale = 1.0f; // xavier + switch (tensor->n_dims) { + case 1: + scale /= sqrtf(tensor->ne[0]); + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = scale * frand_normal(rnd); + } + break; + case 2: + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = scale * frand_normal(rnd); + } + } + break; + case 3: + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = scale * frand_normal(rnd); + } + } + } + break; + case 4: + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = scale * frand_normal(rnd); + } + } + } + } + break; + default: + assert(false); + }; + return tensor; +} + +struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { + switch (tensor->n_dims) { + case 1: + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = frand_uniform(rnd); + } + break; + case 2: + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = frand_uniform(rnd); + } + } + break; + case 3: + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = frand_uniform(rnd); + } + } + } + break; + case 4: + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = frand_uniform(rnd); + } + } + } + } + break; + default: + assert(false); + }; + return tensor; +} + +struct llama_vocab { + using id = int32_t; + using token = std::string; + + struct token_score { + token tok; + float score; + }; + 
std::unordered_map<token, id> token_to_id; + std::vector<token_score> id_to_token; +}; + +struct my_llama_hparams { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + + bool operator!=(const my_llama_hparams& other) const { + return memcmp(this, &other, sizeof(my_llama_hparams)); + } +}; + +struct my_llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + +struct my_llama_kv_cache { + struct ggml_context * ctx = NULL; + + struct ggml_tensor * k; + struct ggml_tensor * v; + + // llama_ctx_buffer buf; + + int n; // number of tokens currently in the cache +}; + +struct my_llama_model { + struct ggml_context * ctx = NULL; + + my_llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + std::vector<my_llama_layer> layers; + + uint32_t train_its = 0; + uint32_t train_samples = 0; + uint32_t train_tokens = 0; +}; + +uint32_t get_n_ff(const struct my_llama_hparams* hparams) { + const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; + return n_ff; +} + +void print_params(struct my_llama_hparams * params) { + printf("%s: n_vocab: %d\n", __func__, params->n_vocab); + printf("%s: n_ctx: %d\n", __func__, params->n_ctx); + printf("%s: n_embd: %d\n", __func__, params->n_embd); + printf("%s: n_mult: %d\n", __func__, params->n_mult); + printf("%s: n_head: %d\n", __func__, params->n_head); + printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_layer: %d\n", __func__, params->n_layer); + printf("%s: n_rot: %d\n", __func__, params->n_rot); +} + +void init_model(struct my_llama_model * model) { + const auto & hparams = model->hparams; + + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + + const uint32_t n_ff = get_n_ff(&hparams); + + struct ggml_context * ctx = model->ctx; + + model->train_its = 0; + model->train_samples = 0; + model->train_tokens = 0; + + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); + model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); + + ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); + ggml_set_name(model->norm, "norm.weight"); + ggml_set_name(model->output, "output.weight"); + + model->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + std::string layers_i = "layers."
+ std::to_string(i); + + layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + + layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); + layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + + ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); + + ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); + ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); + ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); + ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str()); + + ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); + + ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str()); + ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str()); + ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str()); + } +} + +void set_param_model(struct my_llama_model * model) { + const auto& hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct ggml_context* ctx = model->ctx; + + ggml_set_param(ctx, model->tok_embeddings); + ggml_set_param(ctx, model->norm); + ggml_set_param(ctx, model->output); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + ggml_set_param(ctx, layer.attention_norm); + ggml_set_param(ctx, layer.wq); + ggml_set_param(ctx, layer.wk); + ggml_set_param(ctx, layer.wv); + ggml_set_param(ctx, layer.wo); + ggml_set_param(ctx, layer.ffn_norm); + ggml_set_param(ctx, layer.w1); + ggml_set_param(ctx, layer.w2); + ggml_set_param(ctx, layer.w3); + } +} + +void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { + const auto & hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct random_normal_distribution rnd; + init_random_normal_distribution(&rnd, seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings, &rnd); + randomize_tensor_normal(model->norm, &rnd); + randomize_tensor_normal(model->output, &rnd); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + randomize_tensor_normal(layer.attention_norm, &rnd); + + randomize_tensor_normal(layer.wq, &rnd); + randomize_tensor_normal(layer.wk, &rnd); + randomize_tensor_normal(layer.wv, &rnd); + randomize_tensor_normal(layer.wo, &rnd); + + randomize_tensor_normal(layer.ffn_norm, &rnd); + + randomize_tensor_normal(layer.w1, &rnd); + randomize_tensor_normal(layer.w2, &rnd); + randomize_tensor_normal(layer.w3, &rnd); + } +} + +bool init_kv_cache(struct my_llama_kv_cache* cache, struct my_llama_model * model, int n_batch) { + const auto & hparams = model->hparams; + + const uint32_t n_ctx = hparams.n_ctx; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + + const int64_t n_mem = n_layer*n_ctx*n_batch; + const int64_t n_elements = n_embd*n_mem; + + // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + + // struct ggml_init_params params; + // params.mem_size = cache.buf.size; + // params.mem_buffer = 
cache.buf.addr; + // params.no_alloc = false; + if (!cache->ctx) { + struct ggml_init_params params; + params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; + params.mem_buffer = NULL; + params.no_alloc = false; + + cache->ctx = ggml_init(params); + + if (!cache->ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + } + + cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + + return true; +} + +struct ggml_tensor * forward( + struct my_llama_model * model, + struct my_llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past) { + + const int N = n_tokens; + + struct my_llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + // inpL shape [n_embd,N,1,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Kcur shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [n_embd, N, 1, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N))); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // kv_self.v shape [n_embd * n_ctx * n_layer, 1] + // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] + // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! 
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_1d_inplace(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + vc = ggml_set_2d_inplace(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + } + + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Q shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // K shape [n_embd/n_head, n_past + N, n_head, 1] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + // KQ shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // split cached V into n_head heads + //// V shape [n_past + N, n_embd/n_head, n_head, 1] + // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] + struct ggml_tensor * V = + ggml_view_3d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(vc), + n_ctx*ggml_element_size(vc)*n_embd/n_head, + il*n_ctx*ggml_element_size(vc)*n_embd); + + // KQV shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + // KQV_merged shape + + // cur = KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N,1,1] + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); + + // cur = ffn_norm*cur + // cur shape [n_embd,N,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + } + + // tmp shape [n_ff,N,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + + // SILU activation + // cur shape [n_ff,N,1,1] + cur = ggml_silu(ctx0, cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul(ctx0, cur, tmp); + + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + } + + // cur shape [n_embd,N,1,1] + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + // 
inpL shape [n_embd,N,1,1] + inpL = cur; + } + + // norm + { + + // inpL shape [n_embd,N,1,1] + inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); + + // inpL = norm*inpL + // inpL shape [n_embd,N,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { + GGML_ASSERT(tensor->n_dims == 1); + GGML_ASSERT(tensor->ne[0] == ne0); +} + +void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { + GGML_ASSERT(tensor->n_dims == 2); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); +} + +void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { + GGML_ASSERT(tensor->n_dims == 3); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); +} + +void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { + GGML_ASSERT(tensor->n_dims == 4); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == ne3); +} + +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} + +static size_t hash_find(void * hash_table[], void * p) { + size_t h = hash(p); + + // linear probing + size_t i = h; + while (hash_table[i] != NULL && hash_table[i] != p) { + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // visited all hash table entries -> not found + return GGML_GRAPH_HASHTABLE_SIZE; + } + } + return i; +} + +static bool hash_insert(void * hash_table[], void * p) { + size_t h = hash(p); + size_t i = hash_find(hash_table, p); + + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + + if (hash_table[i] == p) { + return true; + } + + // insert + GGML_ASSERT(hash_table[i] == NULL); + hash_table[i] = p; + return false; +} + +static bool hash_contains(void * hash_table[], void * p) { + size_t i = hash_find(hash_table, p); + return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); +} + +struct hash_map { + void * keys[GGML_GRAPH_HASHTABLE_SIZE]; + void * vals[GGML_GRAPH_HASHTABLE_SIZE]; +}; +static const size_t HASH_MAP_SIZE = sizeof(struct hash_map); + +struct hash_map * new_hash_map() { + struct hash_map * result = new struct hash_map; + for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) { + result->keys[i] = NULL; + result->vals[i] = NULL; + } + return result; +}; + +void free_hash_map(struct hash_map * map) { + delete map; +} + +static bool ggml_is_view(struct ggml_tensor * t) { + return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || + t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; +} + +static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { + switch (t->op) { + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + return t->src[0]; + case GGML_OP_CPY: + return t->src[1]; + default: + return NULL; + } +} + +static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { + struct ggml_tensor * parent = t; + do { + parent = get_view_parent(parent); + } while (ggml_is_view(parent)); + return parent; +} + +struct ggml_tensor * ggml_recompute_graph_node( + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct hash_map * replacements, + struct ggml_tensor *
+struct ggml_tensor * ggml_recompute_graph_node(
+        struct ggml_context * ctx,
+        struct ggml_cgraph  * graph,
+        struct hash_map     * replacements,
+        struct ggml_tensor  * node) {
+
+    if (node == NULL) {
+        return NULL;
+    }
+
+    if (node->is_param) {
+        return node;
+    }
+
+    if (!hash_contains(graph->visited_hash_table, node)) {
+        return node;
+    }
+
+    int count_children = 0;
+    for (int k = 0; k < GGML_MAX_SRC; ++k) {
+        if (node->src[k]) {
+            ++count_children;
+        }
+    }
+
+    if (count_children == 0) {
+        return node;
+    }
+
+    size_t i = hash_find(replacements->keys, node);
+    GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
+    if (replacements->keys[i] == node) {
+        return (struct ggml_tensor *) replacements->vals[i];
+    }
+
+    struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
+
+    // insert clone into replacements
+    GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite
+    replacements->keys[i] = node;
+    replacements->vals[i] = clone;
+
+    clone->op       = node->op;
+    clone->grad     = node->grad;
+    clone->is_param = node->is_param;
+    clone->extra    = node->extra;
+    for (int k = 0; k < GGML_MAX_DIMS; ++k) {
+        clone->nb[k] = node->nb[k];
+    }
+    for (int k = 0; k < GGML_MAX_SRC; ++k) {
+        clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]);
+    }
+    if (ggml_is_view(clone)) {
+        struct ggml_tensor * source = get_view_source(clone);
+        GGML_ASSERT(source != NULL);
+        clone->data = source->data;
+    }
+
+    GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t)));
+    GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME);
+    memcpy(clone->op_params, node->op_params, sizeof(node->op_params));
+    ggml_format_name(clone, "%s (clone)", ggml_get_name(node));
+
+    return clone;
+};
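+// ggml_recompute_graph_node() deep-copies a node and, recursively, its inputs,
+// so the backward pass can recompute activations on demand instead of keeping
+// them all resident; recursion terminates at parameters and at tensors
+// pre-seeded into `replacements` (the checkpoints). A hedged call sketch with
+// hypothetical tensor names (the real call site is in llama_build_train_graphs
+// further below):
+//
+//   struct ggml_tensor * checkpoints[3] = { t_embd, t_layer_out, t_logits };
+//   ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints, 3);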
+void ggml_build_backward_gradient_checkpointing(
+        struct ggml_context   * ctx,
+        struct ggml_cgraph    * gf,
+        struct ggml_cgraph    * gb,
+        struct ggml_cgraph    * gb_tmp,
+        struct ggml_tensor  * * checkpoints,
+        int                     n_checkpoints) {
+    *gb_tmp = *gf;
+    ggml_build_backward_expand(ctx, gf, gb_tmp, true);
+
+    if (n_checkpoints <= 0) {
+        *gb = *gb_tmp;
+        return;
+    }
+
+    struct hash_map * replacements = new_hash_map();
+
+    // insert checkpoints in replacements
+    for (int i = 0; i < n_checkpoints; ++i) {
+        size_t k = hash_find(replacements->keys, checkpoints[i]);
+        GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
+        GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite
+        replacements->keys[k] = checkpoints[i];
+        replacements->vals[k] = checkpoints[i];
+    }
+
+    *gb = *gf;
+    // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
+    // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
+    // by recomputing them from checkpoints
+    for (int i = gf->n_nodes; i < gb_tmp->n_nodes; ++i) {
+        struct ggml_tensor * node = gb_tmp->nodes[i];
+        for (int k = 0; k < GGML_MAX_SRC; ++k) {
+            // insert new tensors recomputing src, reusing already made replacements,
+            //   remember replacements: remember new tensors with mapping from corresponding gf nodes
+            //   recurse for input tensors,
+            //   unless (i.e. terminating when) input tensors are checkpoints
+            node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
+        }
+        // insert rewritten backward node with replacements made into resulting backward graph gb
+        ggml_build_forward_expand(gb, node);
+    }
+
+    free_hash_map(replacements);
+}
+
+struct ggml_tensor * llama_build_train_graphs(
+        struct my_llama_model * model,
+        struct ggml_allocr    * alloc,
+        struct ggml_context   * ctx,
+        struct ggml_cgraph    * gf,
+        struct ggml_cgraph    * gb,
+        struct ggml_cgraph    * gb_tmp,
+        struct ggml_tensor  * * logits,
+        struct ggml_tensor    * tokens_input,
+        struct ggml_tensor    * targets,
+        const  int              n_tokens,
+        const  int              n_batch,
+        const  bool             enable_flash_attn,
+        const  bool             enable_checkpointing) {
+
+    ggml_set_scratch(ctx, { 0, 0, nullptr, });
+    const int n_past = 0;
+    const int N = n_tokens;
+    const auto & hparams = model->hparams;
+    const int n_ctx      = hparams.n_ctx;
+    const int n_vocab    = hparams.n_vocab;
+    const int n_embd     = hparams.n_embd;
+    const int n_layer    = hparams.n_layer;
+    const int n_head     = hparams.n_head;
+    const int n_rot      = hparams.n_rot;
+    const int n_ff       = get_n_ff(&hparams);
+    const int rope_mode  = 0;
+
+    auto set_name = [](struct ggml_tensor * t, const char * n) {
+        ggml_set_name(t, n);
+        if (t->grad) {
+            ggml_format_name(t->grad, "%s->grad", n);
+        }
+    };
+
+    set_name(tokens_input, "tokens_input");
+    set_name(targets,      "targets");
+
+    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
+    struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch);  set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch);
+    struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch);
+
+    struct ggml_tensor * cur = t01;
+
+    std::vector<struct ggml_tensor *> checkpoints;
+    checkpoints.push_back(tokens_input);
+    checkpoints.push_back(targets);
+    checkpoints.push_back(t00);
+    checkpoints.push_back(t01);
+
+    struct ggml_tensor * kv_scale;
+    if (!enable_flash_attn) {
+        kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+
+    for (int il = 0; il < n_layer; ++il) {
+        struct my_llama_layer & layer = model->layers[il];
+        struct ggml_tensor * t02 = ggml_rms_norm     (ctx, cur, rms_norm_eps);                      set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch);
+        struct ggml_tensor * t03 = ggml_repeat       (ctx, layer.attention_norm, t02);              set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch);
+        struct ggml_tensor * t04 = ggml_mul          (ctx, t03, t02);                               set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch);
+        struct ggml_tensor * t05 = ggml_mul_mat      (ctx, layer.wq, t04);                          set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch);
+        struct ggml_tensor * t06 = ggml_reshape_4d   (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
+        struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx);   set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
+        struct ggml_tensor * t08 = ggml_mul_mat      (ctx, layer.wk, t04);                          set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch);
+        struct ggml_tensor * t09 = ggml_reshape_4d   (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
+        struct ggml_tensor * t10 = ggml_rope_inplace (ctx, t09, n_past, n_rot, rope_mode, n_ctx);   set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
+        struct ggml_tensor * t11 = ggml_mul_mat      (ctx, t04, layer.wv);
set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd); + struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + struct ggml_tensor * t16; + if (enable_flash_attn) { + t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + } else { + struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); + struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); + struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); + struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); + t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + } + struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); + struct ggml_tensor * t20 = ggml_mul_mat (ctx, layer.wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); + struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); + struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); + struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); + struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); + struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); + cur = t30; + checkpoints.push_back(cur); + } + struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = ggml_repeat (ctx, 
model->norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); + struct ggml_tensor * t34 = ggml_mul_mat (ctx, model->output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); + + checkpoints.push_back(t31); + checkpoints.push_back(t32); + checkpoints.push_back(t33); + checkpoints.push_back(t34); + checkpoints.push_back(t35); + checkpoints.push_back(t36); + + ggml_build_forward_expand(gf, t36); + + if (enable_checkpointing) { + ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); + } else { + *gb = *gf; + ggml_build_backward_expand(ctx, gf, gb, true); + } + + if (alloc) { + // make sure some tensors are not reallocated by inserting new temporary nodes depending on them + int n_leafs_before = gb->n_leafs; + int n_nodes_before = gb->n_nodes; + struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); + // output tensors + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + // gradient tensors (will be set to zero by ggml_graph_reset) + for (int i = 0; i < gf->n_nodes; ++i) { + if (!gf->grads[i]) continue; + if (gf->grads[i]->data == NULL && !ggml_is_view(gf->grads[i])) { + ggml_allocr_alloc(alloc, gf->grads[i]); + } + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, gf->grads[i], one)); + } + for (int i = 0; i < checkpoints.size(); ++i) { + if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { + ggml_allocr_alloc(alloc, checkpoints[i]); + } + } + + int n_leafs_after = gb->n_leafs; + int n_nodes_after = gb->n_nodes; + + ggml_allocr_alloc_graph(alloc, gb); + + // remove the additional nodes and leafs + for (int i = n_leafs_before; i < gb->n_leafs; ++i) { + gb->leafs[i] = NULL; + } + for (int i = n_nodes_before; i < gb->n_nodes; ++i) { + gb->nodes[i] = NULL; + } + gb->n_leafs = n_leafs_before; + gb->n_nodes = n_nodes_before; + } + + *logits = t35; + return t36; +} + +void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *ptr = value; +} + +void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + +int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + +void print_row(struct ggml_tensor * probs, int i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = get_f32_2d(probs, k, i); + printf(" %.2f", p); + } + printf("\n"); +} + +void 
print_matrix(struct ggml_tensor * probs) {
+    assert(probs->n_dims == 2);
+    for (int i = 0; i < probs->ne[1]; ++i) {
+        for (int k = 0; k < probs->ne[0]; ++k) {
+            float p = get_f32_2d(probs, k, i);
+            printf(" %.2f", p);
+        }
+        printf("\n");
+    }
+}
+
+void print_token(struct llama_context * ctx, llama_token token) {
+    printf("%s", llama_token_to_str(ctx, token));
+}
+
+void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
+    for (int i=0; i<tokens->ne[0]; ++i) {
+        int token = ggml_get_i32_1d(tokens, i);
+        print_token(ctx, token);
+    }
+}
+
+void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) {
+    for (int i1=0; i1<tokens->ne[1]; ++i1) {
+        //int num_newline = 0;
+        for (int i0=0; i0<tokens->ne[0]; ++i0) {
+            int token = get_i32_2d(tokens, i0, i1);
+            print_token(ctx, token);
+            // bool isnl = (token == llama_token_nl());
+            // if (isnl) {
+            //     ++num_newline;
+            // }
+            // if (isnl) {
+            //     if (num_newline < 2) {
+            //         print_token(ctx, token);
+            //     } else {
+            //         printf("\\n");
+            //     }
+            // } else {
+            //     print_token(ctx, token);
+            // }
+        }
+        printf("\n--\n");
+    }
+}
+
+void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+    int n_tokens = tokens_input->ne[0];
+    int n_vocab  = target_logits->ne[0];
+
+    size_t sample = train_samples[example_id % n_train_samples];
+    GGML_ASSERT(sample+n_tokens-1 < n_train_data);
+
+    ggml_set_f32(target_logits, -1.0f/n_vocab);
+    ggml_set_f32(target_probs, 0.0f);
+    ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
+    for (int i=1; i<n_tokens+1; ++i) {
+        int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
+        set_f32_2d(target_logits, token, i-1, +1.0f);
+        set_f32_2d(target_probs,  token, i-1, +1.0f);
+        if (i<n_tokens) {
+            ggml_set_i32_1d(tokens_input, i, token);
+        }
+    }
+}
+
+void get_example_targets_batch(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+    GGML_ASSERT(tokens_input->n_dims  == 2);
+    GGML_ASSERT(target_logits->n_dims == 3);
+    GGML_ASSERT(target_probs->n_dims  == 3);
+    int n_vocab  = target_logits->ne[0];
+    int n_tokens = tokens_input->ne[0];
+    int n_batch  = tokens_input->ne[1];
+    GGML_ASSERT(n_tokens == target_logits->ne[1]);
+    GGML_ASSERT(n_batch  == target_logits->ne[2]);
+    GGML_ASSERT(n_vocab  == target_probs->ne[0]);
+    GGML_ASSERT(n_tokens == target_probs->ne[1]);
+    GGML_ASSERT(n_batch  == target_probs->ne[2]);
+
+    ggml_set_f32(target_logits, -1.0f/n_vocab);
+    ggml_set_f32(target_probs, 0.0f);
+    // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
+    for (int k=0; k<n_batch; ++k) {
+        size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples];
+        GGML_ASSERT(sample+n_tokens-1 < n_train_data);
+
+        set_i32_2d(tokens_input, 0, k, llama_token_bos());
+        for (int i=1; i<n_tokens+1; ++i) {
+            int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
+            set_f32_3d(target_logits, token, i-1, k, +1.0f);
+            set_f32_3d(target_probs,  token, i-1, k, +1.0f);
+            if (i<n_tokens) {
+                set_i32_2d(tokens_input, i, k, token);
+            }
+        }
+    }
+}
+
+void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs, int n_shift) {
+    int n_tokens = tokens_input->ne[0];
+    int n_vocab = target_logits->ne[0];
+    for (int i=0; i<n_tokens-n_shift; ++i) {
+        ggml_set_i32_1d(tokens_input, i, ggml_get_i32_1d(tokens_input, i + n_shift));
+        for (int k=0; k<n_vocab; ++k) {
+            ggml_set_f32_1d(target_logits, i*n_vocab + k, ggml_get_f32_1d(target_logits, (i + n_shift)*n_vocab + k));
+            ggml_set_f32_1d(target_probs,  i*n_vocab + k, ggml_get_f32_1d(target_probs,  (i + n_shift)*n_vocab + k));
+        }
+    }
+}
+
+std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
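+// get_example_targets() and get_example_targets_batch() above encode plain
+// next-token prediction: tokens_input holds [BOS, x0, x1, ..., x_{n_tokens-2}]
+// and position i of target_probs marks train_data[sample+i] one-hot over
+// n_vocab. Illustrative layout for n_tokens = 4 and hypothetical sample
+// tokens {a, b, c, d}:
+//
+//   tokens_input : BOS  a  b  c
+//   target_probs :  a   b  c  d    (one one-hot column of size n_vocab per position)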
error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { + struct llama_file f(filename, "rb"); + + std::vector buf; + buf.resize(f.size+1); + + f.read_raw(buf.data(), f.size); + buf[f.size] = '\0'; + + out.resize(buf.size()); + + int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); + if (n_tokens >= 0) { + out.resize(n_tokens); + } + + bool verify = false; + if (verify) { + const char * in = buf.data(); + const char * end = buf.data() + buf.size(); + for (int i = 0; i < (int) out.size(); ++i) { + const char * s = llama_token_to_str(lctx, out[i]); + int len = strlen(s); + if (in >= end) { + printf("%s: unexpected end of original text.\n", __func__); + break; + } + const bool matches = (strncmp(in, s, len) == 0); + if (matches) { + in += len; + } else { + printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s); + } + } + } + + return n_tokens; +} + +void shuffle_ints(int * begin, int * end) { + if (end <= begin) return; + int max=begin[0]; + for (int i=1; i max) { + max = begin[i]; + } + } + std::vector vals; + vals.resize(max+1); + for (int i=0; i candidates; + llama_token_data_array candidates_p; + +}; + +void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx) { + sampler->ctx = ctx; + sampler->n_vocab = llama_n_vocab(sampler->ctx); + sampler->n_ctx = llama_n_ctx(sampler->ctx); + sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau; +} + +llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) { + GGML_ASSERT(sampler->ctx != NULL); + + struct llama_context * ctx = sampler->ctx; + + sampler->candidates.resize(sampler->n_vocab); + for (llama_token token_id = 0; token_id < sampler->n_vocab; ++token_id) { + sampler->candidates[token_id].id = token_id; + sampler->candidates[token_id].logit = logits[token_id]; + sampler->candidates[token_id].p = 0.0; + } + + llama_token_data_array * candidates_p = & sampler->candidates_p; + + candidates_p->data = sampler->candidates.data(); + candidates_p->size = sampler->candidates.size(); + candidates_p->sorted = false; + + const auto params = sampler->params; + + // Apply penalties + const float nl_logit = logits[llama_token_nl()]; + + const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx); + + llama_sample_repetition_penalty( + ctx, + candidates_p, + last_tokens + n_last_tokens - n_last, + n_last, + params.repeat_penalty); + llama_sample_frequency_and_presence_penalties( + ctx, + candidates_p, + last_tokens + n_last_tokens - n_last, + n_last, + params.frequency_penalty, + params.presence_penalty); + + if (!params.penalize_nl) { + logits[llama_token_nl()] = nl_logit; + 
+llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) {
+    GGML_ASSERT(sampler->ctx != NULL);
+
+    struct llama_context * ctx = sampler->ctx;
+
+    sampler->candidates.resize(sampler->n_vocab);
+    for (llama_token token_id = 0; token_id < sampler->n_vocab; ++token_id) {
+        sampler->candidates[token_id].id = token_id;
+        sampler->candidates[token_id].logit = logits[token_id];
+        sampler->candidates[token_id].p = 0.0;
+    }
+
+    llama_token_data_array * candidates_p = & sampler->candidates_p;
+
+    candidates_p->data   = sampler->candidates.data();
+    candidates_p->size   = sampler->candidates.size();
+    candidates_p->sorted = false;
+
+    const auto params = sampler->params;
+
+    // Apply penalties
+    const float nl_logit = logits[llama_token_nl()];
+
+    const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);
+
+    llama_sample_repetition_penalty(
+        ctx,
+        candidates_p,
+        last_tokens + n_last_tokens - n_last,
+        n_last,
+        params.repeat_penalty);
+    llama_sample_frequency_and_presence_penalties(
+        ctx,
+        candidates_p,
+        last_tokens + n_last_tokens - n_last,
+        n_last,
+        params.frequency_penalty,
+        params.presence_penalty);
+
+    if (!params.penalize_nl) {
+        logits[llama_token_nl()] = nl_logit;
+    }
+
+    llama_token token = 0;
+    if (params.temp <= 0) {
+        // Greedy sampling
+        token = llama_sample_token_greedy(ctx, candidates_p);
+    } else {
+        if (params.mirostat == 1) {
+            int mirostat_m = 100;
+            llama_sample_temperature(ctx, candidates_p, params.temp);
+            token = llama_sample_token_mirostat(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, mirostat_m, &sampler->mirostat_mu);
+        } else if (params.mirostat == 2) {
+            llama_sample_temperature(ctx, candidates_p, params.temp);
+            token = llama_sample_token_mirostat_v2(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, &sampler->mirostat_mu);
+        } else {
+            // Temperature sampling
+            llama_sample_top_k        (ctx, candidates_p, params.top_k, 1);
+            llama_sample_tail_free    (ctx, candidates_p, params.tfs_z, 1);
+            llama_sample_typical      (ctx, candidates_p, params.typical_p, 1);
+
+            llama_sample_top_p        (ctx, candidates_p, params.top_p, 1);
+            llama_sample_temperature  (ctx, candidates_p, params.temp);
+            token = llama_sample_token(ctx, candidates_p);
+        }
+    }
+    return token;
+}
+
+void set_logits_masked(struct ggml_tensor * logits, std::vector<bool>& mask, float value) {
+    GGML_ASSERT(logits->ne[0] == (int64_t) mask.size());
+    for (int i2 = 0; i2 < logits->ne[2]; ++i2) {
+        for (int i1 = 0; i1 < logits->ne[1]; ++i1) {
+            for (int i0 = 0; i0 < logits->ne[0]; ++i0) {
+                if (!mask[i0]) continue;
+                float * ptr = (float *) ((char *) logits->data + i2*logits->nb[2] + i1*logits->nb[1] + i0*logits->nb[0]);
+                *ptr = value;
+            }
+        }
+    }
+}
+
+void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
+    if (tensor == NULL) {
+        file->write_u32(0);
+        file->write_u32(0);
+        file->write_u32(GGML_TYPE_F32);
+        file->seek((0-file->tell()) & 31, SEEK_CUR);
+        return;
+    }
+    const char * name = ggml_get_name(tensor);
+    uint32_t name_len = strlen(name);
+    uint32_t nd = tensor->n_dims;
+    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
+                       (uint32_t)tensor->ne[1],
+                       (uint32_t)tensor->ne[2],
+                       (uint32_t)tensor->ne[3] };
+    file->write_u32(nd);
+    file->write_u32(name_len);
+    file->write_u32(tensor->type);
+    file->write_raw(ne, sizeof(ne[0]) * nd);
+    file->write_raw(name, name_len);
+    file->seek((0-file->tell()) & 31, SEEK_CUR);
+    file->write_raw(tensor->data, ggml_nbytes(tensor));
+}
+
+void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
+    int32_t nd = file->read_u32();
+    GGML_ASSERT(nd == tensor->n_dims);
+
+    uint32_t name_len = file->read_u32();
+    enum ggml_type type = (enum ggml_type) file->read_u32();
+    GGML_ASSERT(type == tensor->type);
+
+    uint32_t ne[4];
+    file->read_raw(ne, sizeof(ne[0]) * nd);
+    for (int i=0; i<nd; ++i) {
+        GGML_ASSERT(ne[i] == tensor->ne[i]);
+    }
+
+    std::string name = file->read_string(name_len);
+    GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);
+
+    file->seek((0-file->tell()) & 31, SEEK_CUR);
+    file->read_raw(tensor->data, ggml_nbytes(tensor));
+}
+
+void skip_tensor(struct llama_file * file) {
+    int32_t nd = file->read_u32();
+
+    uint32_t name_len = file->read_u32();
+    enum ggml_type type = (enum ggml_type) file->read_u32();
+
+    uint32_t ne[4] = { 1, 1, 1, 1 };
+
+    file->read_raw(ne, sizeof(ne[0]) * nd);
+
+    std::string name = file->read_string(name_len);
+
+    file->seek(-file->tell() & 31, SEEK_CUR);
+
+    size_t nelements = ne[0]*ne[1]*ne[2]*ne[3];
+    size_t nbytes = nelements*ggml_type_size(type)/ggml_blck_size(type);
+    file->seek(nbytes, SEEK_CUR);
+}
+
+void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) {
+    const uint32_t version = 1;
+    GGML_ASSERT(opt->nx >= 0);
+    GGML_ASSERT(opt->iter >= 0);
+    
file->write_u32(version); + file->write_u32(opt->params.past); + file->write_u32(opt->params.lbfgs.m); + file->write_raw(&opt->nx, sizeof(opt->nx)); + file->write_raw(&opt->iter, sizeof(opt->iter)); + file->write_u32((uint32_t) opt->just_initialized); + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + GGML_ASSERT(opt->adam.m != NULL); + GGML_ASSERT(opt->adam.v != NULL); + write_tensor(file, opt->adam.m); + write_tensor(file, opt->adam.v); + write_tensor(file, opt->adam.pf); + file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->lbfgs.x != NULL); + write_tensor(file, opt->lbfgs.x); + write_tensor(file, opt->lbfgs.xp); + write_tensor(file, opt->lbfgs.g); + write_tensor(file, opt->lbfgs.gp); + write_tensor(file, opt->lbfgs.d); + write_tensor(file, opt->lbfgs.pf); + write_tensor(file, opt->lbfgs.lmal); + write_tensor(file, opt->lbfgs.lmys); + write_tensor(file, opt->lbfgs.lms); + write_tensor(file, opt->lbfgs.lmy); + file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +struct ggml_opt_params_v0 { + enum ggml_opt_type type; + int n_threads; + int past; + float delta; + int max_no_improvement; + bool print_forward_graph; + bool print_backward_graph; + struct { + int n_iter; + float sched; + float decay; + float alpha; + float beta1; + float beta2; + float eps; + float eps_f; + float eps_g; + } adam; + struct { + int m; + int n_iter; + int max_linesearch; + float eps; + float ftol; + float wolfe; + float min_step; + float max_step; + enum ggml_linesearch linesearch; + } lbfgs; +}; + +void read_opt_context_v0(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { + ggml_opt_params_v0 pv0; + file->read_raw(&pv0, sizeof(pv0)); + opt->params.past = pv0.past; + opt->params.lbfgs.m = pv0.lbfgs.m; + file->read_raw(&opt->nx, sizeof(opt->nx)); + ggml_opt_init(ctx, opt, opt->params, opt->nx); + + file->read_raw(&opt->iter, sizeof(opt->iter)); + opt->just_initialized = (bool) file->read_u32(); + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + skip_tensor(file); + skip_tensor(file); + skip_tensor(file); + read_tensor(file, opt->adam.m); + read_tensor(file, opt->adam.v); + skip_tensor(file); + skip_tensor(file); + if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } + file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->lbfgs.x != NULL); + read_tensor(file, opt->lbfgs.x); + read_tensor(file, opt->lbfgs.xp); + read_tensor(file, opt->lbfgs.g); + read_tensor(file, opt->lbfgs.gp); + read_tensor(file, opt->lbfgs.d); + if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } + read_tensor(file, opt->lbfgs.lmal); + read_tensor(file, opt->lbfgs.lmys); + read_tensor(file, opt->lbfgs.lms); + read_tensor(file, opt->lbfgs.lmy); + file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + 
file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { + opt->params.past = (int) file->read_u32(); + opt->params.lbfgs.m = (int) file->read_u32(); + file->read_raw(&opt->nx, sizeof(opt->nx)); + ggml_opt_init(ctx, opt, opt->params, opt->nx); + + file->read_raw(&opt->iter, sizeof(opt->iter)); + opt->just_initialized = (bool) file->read_u32(); + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + read_tensor(file, opt->adam.m); + read_tensor(file, opt->adam.v); + if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } + file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->lbfgs.x != NULL); + read_tensor(file, opt->lbfgs.x); + read_tensor(file, opt->lbfgs.xp); + read_tensor(file, opt->lbfgs.g); + read_tensor(file, opt->lbfgs.gp); + read_tensor(file, opt->lbfgs.d); + if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } + read_tensor(file, opt->lbfgs.lmal); + read_tensor(file, opt->lbfgs.lmys); + read_tensor(file, opt->lbfgs.lms); + read_tensor(file, opt->lbfgs.lmy); + file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { + uint32_t version = file->read_u32(); + printf("%s: opt context version %u\n", __func__, version); + switch (version) { + case 0: + { + read_opt_context_v0(file, ctx, opt); + } break; + case 1: + { + read_opt_context_v1(file, ctx, opt); + } break; + default: + { + fprintf(stderr, "%s: unknown version %u\n", __func__, version); + } + } +} + +void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename) { + struct llama_file file(filename, "wb"); + if (file.fp == NULL) { + return; + } + + const uint32_t magic = 'ggcp'; + const uint32_t version = 0; + + file.write_u32(magic); + file.write_u32(version); + file.write_u32(model->train_its); + file.write_u32(model->train_samples); + file.write_u32(model->train_tokens); + file.write_u32(model->hparams.n_vocab); + file.write_u32(model->hparams.n_embd); + file.write_u32(model->hparams.n_mult); + file.write_u32(model->hparams.n_head); + file.write_u32(model->hparams.n_layer); + file.write_u32(model->hparams.n_rot); + + write_tensor(&file, model->tok_embeddings); + write_tensor(&file, model->norm); + write_tensor(&file, model->output); + + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + write_tensor(&file, layer.attention_norm); + write_tensor(&file, layer.wq); + write_tensor(&file, layer.wk); + write_tensor(&file, layer.wv); + write_tensor(&file, layer.wo); + write_tensor(&file, layer.ffn_norm); + 
write_tensor(&file, layer.w1); + write_tensor(&file, layer.w2); + write_tensor(&file, layer.w3); + } + + write_opt_context(&file, opt); +} + +bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, bool init) { + struct llama_file file(filename, "rb"); + + uint32_t magic; + uint32_t version; + + uint32_t train_its = 0; + uint32_t train_samples = 0; + uint32_t train_tokens = 0; + + if (file.fp) { + printf("%s: Loading model from '%s'.\n", __func__, filename); + magic = file.read_u32(); + GGML_ASSERT(magic == 'ggcp'); + version = file.read_u32(); + GGML_ASSERT(version == 0); + train_its = file.read_u32(); + train_samples = file.read_u32(); + train_tokens = file.read_u32(); + model->hparams.n_vocab = file.read_u32(); + model->hparams.n_embd = file.read_u32(); + model->hparams.n_mult = file.read_u32(); + model->hparams.n_head = file.read_u32(); + model->hparams.n_layer = file.read_u32(); + model->hparams.n_rot = file.read_u32(); + print_params(&model->hparams); + } + + if (init) { + init_model(model); + } + + if (file.fp) { + model->train_its = train_its; + model->train_samples = train_samples; + model->train_tokens = train_tokens; + } + + printf("%s: Training iterations: %u.\n", __func__, model->train_its); + printf("%s: Training samples: %u.\n", __func__, model->train_samples); + printf("%s: Training tokens: %u.\n", __func__, model->train_tokens); + + if (file.fp) { + read_tensor(&file, model->tok_embeddings); + read_tensor(&file, model->norm); + read_tensor(&file, model->output); + + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + read_tensor(&file, layer.attention_norm); + read_tensor(&file, layer.wq); + read_tensor(&file, layer.wk); + read_tensor(&file, layer.wv); + read_tensor(&file, layer.wo); + read_tensor(&file, layer.ffn_norm); + read_tensor(&file, layer.w1); + read_tensor(&file, layer.w2); + read_tensor(&file, layer.w3); + } + + read_opt_context(&file, model->ctx, opt); + } + + return (file.fp != NULL); +} + +void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, const char * filename) { + struct llama_file file(filename, "wb"); + if (file.fp == NULL) { + return; + } + + // write_magic + file.write_u32(LLAMA_FILE_MAGIC); // magic + file.write_u32(LLAMA_FILE_VERSION); // version + // write_hparams + file.write_u32(model->hparams.n_vocab); + file.write_u32(model->hparams.n_embd); + file.write_u32(model->hparams.n_mult); + file.write_u32(model->hparams.n_head); + file.write_u32(model->hparams.n_layer); + file.write_u32(model->hparams.n_rot); + file.write_u32(LLAMA_FTYPE_ALL_F32); + // write_vocab + uint32_t n_vocab = model->hparams.n_vocab; + for (uint32_t i = 0; i < n_vocab; i++) { + const auto & token_score = vocab->id_to_token.at(i); + file.write_u32((uint32_t) token_score.tok.size()); + file.write_raw(token_score.tok.data(), token_score.tok.size()); + file.write_raw(&token_score.score, sizeof(token_score.score)); + } + // write tensors + write_tensor(&file, model->tok_embeddings); + write_tensor(&file, model->norm); + write_tensor(&file, model->output); + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + write_tensor(&file, layer.attention_norm); + write_tensor(&file, layer.wq); + write_tensor(&file, layer.wk); + write_tensor(&file, layer.wv); + write_tensor(&file, layer.wo); + write_tensor(&file, layer.ffn_norm); + write_tensor(&file, layer.w1); + write_tensor(&file, layer.w2); + write_tensor(&file, layer.w3); + 
}
+}
+
+float cosine_decay(const int decay_steps, const float minimum, int step) {
+    if (step > decay_steps) {
+        step = decay_steps;
+    }
+    const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps));
+    const float decay = (1 - minimum)*cosine_decay + minimum;
+    return decay;
+}
+
+float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) {
+    if (enable_restart) {
+        while (step > decay_steps) {
+            step -= decay_steps;
+            // parenthesized so the multiplier is applied before truncation,
+            // otherwise (int) restart_step_mult truncates 1.1 to 1 and the
+            // restart interval never grows
+            decay_steps = (int) (restart_step_mult * decay_steps);
+        }
+    }
+    return cosine_decay(decay_steps, minimum, step);
+}
+
+struct train_params {
+    const char * fn_vocab_model;
+    const char * fn_train_data;
+    const char * fn_checkpoint_in;
+    const char * fn_checkpoint_out;
+    const char * fn_model_out;
+
+    uint32_t seed;
+
+    int n_ctx;
+    int n_embd;
+    int n_mult;
+    int n_head;
+    int n_layer;
+    int n_rotmax;
+
+    int n_threads;
+    int n_batch;
+    int n_examples;
+    int n_predict;
+
+    int print_info_interval;
+    int print_details_interval;
+
+    bool samples_start_after_nl;
+    bool use_adam;
+    bool use_flash;
+    bool use_checkpointing;
+    bool use_alloc;
+
+    // only adam
+    int   warmup;
+    int   cos_decay_steps;
+    float cos_decay_restart;
+    float cos_decay_min;
+    bool  enable_restart;
+
+    int   opt_past;
+    float opt_delta;
+    int   opt_max_no_improvement;
+
+    int   lbfgs_n_iter;
+    int   adam_n_iter;
+    float adam_alpha;
+    float adam_min_alpha;
+    float adam_decay;
+    int   adam_decay_min_ndim;
+    float adam_beta1;
+    float adam_beta2;
+    float adam_gclip;
+    float adam_eps_f;
+
+    int mem_model_gb;
+    int mem_compute_gb;
+    int mem_compute0_gb;
+};
+
+struct train_params get_default_train_params() {
+    struct train_params params;
+    params.fn_vocab_model    = "ggml-vic7b-uncensored-q4_0.bin";
+    params.fn_train_data     = "shakespeare.txt";
+    params.fn_checkpoint_in  = "checkpoint.bin";
+    params.fn_checkpoint_out = "checkpoint.bin";
+    params.fn_model_out      = "ggml-checkpoint-f32.bin";
+
+    params.seed = -1;
+
+    params.n_ctx    = 128;
+    params.n_embd   = 256;
+    params.n_mult   = 256;
+    params.n_head   = 8;
+    params.n_layer  = 16;
+    params.n_rotmax = 64;
+
+    params.n_threads  = 6;
+    params.n_batch    = 8;
+    params.n_examples = 1;
+    params.n_predict  = 1024;
+
+    params.print_info_interval    = 1;
+    params.print_details_interval = 2;
+
+    params.samples_start_after_nl = false;
+    params.use_adam               = true;
+    params.use_flash              = true;
+    params.use_checkpointing      = true;
+    params.use_alloc              = true;
+
+    params.opt_past               = 0;
+    params.opt_delta              = 1e-5f;
+    params.opt_max_no_improvement = 0;
+
+    // only adam
+    params.warmup            = 100;
+    params.cos_decay_steps   = 1000;
+    params.cos_decay_restart = 1.1f;
+    params.cos_decay_min     = 0.1f;
+    params.enable_restart    = false;
+
+    params.lbfgs_n_iter        = 256;
+    params.adam_n_iter         = 256;
+    params.adam_alpha          = 1e-3f;
+    params.adam_min_alpha      = 0;
+    params.adam_decay          = 1e-1f;
+    params.adam_decay_min_ndim = 2;
+    params.adam_beta1          = 0.9f;
+    params.adam_beta2          = 0.999f;
+    params.adam_gclip          = 1.0f;
+    params.adam_eps_f          = 0.0f;
+
+    params.mem_model_gb    = 2;
+    params.mem_compute_gb  = 24;
+    params.mem_compute0_gb = 8;
+    return params;
+}
+
+void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help                 show this help message and exit\n");
+    fprintf(stderr, "  --vocab-model FNAME        model path from which to load vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --train-data FNAME         path from 
which to load training data (default '%s')\n", params->fn_train_data); + fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); + fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); + fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); + fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); + fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); + fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult); + fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); + fprintf(stderr, " --layer N Number of layers for new models (default %d)\n", params->n_layer); + fprintf(stderr, " --rotmax N Maximal number Rope dimensions for new models (default %d)\n", params->n_rotmax); + fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); + fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); + fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); + fprintf(stderr, " --predict N Number of tokens to generate after training (default %d)\n", params->n_predict); + fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); + fprintf(stderr, " --print-details-interval N Print details during training each N examples (default %d)\n", params->print_details_interval); + fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off"); + fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); + fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); + fprintf(stderr, " --no-flash Don't use flash attention \n"); + fprintf(stderr, " --use-flash Use flash attention (default)\n"); + fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); + fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n"); + fprintf(stderr, " --no-alloc Don't use allocator\n"); + fprintf(stderr, " --use-alloc Use allocator (default)\n"); + fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); + fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); + fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); + fprintf(stderr, " --cos-decay-min N Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min); + fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. 
(default %d)\n", params->opt_past); + fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta); + fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement); + fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); + fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); + fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); + fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha); + fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim); + fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); + fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); + fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); + fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); + fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); + fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); + fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. 
(default %d)\n", params->mem_compute0_gb); + fprintf(stderr, "\n"); +} + +bool train_params_parse(int argc, char ** argv, struct train_params * params) { + bool invalid_param = false; + std::string arg; + struct train_params default_params = get_default_train_params(); + const std::string arg_prefix = "--"; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (arg == "--vocab-model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_vocab_model = argv[i]; + } else if (arg == "--train-data") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_train_data = argv[i]; + } else if (arg == "--checkpoint-in") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_checkpoint_in = argv[i]; + } else if (arg == "--checkpoint-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_checkpoint_out = argv[i]; + } else if (arg == "--model-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_model_out = argv[i]; + } else if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->seed = std::stoi(argv[i]); + } else if (arg == "-c" || arg == "--ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_ctx = std::stoi(argv[i]); + } else if (arg == "--embd") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_embd = std::stoi(argv[i]); + } else if (arg == "--mult") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_mult = std::stoi(argv[i]); + } else if (arg == "--head") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_head = std::stoi(argv[i]); + } else if (arg == "--layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_layer = std::stoi(argv[i]); + } else if (arg == "--rotmax") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rotmax = std::stoi(argv[i]); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_batch = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--examples") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_examples = std::stoi(argv[i]); + } else if (arg == "--predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_predict = std::stoi(argv[i]); + } else if (arg == "--print-info-interval") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->print_info_interval = std::stoi(argv[i]); + } else if (arg == "--print-details-interval") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->print_details_interval = std::stoi(argv[i]); + } else if (arg == "--samples-after-nl") { + params->samples_start_after_nl = true; + } else if (arg == "--use-lbfgs") { + params->use_adam = false; + } else if (arg == "--use-adam") { + params->use_adam = true; + } else if (arg == "--no-flash") { + params->use_flash = false; + } else if (arg == "--use-flash") { + params->use_flash = true; + } else if (arg == "--no-checkpointing") { + params->use_checkpointing = false; + } else if (arg == "--use-checkpointing") { + params->use_checkpointing = true; + } else if (arg == "--no-alloc") { + params->use_alloc = false; + } 
else if (arg == "--use-alloc") { + params->use_alloc = true; + } else if (arg == "--warmup") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->warmup = std::stoi(argv[i]); + } else if (arg == "--cos-decay-steps") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_steps = std::stof(argv[i]); + } else if (arg == "--cos-decay-restart") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_restart = std::stof(argv[i]); + } else if (arg == "--cos-decay-min") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_min = std::stof(argv[i]); + } else if (arg == "--enable-restart") { + params->enable_restart = true; + } else if (arg == "--disable-restart") { + params->enable_restart = false; + } else if (arg == "--opt-past") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->opt_past = std::stoi(argv[i]); + } else if (arg == "--opt-delta") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->opt_delta = std::stof(argv[i]); + } else if (arg == "--opt-max-no-improvement") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->opt_max_no_improvement = std::stoi(argv[i]); + } else if (arg == "--adam-epsf") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_eps_f = std::stof(argv[i]); + } else if (arg == "--adam-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_n_iter = std::stoi(argv[i]); + } else if (arg == "--adam-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_alpha = std::stof(argv[i]); + } else if (arg == "--adam-min-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_min_alpha = std::stof(argv[i]); + } else if (arg == "--adam-decay") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_decay = std::stof(argv[i]); + } else if (arg == "--adam-decay-min-ndim") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_decay_min_ndim = std::stoi(argv[i]); + } else if (arg == "--adam-beta1") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_beta1 = std::stof(argv[i]); + } else if (arg == "--adam-beta2") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_beta2 = std::stof(argv[i]); + } else if (arg == "--adam-gclip") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_gclip = std::stof(argv[i]); + } else if (arg == "--lbfgs-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lbfgs_n_iter = std::stoi(argv[i]); + } else if (arg == "--mem-model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_model_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute0") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute0_gb = std::stoi(argv[i]); + } else if (arg == "-h" || arg == "--help") { + train_print_usage(argc, argv, &default_params); + exit(0); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + + return true; +} + +struct opt_callback_data { + struct 
train_params * params;
+    struct ggml_opt_context * opt;
+    llama_token * tokens_data;
+    size_t tokens_size;
+    int * samples_data;
+    size_t samples_size;
+    int shuffle_countdown;
+    struct ggml_tensor * tokens_input;
+    struct ggml_tensor * target_logits;
+    struct ggml_tensor * target_probs;
+};
+
+void opt_callback(void * vdata, float * sched) {
+    struct opt_callback_data * data = (struct opt_callback_data *) vdata;
+    struct train_params * params = data->params;
+    struct ggml_opt_context * opt = data->opt;
+    int n_batch = params->n_batch;
+
+    *sched = (opt->iter < params->warmup)
+                ? (float) opt->iter / (float) params->warmup
+                : cosine_decay_restart(
+                    params->cos_decay_steps,
+                    params->cos_decay_min,
+                    opt->iter - params->warmup,
+                    params->cos_decay_restart,
+                    params->enable_restart);
+    float min_sched = params->adam_min_alpha / params->adam_alpha;
+    *sched = min_sched + *sched * (1.0f - min_sched);
+
+    int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
+    printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0);
+
+    if (data->shuffle_countdown < n_batch) {
+        printf("%s: reshuffle samples\n", __func__);
+        shuffle_ints(data->samples_data, data->samples_data + data->samples_size);
+        for (int i = 0; i < (int) data->samples_size; ++i) {
+            GGML_ASSERT(data->samples_data[i]+params->n_ctx-1 < (int) data->tokens_size);
+        }
+        data->shuffle_countdown = data->samples_size;
+    }
+
+    get_example_targets_batch(
+        data->samples_data,
+        data->samples_size,
+        data->tokens_data,
+        data->tokens_size,
+        opt->iter,
+        data->tokens_input,
+        data->target_logits,
+        data->target_probs);
+
+    data->shuffle_countdown -= n_batch;
+}
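+// Worked example of the schedule above, under hypothetical settings
+// warmup=100, adam_alpha=1e-3, adam_min_alpha=1e-4: at iteration 50 the warmup
+// ramp gives sched = 50/100 = 0.5; with min_sched = 1e-4/1e-3 = 0.1 this is
+// rescaled to 0.1 + 0.5*(1 - 0.1) = 0.55, i.e. an effective learning rate of
+// 0.55 * 1e-3 = 5.5e-4. Past warmup, cosine_decay_restart() takes over and the
+// cosine factor decays from 1.0 towards cos_decay_min, restarting if enabled.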
+int main(int argc, char ** argv) {
+    struct train_params params = get_default_train_params();
+
+    if (!train_params_parse(argc, argv, &params)) {
+        return 1;
+    }
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+    printf("%s: seed: %u\n", __func__, params.seed);
+    srand(params.seed);
+
+    struct llama_context_params llama_params = llama_context_default_params();
+    llama_params.vocab_only = true;
+
+    struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
+    struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+
+    struct llama_vocab vocab;
+    {
+        std::vector<const char *> strings;
+        std::vector<float> scores;
+        int n_vocab = llama_n_vocab(lctx);
+        strings.resize(n_vocab, NULL);
+        scores.resize(n_vocab, 0);
+        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
+        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        vocab.id_to_token.resize(n_vocab);
+        for (int i=0; i<n_vocab; ++i) {
+            std::string tok   = std::string(strings[i]);
+            float       score = scores[i];
+            vocab.id_to_token[i].tok   = tok;
+            vocab.id_to_token[i].score = score;
+            vocab.token_to_id.emplace(tok, i);
+        }
+    }
+
+    printf("%s: tokenize training data\n", __func__);
+    std::vector<llama_token> train_tokens;
+    if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) {
+        fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data);
+    }
+    printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size());
+
+    struct my_llama_model model;
+    model.hparams.n_vocab = llama_n_vocab(lctx);
+    model.hparams.n_ctx   = params.n_ctx;
+    model.hparams.n_embd  = params.n_embd;
+    model.hparams.n_mult  = params.n_mult;
+    model.hparams.n_head  = params.n_head;
+    model.hparams.n_layer = params.n_layer;
+    model.hparams.n_rot   = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
+
+    print_params(&model.hparams);
+
+    std::vector<size_t> token_noccurs;
+    std::vector<bool>   token_notavail;
+    token_noccurs.resize(model.hparams.n_vocab, 0);
+    token_notavail.resize(model.hparams.n_vocab, true);
+    for (int i = 0; i < (int) train_tokens.size(); ++i) {
+        ++token_noccurs[train_tokens[i]];
+        token_notavail[train_tokens[i]] = false;
+    }
+
+    std::vector<float> token_freq;
+    token_freq.resize(model.hparams.n_vocab, 0);
+    int n_unique_tokens = 0;
+    for (int i = 0; i < (int) token_noccurs.size(); ++i) {
+        token_freq[i] = (float) token_noccurs[i] / (float) train_tokens.size();
+        n_unique_tokens += (token_noccurs[i] > 0) ? 1 : 0;
+    }
+    printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens);
+
+    struct my_llama_kv_cache kv_self;
+
+    struct ggml_init_params lcparams;
+    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
+    lcparams.mem_buffer = NULL;
+    lcparams.no_alloc   = false;
+
+    model.ctx = ggml_init(lcparams);
+    kv_self.ctx = model.ctx;
+
+    my_llama_sampler sampler;
+
+    int n_tokens = model.hparams.n_ctx;
+    int n_vocab  = model.hparams.n_vocab;
+    int n_batch  = params.n_batch;
+
+    struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context));
+    memset(opt, 0, sizeof(struct ggml_opt_context));
+
+    struct ggml_opt_params opt_params_adam  = ggml_opt_default_params(GGML_OPT_ADAM);
+    struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
+    opt_params_adam.print_forward_graph  = false;
+    opt_params_adam.print_backward_graph = false;
+    opt_params_adam.n_threads            = params.n_threads;
+    opt_params_adam.past                 = params.opt_past;
+    opt_params_adam.delta                = params.opt_delta;
+    opt_params_adam.max_no_improvement   = params.opt_max_no_improvement;
+    opt_params_adam.adam.n_iter          = params.adam_n_iter;
+    opt_params_adam.adam.sched           = 1.0f;
+    opt_params_adam.adam.alpha           = params.adam_alpha;
+    opt_params_adam.adam.decay           = params.adam_decay;
+    opt_params_adam.adam.decay_min_ndim  = params.adam_decay_min_ndim;
+    opt_params_adam.adam.beta1           = params.adam_beta1;
+    opt_params_adam.adam.beta2           = params.adam_beta2;
+    opt_params_adam.adam.gclip           = params.adam_gclip;
+    opt_params_adam.adam.eps_f           = params.adam_eps_f;
+
+    opt_params_lbfgs.print_forward_graph  = false;
+    opt_params_lbfgs.print_backward_graph = false;
+    opt_params_lbfgs.n_threads            = params.n_threads;
+    // these three belong to the L-BFGS options; assigning them to
+    // opt_params_adam here (as before) silently dropped them for L-BFGS runs
+    opt_params_lbfgs.past                 = params.opt_past;
+    opt_params_lbfgs.delta                = params.opt_delta;
+    opt_params_lbfgs.max_no_improvement   = params.opt_max_no_improvement;
+    opt_params_lbfgs.lbfgs.n_iter         = params.lbfgs_n_iter;
+
+    opt->ctx = model.ctx;
+    opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs;
+
+    printf("%s: init model\n", __func__);
+    bool existed = load_checkpoint(&model, opt, params.fn_checkpoint_in, true);
+    set_param_model(&model);
+
+    opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs;
+
+    opt->iter = model.train_its;
+    printf("%s: opt iter %d\n", __func__, opt->iter);
+
+    bool from_scratch = !existed;
+    if (from_scratch) {
+        randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f);
+    }
+
+    init_kv_cache(&kv_self, &model, 1);
+    // init_kv_cache(&kv_self, &model, n_batch);
+    init_sampler(&sampler, lctx);
+
+    printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
+    // ggml_print_tensor_objects(model.ctx);
+
+    // TODO: use std::vector instead of "new"
+    size_t    compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
+    uint8_t * compute_addr = new uint8_t[compute_size];
+
+    size_t    size_buf_0    = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb);
+    uint8_t * compute_buf_0 = new uint8_t[size_buf_0];
+
+    ggml_allocr * alloc = NULL;
+    if (params.use_alloc) {
+        static const size_t tensor_alignment = 32;
+        alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment);
+    }
+
+    GGML_ASSERT(n_tokens < (int) train_tokens.size());
+    std::vector<int> train_samples;
+    train_samples.push_back(0);
+    for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) {
+        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl())) {
+            train_samples.push_back(i);
+        }
+    }
+    shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size());
+    for (int i = 0; i < (int) train_samples.size(); ++i) {
+        GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
+    }
+
+    std::vector<uint8_t> work_buffer;
+
+    printf("%s: begin training\n", __func__);
+
+    struct opt_callback_data opt_cb_data;
+    opt_cb_data.params = &params;
+    opt_cb_data.opt = opt;
+    opt_cb_data.tokens_data  = train_tokens.data();
+    opt_cb_data.tokens_size  = train_tokens.size();
+    opt_cb_data.samples_data = train_samples.data();
+    opt_cb_data.samples_size = train_samples.size();
+    opt_cb_data.shuffle_countdown = train_samples.size();
+    opt_cb_data.tokens_input  = NULL;
+    opt_cb_data.target_logits = NULL;
+    opt_cb_data.target_probs  = NULL;
+
+    int64_t t0 = ggml_time_ms();
+
+    for (int ex = 0; ex < params.n_examples; ++ex) {
+        if (ex*n_batch >= (int) train_samples.size()) {
+            shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size());
+            for (int i = 0; i < (int) train_samples.size(); ++i) {
+                GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
+            }
+        }
+
+        struct ggml_init_params cparams = {
+            compute_size, // mem_size
+            compute_addr, // mem_buffer
+            false,        // no_alloc
+        };
+        struct ggml_context * ctx0 = ggml_init(cparams);
+
+        ggml_set_no_alloc(ctx0, false);
+
+        // don't use alloc for input tensors, so we can safely fill them with data
+        struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
+        //struct ggml_tensor * after_opt_probs      = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
+        struct ggml_tensor * tokens_input  = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
+        struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
+        struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
+
+        ggml_set_no_alloc(ctx0, (alloc != NULL));
+
+        if (alloc) {
+            ggml_allocr_reset(alloc);
+        }
+
+        opt_cb_data.tokens_input  = tokens_input;
+        opt_cb_data.target_logits = target_logits;
+        opt_cb_data.target_probs  = target_probs;
+
+        int n_past = 0;
+
+        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gb = ggml_new_graph(ctx0);
+        struct ggml_cgraph * gb_tmp = params.use_checkpointing
+            ? ggml_new_graph(ctx0)
+            : NULL;
+
+        GGML_ASSERT(n_past == 0);
+
+        struct ggml_tensor * loss   = NULL;
+        struct ggml_tensor * logits = NULL;
+
+        loss = llama_build_train_graphs(
+            &model, alloc, ctx0,
+            gf, gb, gb_tmp,
+            &logits, tokens_input, target_probs,
+            n_tokens, n_batch,
+            params.use_flash,
+            params.use_checkpointing
+        );
+
+        size_t used_mem_before_opt = ggml_used_mem(ctx0);
+
+        opt->params.adam.sched = (opt->iter < params.warmup)
+            ? (float) opt->iter / (float) params.warmup
+            : cosine_decay_restart(
+                params.cos_decay_steps,
+                params.cos_decay_min,
+                opt->iter - params.warmup,
+                params.cos_decay_restart,
+                params.enable_restart);
+
+        float min_sched = params.adam_min_alpha / params.adam_alpha;
+        opt->params.adam.sched = min_sched + opt->params.adam.sched * (1.0f - min_sched);
+
+        printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched);
+
+        ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data);
+
+        size_t used_mem_after_opt = ggml_used_mem(ctx0);
+
+        int n_iter = params.use_adam ? params.adam_n_iter : params.lbfgs_n_iter;
+        model.train_its = opt->iter;
+        model.train_samples += n_batch * n_iter;
+        model.train_tokens  += n_batch * n_tokens * n_iter;
+
+        if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) {
+            printf("Example %d, opt iter %d\n", ex, opt->iter);
+            printf("error_before_opt: %.6f\n", opt->loss_before);
+            printf("error_after_opt:  %.6f\n", opt->loss_after);
+            printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt);
+            printf("used_mem_after_opt:  %zu bytes\n", used_mem_after_opt);
+        }
+
+        if (params.print_details_interval > 0 && ex % params.print_details_interval == 0) {
+            // set_logits_masked(logits, token_notavail, -1e9);
+            for (int i=0; i<n_batch; ++i) {
+                init_sampler(&sampler, lctx);
+                for (int k=0; k<n_tokens; ++k) {
+                    int32_t token = sample(&sampler,
+                        (float *)       ((char *) logits->data + i*logits->nb[2] + k*logits->nb[1]),
+                        (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]),
+                        k);
+                    * ((int32_t *) ((char *) after_opt_best_samples->data + i*after_opt_best_samples->nb[1] + k*after_opt_best_samples->nb[0])) = token;
+                }
+            }
+
+            // printf("probabilities after optimization:\n");
+            // print_matrix(after_opt_probs);
+            printf("Example:\n---\n");
+            print_tokens_batch(lctx, tokens_input);
+            printf("\n---\n");
+
+            // printf("best samples after optimization:\n---\n");
+            printf("samples after optimization:\n---\n");
+            print_tokens_batch(lctx, after_opt_best_samples);
+            printf("\n---\n");
+        }
+
+        ggml_free(ctx0);
+    }
+
+    int64_t t1 = ggml_time_ms();
+    int64_t d  = t1-t0;
+    double  dd = (double) d * 1e-3;
+    printf("%s: total training time=%f seconds\n", __func__, dd);
+
+    if (params.n_examples > 0) {
+        save_checkpoint(&model, opt, params.fn_checkpoint_out);
+    }
+
+    if (strlen(params.fn_model_out) > 0) {
+        save_as_llama_model(&vocab, &model, params.fn_model_out);
+    }
+
+    {
+        int n_gen = params.n_predict;
+        int sample_ctx = n_tokens - n_tokens/8;
+
+        // use defaults from common.h
+        sampler.params.top_k             = 40;
+        sampler.params.top_p             = 0.95f;
+        sampler.params.tfs_z             = 1.00f;
+        sampler.params.typical_p         = 1.00f;
+        sampler.params.temp              = 0.8f;
+        sampler.params.repeat_penalty    = 1.1f;
+        sampler.params.repeat_last_n     = 64;
+        sampler.params.frequency_penalty = 0.0f;
+        sampler.params.presence_penalty  = 0.0f;
+        sampler.params.mirostat          = 0;
+        sampler.params.mirostat_tau      = 5.00f;
+        sampler.params.mirostat_eta      = 0.10f;
+        init_sampler(&sampler, lctx);
+
+        printf("[Prediction context]\n");
+
+        struct ggml_tensor * tokens_input  = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
+        struct ggml_tensor * target_probs  = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
+
+        get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
+        for (int i=sample_ctx; i<n_tokens; ++i) {
+            ggml_set_i32_1d(tokens_input, i, n_vocab/2);
+        }
+
+        for (int i=0; i<n_gen; ++i) {
+            struct ggml_init_params cparams = {
+                compute_size, // mem_size
+                compute_addr, // mem_buffer
+                false,        // no_alloc
+            };
+            struct ggml_context * ctx0 = ggml_init(cparams);
+
+            struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+            int n_past = 0;
+            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
+
+            ggml_build_forward_expand(gf, logits);
+            ggml_graph_compute_helper(work_buffer, gf, params.n_threads);
+
+            // set_logits_masked(logits, token_notavail, -2.0f);
+            int token = sample(&sampler,
+                (float *) ((char *) logits->data + (sample_ctx-1)*logits->nb[1]),
+                (llama_token *) tokens_input->data,
+                sample_ctx-1);
+            //int token = ggml_get_i32_1d(best_samples, sample_ctx-1);
+
+            // print_row(probs, sample_at);
+            print_token(lctx, token);
+
+            lshift_examples(tokens_input, target_logits, target_probs, 1);
+            ggml_set_i32_1d(tokens_input, 0, 0);
+            ggml_set_i32_1d(tokens_input, sample_ctx-1, token);
+
+            ggml_free(ctx0);
+        }
+    }
+
+    if (alloc) {
+        ggml_allocr_free(alloc);
+    }
+
+    delete[] compute_addr;
+    delete[] compute_buf_0;
+    ggml_free(model.ctx);
+    llama_free(lctx);
+    llama_free_model(lmodel);
+    return 0;
+}

From 9eb1ef86534ff61c9d517e3da72243b67a06714d Mon Sep 17 00:00:00 2001
From: xaedes
Date: Tue, 15 Aug 2023 14:03:02 +0200
Subject: [PATCH 067/235] move and remove code

---
 examples/finetune/finetune.cpp | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 11754ffd9442b..c02f3f8591979 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -167,6 +167,17 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
     return tensor;
 }

+struct my_llama_kv_cache {
+    struct ggml_context * ctx = NULL;
+
+    struct ggml_tensor * k;
+    struct ggml_tensor * v;
+
+    // llama_ctx_buffer buf;
+
+    int n; // number of tokens currently in the cache
+};
+
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
@@ -213,17 +224,6 @@ struct my_llama_layer {
     struct ggml_tensor * w3;
 };

-struct my_llama_kv_cache {
-    struct ggml_context * ctx = NULL;
-
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    // llama_ctx_buffer buf;
-
-    int n; // number of tokens currently in the cache
-};
-
 struct my_llama_model {
     struct ggml_context * ctx = NULL;

@@ -1165,14 +1165,6 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
     }
 }

-struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * target) {
-    return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, target, a)));
-}
-
-struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * probs) {
-    return ggml_cross_entropy_loss(ctx, a, probs);
-}
-
 #ifdef __GNUC__
 #ifdef __MINGW32__
 __attribute__((format(gnu_printf, 1, 2)))

From c0a372fd3d338d15333a2489c4b538f51479f20c Mon Sep 17 00:00:00 2001
From: xaedes
Date: Wed, 16 Aug 2023 15:30:31 +0200
Subject: [PATCH 068/235] add API functions to access remaining model
 parameters: mult, head and rot

---
 llama.cpp | 24 ++++++++++++++++++++++++
 llama.h   |  6 ++++++
 2 files changed, 30 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index c5112e5bab1f0..ccec53c86434d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4147,6 +4147,18 @@ int llama_n_embd_from_model(const struct llama_model * model) {
     return model->hparams.n_embd;
 }

+int llama_n_mult_from_model(const struct llama_model * model) {
+    return model->hparams.n_mult;
+}
+
+int llama_n_head_from_model(const struct llama_model * model) {
+    return model->hparams.n_head;
+}
+
+int llama_n_rot_from_model(const
struct llama_model * model) { + return model->hparams.n_rot; +} + int llama_n_layer_from_model(const struct llama_model * model) { return model->hparams.n_layer; } @@ -4163,6 +4175,18 @@ int llama_n_embd(const struct llama_context * ctx) { return ctx->model.hparams.n_embd; } +int llama_n_mult(const struct llama_context * ctx) { + return ctx->model.hparams.n_mult; +} + +int llama_n_head(const struct llama_context * ctx) { + return ctx->model.hparams.n_head; +} + +int llama_n_rot(const struct llama_context * ctx) { + return ctx->model.hparams.n_rot; +} + int llama_n_layer(const struct llama_context * ctx) { return ctx->model.hparams.n_layer; } diff --git a/llama.h b/llama.h index 647f9abdc0fc4..e74279abac093 100644 --- a/llama.h +++ b/llama.h @@ -330,11 +330,17 @@ extern "C" { LLAMA_API int llama_n_vocab(const struct llama_context * ctx); LLAMA_API int llama_n_ctx (const struct llama_context * ctx); LLAMA_API int llama_n_embd (const struct llama_context * ctx); + LLAMA_API int llama_n_mult (const struct llama_context * ctx); + LLAMA_API int llama_n_head (const struct llama_context * ctx); + LLAMA_API int llama_n_rot (const struct llama_context * ctx); LLAMA_API int llama_n_layer(const struct llama_context * ctx); LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); + LLAMA_API int llama_n_mult_from_model (const struct llama_model * model); + LLAMA_API int llama_n_head_from_model (const struct llama_model * model); + LLAMA_API int llama_n_rot_from_model (const struct llama_model * model); LLAMA_API int llama_n_layer_from_model(const struct llama_model * model); // Get the vocabulary as output parameters. 
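A note before the next patch: it introduces LORA finetuning, where every trained tensor is composed as the frozen base weight plus a low-rank product, W_eff = W + B*A. For a rank r adapter, A is r x n_in and B is n_out x r, so the trainable state per weight is r*(n_in+n_out) values instead of n_in*n_out. The usual alpha/r scaling is not applied at this stage of the series yet. A minimal sketch of that composition with plain row-major buffers; the names and dimensions here are illustrative only, while the patch itself builds the same expression with ggml_mul_mat and ggml_add on ggml tensors:

#include <vector>

// W_eff = W + B*A, all matrices dense and row-major (illustrative sketch).
// W: n_out x n_in (frozen base weight)
// A: r x n_in, B: n_out x r (trainable low-rank adapter)
std::vector<float> lora_effective_weight(
        const std::vector<float> & W,
        const std::vector<float> & A,
        const std::vector<float> & B,
        int n_out, int n_in, int r) {
    std::vector<float> W_eff(W); // start from a copy of the frozen weight
    for (int i = 0; i < n_out; ++i) {
        for (int j = 0; j < n_in; ++j) {
            float acc = 0.0f;
            for (int k = 0; k < r; ++k) {
                acc += B[i*r + k] * A[k*n_in + j]; // (B*A)[i][j]
            }
            W_eff[i*n_in + j] += acc; // add the low-rank update
        }
    }
    return W_eff;
}

Because only the A and B factors receive gradients (the patch calls ggml_set_param exclusively on the *_a/*_b tensors and reads the base weights through the accessors added above), the optimizer state also scales with the adapter ranks rather than with the full model size.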
From 28ee0c8583b75164eda40ed4fa428f4e7b34eb49 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Wed, 16 Aug 2023 15:31:04 +0200
Subject: [PATCH 069/235] first draft for LORA finetune training

---
 examples/finetune/finetune.cpp | 984 +++++++++++++++++++++------------
 1 file changed, 618 insertions(+), 366 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index c02f3f8591979..bdc717ffea5b0 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -201,7 +201,7 @@ struct my_llama_hparams {
     uint32_t n_rot   = 64;

     bool operator!=(const my_llama_hparams& other) const {
-        return memcmp(this, &other, sizeof(my_llama_hparams));
+        return memcmp(this, &other, sizeof(other));
     }
 };

@@ -225,8 +225,6 @@ struct my_llama_layer {
     struct ggml_tensor * w3;
 };

 struct my_llama_model {
-    struct ggml_context * ctx = NULL;
-
     my_llama_hparams hparams;

     struct ggml_tensor * tok_embeddings;
@@ -235,6 +233,69 @@ struct my_llama_model {
     struct ggml_tensor * output;

     std::vector<my_llama_layer> layers;
+};
+
+struct my_llama_lora_hparams {
+    uint32_t n_rank_attention_norm = 1;
+    uint32_t n_rank_wq = 4;
+    uint32_t n_rank_wk = 4;
+    uint32_t n_rank_wv = 4;
+    uint32_t n_rank_wo = 4;
+    uint32_t n_rank_ffn_norm = 1;
+    uint32_t n_rank_w1 = 4;
+    uint32_t n_rank_w2 = 4;
+    uint32_t n_rank_w3 = 4;
+    uint32_t n_rank_tok_embeddings = 4;
+    uint32_t n_rank_norm = 1;
+    uint32_t n_rank_output = 4;
+
+    bool operator!=(const my_llama_lora_hparams& other) const {
+        return memcmp(this, &other, sizeof(other));
+    }
+};
+
+struct my_llama_lora_layer {
+    // normalization
+    struct ggml_tensor * attention_norm_a;
+    struct ggml_tensor * attention_norm_b;
+
+    // attention
+    struct ggml_tensor * wq_a;
+    struct ggml_tensor * wq_b;
+    struct ggml_tensor * wk_a;
+    struct ggml_tensor * wk_b;
+    struct ggml_tensor * wv_a;
+    struct ggml_tensor * wv_b;
+    struct ggml_tensor * wo_a;
+    struct ggml_tensor * wo_b;
+
+    // normalization
+    struct ggml_tensor * ffn_norm_a;
+    struct ggml_tensor * ffn_norm_b;
+
+    // ff
+    struct ggml_tensor * w1_a;
+    struct ggml_tensor * w1_b;
+    struct ggml_tensor * w2_a;
+    struct ggml_tensor * w2_b;
+    struct ggml_tensor * w3_a;
+    struct ggml_tensor * w3_b;
+};
+
+struct my_llama_lora {
+    struct ggml_context * ctx = NULL;
+
+    my_llama_lora_hparams hparams;
+
+    struct ggml_tensor * tok_embeddings_a;
+    struct ggml_tensor * tok_embeddings_b;
+
+    struct ggml_tensor * norm_a;
+    struct ggml_tensor * norm_b;
+    struct ggml_tensor * output_a;
+    struct ggml_tensor * output_b;
+
+    std::vector<my_llama_lora_layer> layers;

     uint32_t train_its = 0;
     uint32_t train_samples = 0;
@@ -247,18 +308,41 @@ uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
 }

 void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
-    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
-    printf("%s: n_head:  %d\n", __func__, params->n_head);
-    printf("%s: n_ff:    %d\n", __func__, get_n_ff(params));
-    printf("%s: n_layer: %d\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
-}
-
-void init_model(struct my_llama_model * model) {
-    const auto & hparams = model->hparams;
+    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
+    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
+    printf("%s: n_mult:  %u\n", __func__, params->n_mult);
+    printf("%s: n_head:  %u\n", __func__, params->n_head);
+    printf("%s: n_ff:    %u\n", __func__,
get_n_ff(params)); + printf("%s: n_layer: %u\n", __func__, params->n_layer); + printf("%s: n_rot: %u\n", __func__, params->n_rot); +} + +void print_lora_params(struct my_llama_lora_hparams * params) { + printf("%s: n_rank_attention_norm : %u\n", __func__, params->n_rank_attention_norm); + printf("%s: n_rank_wq : %u\n", __func__, params->n_rank_wq); + printf("%s: n_rank_wk : %u\n", __func__, params->n_rank_wk); + printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv); + printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo); + printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm); + printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1); + printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2); + printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3); + printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); + printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); + printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); +} + +void init_model(const struct llama_model * input, struct my_llama_model * model, uint32_t n_ctx) { + auto & hparams = model->hparams; + + hparams.n_vocab = llama_n_vocab_from_model(input); + hparams.n_ctx = n_ctx; + hparams.n_embd = llama_n_embd_from_model(input); + hparams.n_mult = llama_n_mult_from_model(input); + hparams.n_head = llama_n_head_from_model(input); + hparams.n_layer = llama_n_layer_from_model(input); + hparams.n_rot = llama_n_rot_from_model(input); const uint32_t n_embd = hparams.n_embd; const uint32_t n_layer = hparams.n_layer; @@ -266,106 +350,186 @@ void init_model(struct my_llama_model * model) { const uint32_t n_ff = get_n_ff(&hparams); - struct ggml_context * ctx = model->ctx; - model->train_its = 0; model->train_samples = 0; model->train_tokens = 0; - model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - - ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); - ggml_set_name(model->norm, "norm.weight"); - ggml_set_name(model->output, "output.weight"); + model->tok_embeddings = llama_get_model_tok_embeddings(input); + model->norm = llama_get_model_norm(input); + model->output = llama_get_model_output(input); model->layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { + struct llama_layer * ilayer = llama_get_layer_from_model(input, i); auto & layer = model->layers[i]; - std::string layers_i = "layers." 
+ std::to_string(i); + layer.attention_norm = llama_get_layer_attention_norm(ilayer); - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.wq = llama_get_layer_wq(ilayer); + layer.wk = llama_get_layer_wk(ilayer); + layer.wv = llama_get_layer_wv(ilayer); + layer.wo = llama_get_layer_wo(ilayer); - layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.ffn_norm = llama_get_layer_ffn_norm(ilayer); - layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.w1 = llama_get_layer_w1(ilayer); + layer.w2 = llama_get_layer_w2(ilayer); + layer.w3 = llama_get_layer_w3(ilayer); + } +} - layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); - layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); +void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) { + const auto & lparams = lora->hparams; - ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); + const uint32_t n_embd = model->hparams.n_embd; + const uint32_t n_layer = model->hparams.n_layer; + const uint32_t n_vocab = model->hparams.n_vocab; + const uint32_t n_ff = get_n_ff(&model->hparams); - ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); - ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); - ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); - ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str()); + struct ggml_context * ctx = lora->ctx; - ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); + lora->train_its = 0; + lora->train_samples = 0; + lora->train_tokens = 0; - ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str()); - ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str()); - ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str()); - } -} + lora->tok_embeddings_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_embd); + lora->tok_embeddings_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_vocab); + lora->norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, n_embd); + lora->norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, 1); + lora->output_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_embd); + lora->output_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_vocab); -void set_param_model(struct my_llama_model * model) { - const auto& hparams = model->hparams; + ggml_set_name(lora->tok_embeddings_a, "tok_embeddings.weight.loraA"); + ggml_set_name(lora->tok_embeddings_b, "tok_embeddings.weight.loraB"); + ggml_set_name(lora->norm_a, "norm.weight.loraA"); + ggml_set_name(lora->norm_b, "norm.weight.loraB"); + ggml_set_name(lora->output_a, "output.weight.loraA"); + ggml_set_name(lora->output_b, "output.weight.loraB"); - const uint32_t n_layer = hparams.n_layer; + lora->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = lora->layers[i]; - struct ggml_context* ctx = model->ctx; + std::string layers_i = "layers." 
+ std::to_string(i); - ggml_set_param(ctx, model->tok_embeddings); - ggml_set_param(ctx, model->norm); - ggml_set_param(ctx, model->output); + layer.attention_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, n_embd); + layer.attention_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, 1); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; + layer.wq_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); + layer.wq_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); + layer.wk_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd); + layer.wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd); + layer.wv_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd); + layer.wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd); + layer.wo_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); + layer.wo_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); + + layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd); + layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1); + + layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd); + layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff); + layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff); + layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd); + layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd); + layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff); - ggml_set_param(ctx, layer.attention_norm); - ggml_set_param(ctx, layer.wq); - ggml_set_param(ctx, layer.wk); - ggml_set_param(ctx, layer.wv); - ggml_set_param(ctx, layer.wo); - ggml_set_param(ctx, layer.ffn_norm); - ggml_set_param(ctx, layer.w1); - ggml_set_param(ctx, layer.w2); - ggml_set_param(ctx, layer.w3); + ggml_format_name(layer.attention_norm_a, "%s.attention_norm.weight.loraA", layers_i.c_str()); + ggml_format_name(layer.attention_norm_b, "%s.attention_norm.weight.loraB", layers_i.c_str()); + + ggml_format_name(layer.wq_a, "%s.attention.wq.weight.loraA", layers_i.c_str()); + ggml_format_name(layer.wq_b, "%s.attention.wq.weight.loraB", layers_i.c_str()); + ggml_format_name(layer.wk_a, "%s.attention.wk.weight.loraA", layers_i.c_str()); + ggml_format_name(layer.wk_b, "%s.attention.wk.weight.loraB", layers_i.c_str()); + ggml_format_name(layer.wv_a, "%s.attention.wv.weight.loraA", layers_i.c_str()); + ggml_format_name(layer.wv_b, "%s.attention.wv.weight.loraB", layers_i.c_str()); + ggml_format_name(layer.wo_a, "%s.attention.wo.weight.loraA", layers_i.c_str()); + ggml_format_name(layer.wo_b, "%s.attention.wo.weight.loraB", layers_i.c_str()); + + ggml_format_name(layer.ffn_norm_a, "%s.ffn_norm.weight.loraA", layers_i.c_str()); + ggml_format_name(layer.ffn_norm_b, "%s.ffn_norm.weight.loraB", layers_i.c_str()); + + ggml_format_name(layer.w1_a, "%s.feed_forward.w1_a.weight", layers_i.c_str()); + ggml_format_name(layer.w1_b, "%s.feed_forward.w1_b.weight", layers_i.c_str()); + ggml_format_name(layer.w2_a, "%s.feed_forward.w2_a.weight", layers_i.c_str()); + ggml_format_name(layer.w2_b, "%s.feed_forward.w2_b.weight", layers_i.c_str()); + ggml_format_name(layer.w3_a, "%s.feed_forward.w3_a.weight", layers_i.c_str()); + ggml_format_name(layer.w3_b, 
"%s.feed_forward.w3_b.weight", layers_i.c_str()); } } -void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { - const auto & hparams = model->hparams; +void set_param_lora(struct my_llama_lora * lora) { + const uint32_t n_layer = lora->layers.size(); - const uint32_t n_layer = hparams.n_layer; + struct ggml_context* ctx = lora->ctx; + + ggml_set_param(ctx, lora->tok_embeddings_a); + ggml_set_param(ctx, lora->tok_embeddings_b); + ggml_set_param(ctx, lora->norm_a); + ggml_set_param(ctx, lora->norm_b); + ggml_set_param(ctx, lora->output_a); + ggml_set_param(ctx, lora->output_b); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = lora->layers[i]; + + ggml_set_param(ctx, layer.attention_norm_a); + ggml_set_param(ctx, layer.attention_norm_b); + ggml_set_param(ctx, layer.wq_a); + ggml_set_param(ctx, layer.wq_b); + ggml_set_param(ctx, layer.wk_a); + ggml_set_param(ctx, layer.wk_b); + ggml_set_param(ctx, layer.wv_a); + ggml_set_param(ctx, layer.wv_b); + ggml_set_param(ctx, layer.wo_a); + ggml_set_param(ctx, layer.wo_b); + ggml_set_param(ctx, layer.ffn_norm_a); + ggml_set_param(ctx, layer.ffn_norm_b); + ggml_set_param(ctx, layer.w1_a); + ggml_set_param(ctx, layer.w1_b); + ggml_set_param(ctx, layer.w2_a); + ggml_set_param(ctx, layer.w2_b); + ggml_set_param(ctx, layer.w3_a); + ggml_set_param(ctx, layer.w3_b); + } +} + +void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { + const uint32_t n_layer = lora->layers.size(); struct random_normal_distribution rnd; init_random_normal_distribution(&rnd, seed, mean, std, min, max); - randomize_tensor_normal(model->tok_embeddings, &rnd); - randomize_tensor_normal(model->norm, &rnd); - randomize_tensor_normal(model->output, &rnd); + randomize_tensor_normal(lora->tok_embeddings_a, &rnd); + randomize_tensor_normal(lora->tok_embeddings_b, &rnd); + randomize_tensor_normal(lora->norm_a, &rnd); + randomize_tensor_normal(lora->norm_b, &rnd); + randomize_tensor_normal(lora->output_a, &rnd); + randomize_tensor_normal(lora->output_b, &rnd); for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, &rnd); + auto & layer = lora->layers[i]; + randomize_tensor_normal(layer.attention_norm_a, &rnd); + randomize_tensor_normal(layer.attention_norm_b, &rnd); - randomize_tensor_normal(layer.wq, &rnd); - randomize_tensor_normal(layer.wk, &rnd); - randomize_tensor_normal(layer.wv, &rnd); - randomize_tensor_normal(layer.wo, &rnd); + randomize_tensor_normal(layer.wq_a, &rnd); + randomize_tensor_normal(layer.wq_b, &rnd); + randomize_tensor_normal(layer.wk_a, &rnd); + randomize_tensor_normal(layer.wk_b, &rnd); + randomize_tensor_normal(layer.wv_a, &rnd); + randomize_tensor_normal(layer.wv_b, &rnd); + randomize_tensor_normal(layer.wo_a, &rnd); + randomize_tensor_normal(layer.wo_b, &rnd); - randomize_tensor_normal(layer.ffn_norm, &rnd); + randomize_tensor_normal(layer.ffn_norm_a, &rnd); + randomize_tensor_normal(layer.ffn_norm_b, &rnd); - randomize_tensor_normal(layer.w1, &rnd); - randomize_tensor_normal(layer.w2, &rnd); - randomize_tensor_normal(layer.w3, &rnd); + randomize_tensor_normal(layer.w1_a, &rnd); + randomize_tensor_normal(layer.w1_b, &rnd); + randomize_tensor_normal(layer.w2_a, &rnd); + randomize_tensor_normal(layer.w2_b, &rnd); + randomize_tensor_normal(layer.w3_a, &rnd); + randomize_tensor_normal(layer.w3_b, &rnd); } } @@ -407,6 +571,7 @@ bool init_kv_cache(struct my_llama_kv_cache* cache, struct 
my_llama_model * mode
 struct ggml_tensor * forward(
         struct my_llama_model    * model,
+        struct my_llama_lora     * lora,
         struct my_llama_kv_cache * cache,
         struct ggml_context      * ctx0,
         struct ggml_cgraph       * gf,
@@ -424,20 +589,36 @@ struct ggml_tensor * forward(
     const int n_head   = hparams.n_head;
     const int n_rot    = hparams.n_rot;

+    GGML_ASSERT(n_layer == lora->layers.size());
+
     struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));

     struct ggml_tensor * kc = kv_self.k;
     struct ggml_tensor * vc = kv_self.v;

+    struct ggml_tensor * tok_embeddings = ggml_add(ctx0, model->tok_embeddings, ggml_mul_mat(ctx0, lora->tok_embeddings_a, lora->tok_embeddings_b));
+    struct ggml_tensor * norm           = ggml_add(ctx0, model->norm,           ggml_mul_mat(ctx0, lora->norm_a,           lora->norm_b));
+    struct ggml_tensor * output         = ggml_add(ctx0, model->output,         ggml_mul_mat(ctx0, lora->output_a,         lora->output_b));
+
     // inpL shape [n_embd,N,1,1]
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens);
+    struct ggml_tensor * inpL = ggml_get_rows(ctx0, tok_embeddings, tokens);
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * inpSA = inpL;

         struct ggml_tensor * cur;

         // lctx.use_buf(ctx0, 0);

+        struct ggml_tensor * attention_norm = ggml_add(ctx0, model->layers[il].attention_norm, ggml_mul_mat(ctx0, lora->layers[il].attention_norm_a, lora->layers[il].attention_norm_b));
+        struct ggml_tensor * ffn_norm = ggml_add(ctx0, model->layers[il].ffn_norm, ggml_mul_mat(ctx0, lora->layers[il].ffn_norm_a, lora->layers[il].ffn_norm_b));
+        struct ggml_tensor * wq = ggml_add(ctx0, model->layers[il].wq, ggml_mul_mat(ctx0, lora->layers[il].wq_a, lora->layers[il].wq_b));
+        struct ggml_tensor * wk = ggml_add(ctx0, model->layers[il].wk, ggml_mul_mat(ctx0, lora->layers[il].wk_a, lora->layers[il].wk_b));
+        struct ggml_tensor * wv = ggml_add(ctx0, model->layers[il].wv, ggml_mul_mat(ctx0, lora->layers[il].wv_a, lora->layers[il].wv_b));
+        struct ggml_tensor * wo = ggml_add(ctx0, model->layers[il].wo, ggml_mul_mat(ctx0, lora->layers[il].wo_a, lora->layers[il].wo_b));
+        struct ggml_tensor * w1 = ggml_add(ctx0, model->layers[il].w1, ggml_mul_mat(ctx0, lora->layers[il].w1_a, lora->layers[il].w1_b));
+        struct ggml_tensor * w2 = ggml_add(ctx0, model->layers[il].w2, ggml_mul_mat(ctx0, lora->layers[il].w2_a, lora->layers[il].w2_b));
+        struct ggml_tensor * w3 = ggml_add(ctx0, model->layers[il].w3, ggml_mul_mat(ctx0, lora->layers[il].w3_a, lora->layers[il].w3_b));

         // norm
         {
@@ -446,7 +627,9 @@ struct ggml_tensor * forward(
             // cur = attention_norm*cur
             cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model->layers[il].attention_norm, cur),
+                        ggml_repeat(ctx0,
+                            attention_norm,
+                            cur),
                         cur);
         }

@@ -457,15 +640,16 @@ struct ggml_tensor * forward(
             // wk   shape [n_embd, n_embd, 1, 1]
             // Qcur shape [n_embd/n_head, n_head, N, 1]
             // Kcur shape [n_embd/n_head, n_head, N, 1]
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
+
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
                 // wv   shape [n_embd, n_embd, 1, 1]
                 // Vcur shape [n_embd, N, 1, 1]
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, wv, cur), n_embd, N)));

                 // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
                 // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
@@ -551,7 +735,7 @@ struct ggml_tensor * forward(
             // projection (no bias)
             // cur shape [n_embd,N,1,1]
             cur = ggml_mul_mat(ctx0,
-                    model->layers[il].wo,
+                    wo,
                     cur);
         }

@@ -570,18 +754,20 @@ struct ggml_tensor * forward(
             // cur = ffn_norm*cur
             // cur shape [n_embd,N,1,1]
             cur = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model->layers[il].ffn_norm, cur),
+                    ggml_repeat(ctx0,
+                        ffn_norm,
+                        cur),
                     cur);
         }

         // tmp shape [n_ff,N,1,1]
         struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
-                model->layers[il].w3,
+                w3,
                 cur);

         // cur shape [n_ff,N,1,1]
         cur = ggml_mul_mat(ctx0,
-                model->layers[il].w1,
+                w1,
                 cur);

         // SILU activation
@@ -593,7 +779,7 @@ struct ggml_tensor * forward(

         // cur shape [n_embd,N,1,1]
         cur = ggml_mul_mat(ctx0,
-                model->layers[il].w2,
+                w2,
                 cur);
     }

@@ -614,7 +800,11 @@ struct ggml_tensor * forward(
     // inpL = norm*inpL
     // inpL shape [n_embd,N,1,1]
     inpL = ggml_mul(ctx0,
-                ggml_repeat(ctx0, model->norm, inpL),
+                ggml_repeat(ctx0,
+                    norm,
+                    inpL),
                 inpL);

     //embeddings = inpL;

     // lm_head
     // inpL shape [n_vocab,N,1,1]
-    inpL = ggml_mul_mat(ctx0, model->output, inpL);
+    inpL = ggml_mul_mat(ctx0, output, inpL);

     // run the computation
     ggml_build_forward_expand(gf, inpL);
@@ -854,8 +1044,9 @@ void ggml_build_backward_gradient_checkpointing(
     free_hash_map(replacements);
 }

-struct ggml_tensor * llama_build_train_graphs(
+struct ggml_tensor * llama_build_lora_finetune_graphs(
     struct my_llama_model * model,
+    struct my_llama_lora  * lora,
     struct ggml_allocr    * alloc,
     struct ggml_context   * ctx,
     struct ggml_cgraph    * gf,
@@ -882,6 +1073,8 @@ struct ggml_tensor * llama_build_train_graphs(
     const int n_ff = get_n_ff(&hparams);
     const int rope_mode = 0;

+    GGML_ASSERT(n_layer == lora->layers.size());
+
     auto set_name = [](struct ggml_tensor * t, const char * n) {
         ggml_set_name(t, n);
         if (t->grad) {
@@ -893,8 +1086,14 @@ struct ggml_tensor * llama_build_train_graphs(
     set_name(targets, "targets");

     GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
+
+    struct ggml_tensor * tok_embeddings = ggml_add(ctx, model->tok_embeddings, ggml_mul_mat(ctx, lora->tok_embeddings_a, lora->tok_embeddings_b));
+    struct ggml_tensor * norm           = ggml_add(ctx, model->norm,           ggml_mul_mat(ctx, lora->norm_a,           lora->norm_b));
+    struct ggml_tensor * output         = ggml_add(ctx, model->output,         ggml_mul_mat(ctx, lora->output_a,         lora->output_b));
+
     struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch);  set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch);
-    struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch);
+    struct ggml_tensor * t01 = ggml_get_rows(ctx, tok_embeddings, t00);        set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch);

     struct ggml_tensor * cur = t01;

@@ -911,16 +1110,27 @@ struct ggml_tensor * llama_build_train_graphs(

     for (int il = 0; il < n_layer; ++il) {
         struct my_llama_layer & layer =
model->layers[il];
+        struct my_llama_lora_layer & llayer = lora->layers[il];
+
+        struct ggml_tensor * attention_norm = ggml_add(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b));
+        struct ggml_tensor * ffn_norm = ggml_add(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b));
+        struct ggml_tensor * wq = ggml_add(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
+        struct ggml_tensor * wk = ggml_add(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
+        struct ggml_tensor * wv = ggml_add(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
+        struct ggml_tensor * wo = ggml_add(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
+        struct ggml_tensor * w1 = ggml_add(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b));
+        struct ggml_tensor * w2 = ggml_add(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b));
+        struct ggml_tensor * w3 = ggml_add(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b));
+
         struct ggml_tensor * t02 = ggml_rms_norm     (ctx, cur, rms_norm_eps);                      set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch);
-        struct ggml_tensor * t03 = ggml_repeat       (ctx, layer.attention_norm, t02);              set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch);
+        struct ggml_tensor * t03 = ggml_repeat       (ctx, attention_norm, t02);                    set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch);
         struct ggml_tensor * t04 = ggml_mul          (ctx, t03, t02);                               set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch);
-        struct ggml_tensor * t05 = ggml_mul_mat      (ctx, layer.wq, t04);                          set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch);
+        struct ggml_tensor * t05 = ggml_mul_mat      (ctx, wq, t04);                                set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch);
         struct ggml_tensor * t06 = ggml_reshape_4d   (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
         struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx);   set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t08 = ggml_mul_mat      (ctx, layer.wk, t04);                          set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch);
+        struct ggml_tensor * t08 = ggml_mul_mat      (ctx, wk, t04);                                set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch);
         struct ggml_tensor * t09 = ggml_reshape_4d   (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
         struct ggml_tensor * t10 = ggml_rope_inplace (ctx, t09, n_past, n_rot, rope_mode, n_ctx);   set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t11 = ggml_mul_mat      (ctx, t04, layer.wv);                          set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd);
+        struct ggml_tensor * t11 = ggml_mul_mat      (ctx, t04, wv);                                set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd);
         struct ggml_tensor * t12 = ggml_reshape_4d   (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
         struct ggml_tensor * t13 = ggml_permute      (ctx, t07, 0, 2, 1, 3);                        set_name(t13, "t13"); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
         struct ggml_tensor * t14 = ggml_permute      (ctx, t10, 0, 2, 1, 3);                        set_name(t14, "t14"); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch);
@@ -938,24 +1148,24 @@ struct ggml_tensor * llama_build_train_graphs(
         struct ggml_tensor * t17 = ggml_permute      (ctx, t16, 0, 2, 1, 3);                        set_name(t17, "t17"); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch);
        struct ggml_tensor * t18 = ggml_cont         (ctx, t17);                                    set_name(t18, "t18"); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch);
         struct ggml_tensor * t19 = ggml_reshape_2d   (ctx, t18, n_embd, N*n_batch);                 set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch);
-        struct ggml_tensor * t20 = ggml_mul_mat      (ctx, layer.wo, t19);                          set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch);
+        struct ggml_tensor * t20 = ggml_mul_mat      (ctx, wo, t19);                                set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch);
         struct ggml_tensor * t21 = ggml_add          (ctx, t20, cur);                               set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch);
         struct ggml_tensor * t22 = ggml_rms_norm     (ctx, t21, rms_norm_eps);                      set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
-        struct ggml_tensor * t23 = ggml_repeat       (ctx, layer.ffn_norm, t22);                    set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
+        struct ggml_tensor * t23 = ggml_repeat       (ctx, ffn_norm, t22);                          set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
         struct ggml_tensor * t24 = ggml_mul          (ctx, t23, t22);                               set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
-        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, layer.w3, t24);                          set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
-        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, layer.w1, t24);                          set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
+        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, w3, t24);                                set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
+        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, w1, t24);                                set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
         struct ggml_tensor * t27 = ggml_silu         (ctx, t26);                                    set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
         struct ggml_tensor * t28 = ggml_mul          (ctx, t27, t25);                               set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
-        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, layer.w2, t28);                          set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
+        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, w2, t28);                                set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
         struct ggml_tensor * t30 = ggml_add          (ctx, t29, t21);                               set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
         cur = t30;
         checkpoints.push_back(cur);
     }
     struct ggml_tensor * t31   = ggml_rms_norm       (ctx, cur, rms_norm_eps);                      set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch);
-    struct ggml_tensor * t32   = ggml_repeat         (ctx, model->norm, t31);                       set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch);
+    struct ggml_tensor * t32   = ggml_repeat         (ctx, norm, t31);                              set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch);
     struct ggml_tensor * t33   = ggml_mul            (ctx, t32, t31);                               set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch);
-    struct ggml_tensor * t34   = ggml_mul_mat        (ctx, model->output, t33);                     set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch);
+    struct ggml_tensor * t34   = ggml_mul_mat        (ctx, output, t33);                            set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch);
     struct ggml_tensor * t35   = ggml_reshape_3d     (ctx, t34, n_vocab, N, n_batch);               set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch);
     struct ggml_tensor * t36   = ggml_cross_entropy_loss(ctx, t35, targets);                        set_name(t36, "t36"); assert_shape_1d(t36, 1);

@@ -1432,19 +1642,6 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
     return token;
 }

-void set_logits_masked(struct ggml_tensor * logits, std::vector<bool>& mask, float value) {
-    GGML_ASSERT(logits->ne[0] == (int64_t) mask.size());
-    for (int i2 = 0; i2 <
logits->ne[2]; ++i2) { - for (int i1 = 0; i1 < logits->ne[1]; ++i1) { - for (int i0 = 0; i0 < logits->ne[0]; ++i0) { - if (!mask[i0]) continue; - float * ptr = (float *) ((char *) logits->data + i2*logits->nb[2] + i1*logits->nb[1] + i0*logits->nb[0]); - *ptr = value; - } - } - } -} - void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { if (tensor == NULL) { file->write_u32(0); @@ -1554,87 +1751,6 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) } } -struct ggml_opt_params_v0 { - enum ggml_opt_type type; - int n_threads; - int past; - float delta; - int max_no_improvement; - bool print_forward_graph; - bool print_backward_graph; - struct { - int n_iter; - float sched; - float decay; - float alpha; - float beta1; - float beta2; - float eps; - float eps_f; - float eps_g; - } adam; - struct { - int m; - int n_iter; - int max_linesearch; - float eps; - float ftol; - float wolfe; - float min_step; - float max_step; - enum ggml_linesearch linesearch; - } lbfgs; -}; - -void read_opt_context_v0(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - ggml_opt_params_v0 pv0; - file->read_raw(&pv0, sizeof(pv0)); - opt->params.past = pv0.past; - opt->params.lbfgs.m = pv0.lbfgs.m; - file->read_raw(&opt->nx, sizeof(opt->nx)); - ggml_opt_init(ctx, opt, opt->params, opt->nx); - - file->read_raw(&opt->iter, sizeof(opt->iter)); - opt->just_initialized = (bool) file->read_u32(); - - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - skip_tensor(file); - skip_tensor(file); - skip_tensor(file); - read_tensor(file, opt->adam.m); - read_tensor(file, opt->adam.v); - skip_tensor(file); - skip_tensor(file); - if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } - file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; - case GGML_OPT_LBFGS: - { - GGML_ASSERT(opt->lbfgs.x != NULL); - read_tensor(file, opt->lbfgs.x); - read_tensor(file, opt->lbfgs.xp); - read_tensor(file, opt->lbfgs.g); - read_tensor(file, opt->lbfgs.gp); - read_tensor(file, opt->lbfgs.d); - if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } - read_tensor(file, opt->lbfgs.lmal); - read_tensor(file, opt->lbfgs.lmys); - read_tensor(file, opt->lbfgs.lms); - read_tensor(file, opt->lbfgs.lmy); - file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} - void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { opt->params.past = (int) file->read_u32(); opt->params.lbfgs.m = (int) file->read_u32(); @@ -1683,7 +1799,7 @@ void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struc switch (version) { case 0: { - read_opt_context_v0(file, ctx, opt); + GGML_ASSERT(false); // not supported in finetune } break; case 1: { @@ -1696,49 +1812,73 @@ void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struc } } -void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename) { +void save_checkpoint(struct my_llama_model * model, struct 
my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename) {
     struct llama_file file(filename, "wb");
     if (file.fp == NULL) {
         return;
     }

-    const uint32_t magic   = 'ggcp';
+    const uint32_t magic   = 'ggcl';
     const uint32_t version = 0;

     file.write_u32(magic);
     file.write_u32(version);
-    file.write_u32(model->train_its);
-    file.write_u32(model->train_samples);
-    file.write_u32(model->train_tokens);
+    file.write_u32(lora->train_its);
+    file.write_u32(lora->train_samples);
+    file.write_u32(lora->train_tokens);
     file.write_u32(model->hparams.n_vocab);
     file.write_u32(model->hparams.n_embd);
     file.write_u32(model->hparams.n_mult);
     file.write_u32(model->hparams.n_head);
     file.write_u32(model->hparams.n_layer);
     file.write_u32(model->hparams.n_rot);
-
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output);
-
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
+    file.write_u32(lora->hparams.n_rank_attention_norm);
+    file.write_u32(lora->hparams.n_rank_wq);
+    file.write_u32(lora->hparams.n_rank_wk);
+    file.write_u32(lora->hparams.n_rank_wv);
+    file.write_u32(lora->hparams.n_rank_wo);
+    file.write_u32(lora->hparams.n_rank_ffn_norm);
+    file.write_u32(lora->hparams.n_rank_w1);
+    file.write_u32(lora->hparams.n_rank_w2);
+    file.write_u32(lora->hparams.n_rank_w3);
+    file.write_u32(lora->hparams.n_rank_tok_embeddings);
+    file.write_u32(lora->hparams.n_rank_norm);
+    file.write_u32(lora->hparams.n_rank_output);
+
+    write_tensor(&file, lora->tok_embeddings_a);
+    write_tensor(&file, lora->tok_embeddings_b);
+    write_tensor(&file, lora->norm_a);
+    write_tensor(&file, lora->norm_b);
+    write_tensor(&file, lora->output_a);
+    write_tensor(&file, lora->output_b);
+
+    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
+        auto & layer = lora->layers[i];
+
+        write_tensor(&file, layer.attention_norm_a);
+        write_tensor(&file, layer.attention_norm_b);
+        write_tensor(&file, layer.wq_a);
+        write_tensor(&file, layer.wq_b);
+        write_tensor(&file, layer.wk_a);
+        write_tensor(&file, layer.wk_b);
+        write_tensor(&file, layer.wv_a);
+        write_tensor(&file, layer.wv_b);
+        write_tensor(&file, layer.wo_a);
+        write_tensor(&file, layer.wo_b);
+        write_tensor(&file, layer.ffn_norm_a);
+        write_tensor(&file, layer.ffn_norm_b);
+        write_tensor(&file, layer.w1_a);
+        write_tensor(&file, layer.w1_b);
+        write_tensor(&file, layer.w2_a);
+        write_tensor(&file, layer.w2_b);
+        write_tensor(&file, layer.w3_a);
+        write_tensor(&file, layer.w3_b);
     }

     write_opt_context(&file, opt);
 }

-bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, bool init) {
+bool load_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename, bool init) {
     struct llama_file file(filename, "rb");

     uint32_t magic;
@@ -1751,52 +1891,84 @@ bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * op
     if (file.fp) {
         printf("%s: Loading model from '%s'.\n", __func__, filename);
         magic = file.read_u32();
-        GGML_ASSERT(magic == 'ggcp');
+        GGML_ASSERT(magic == 'ggcl');
         version = file.read_u32();
         GGML_ASSERT(version == 0);
         train_its =
file.read_u32(); train_samples = file.read_u32(); train_tokens = file.read_u32(); - model->hparams.n_vocab = file.read_u32(); - model->hparams.n_embd = file.read_u32(); - model->hparams.n_mult = file.read_u32(); - model->hparams.n_head = file.read_u32(); - model->hparams.n_layer = file.read_u32(); - model->hparams.n_rot = file.read_u32(); + uint32_t n_vocab = file.read_u32(); + uint32_t n_embd = file.read_u32(); + uint32_t n_mult = file.read_u32(); + uint32_t n_head = file.read_u32(); + uint32_t n_layer = file.read_u32(); + uint32_t n_rot = file.read_u32(); + GGML_ASSERT(n_vocab == model->hparams.n_vocab); + GGML_ASSERT(n_embd == model->hparams.n_embd); + GGML_ASSERT(n_mult == model->hparams.n_mult); + GGML_ASSERT(n_head == model->hparams.n_head); + GGML_ASSERT(n_layer == model->hparams.n_layer); + GGML_ASSERT(n_rot == model->hparams.n_rot); + lora->hparams.n_rank_attention_norm = file.read_u32(); + lora->hparams.n_rank_wq = file.read_u32(); + lora->hparams.n_rank_wk = file.read_u32(); + lora->hparams.n_rank_wv = file.read_u32(); + lora->hparams.n_rank_wo = file.read_u32(); + lora->hparams.n_rank_ffn_norm = file.read_u32(); + lora->hparams.n_rank_w1 = file.read_u32(); + lora->hparams.n_rank_w2 = file.read_u32(); + lora->hparams.n_rank_w3 = file.read_u32(); + lora->hparams.n_rank_tok_embeddings = file.read_u32(); + lora->hparams.n_rank_norm = file.read_u32(); + lora->hparams.n_rank_output = file.read_u32(); + print_params(&model->hparams); + print_lora_params(&lora->hparams); } if (init) { - init_model(model); + init_lora(model, lora); } if (file.fp) { - model->train_its = train_its; - model->train_samples = train_samples; - model->train_tokens = train_tokens; + lora->train_its = train_its; + lora->train_samples = train_samples; + lora->train_tokens = train_tokens; } - printf("%s: Training iterations: %u.\n", __func__, model->train_its); - printf("%s: Training samples: %u.\n", __func__, model->train_samples); - printf("%s: Training tokens: %u.\n", __func__, model->train_tokens); + printf("%s: Training iterations: %u.\n", __func__, lora->train_its); + printf("%s: Training samples: %u.\n", __func__, lora->train_samples); + printf("%s: Training tokens: %u.\n", __func__, lora->train_tokens); if (file.fp) { - read_tensor(&file, model->tok_embeddings); - read_tensor(&file, model->norm); - read_tensor(&file, model->output); - - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - read_tensor(&file, layer.attention_norm); - read_tensor(&file, layer.wq); - read_tensor(&file, layer.wk); - read_tensor(&file, layer.wv); - read_tensor(&file, layer.wo); - read_tensor(&file, layer.ffn_norm); - read_tensor(&file, layer.w1); - read_tensor(&file, layer.w2); - read_tensor(&file, layer.w3); + read_tensor(&file, lora->tok_embeddings_a); + read_tensor(&file, lora->tok_embeddings_b); + read_tensor(&file, lora->norm_a); + read_tensor(&file, lora->norm_b); + read_tensor(&file, lora->output_a); + read_tensor(&file, lora->output_b); + + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + + read_tensor(&file, layer.attention_norm_a); + read_tensor(&file, layer.attention_norm_b); + read_tensor(&file, layer.wq_a); + read_tensor(&file, layer.wq_b); + read_tensor(&file, layer.wk_a); + read_tensor(&file, layer.wk_b); + read_tensor(&file, layer.wv_a); + read_tensor(&file, layer.wv_b); + read_tensor(&file, layer.wo_a); + read_tensor(&file, layer.wo_b); + read_tensor(&file, layer.ffn_norm_a); + read_tensor(&file, layer.ffn_norm_b); + 
read_tensor(&file, layer.w1_a);
+            read_tensor(&file, layer.w1_b);
+            read_tensor(&file, layer.w2_a);
+            read_tensor(&file, layer.w2_b);
+            read_tensor(&file, layer.w3_a);
+            read_tensor(&file, layer.w3_b);
         }

         read_opt_context(&file, lora->ctx, opt);
@@ -1805,47 +1977,45 @@ bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * op
     return (file.fp != NULL);
 }

-void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, const char * filename) {
+void save_as_llama_lora(struct my_llama_lora * lora, const char * filename) {
     struct llama_file file(filename, "wb");
     if (file.fp == NULL) {
         return;
     }

     // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
+    file.write_u32(LLAMA_FILE_MAGIC_GGLA);   // magic
+    file.write_u32(1); // version
     // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-    // write_vocab
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
+    file.write_u32(lora->hparams.lora_r);
+    file.write_u32(lora->hparams.lora_alpha);
     // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output);
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
+    write_tensor(&file, lora->tok_embeddings_a);
+    write_tensor(&file, lora->tok_embeddings_b);
+    write_tensor(&file, lora->norm_a);
+    write_tensor(&file, lora->norm_b);
+    write_tensor(&file, lora->output_a);
+    write_tensor(&file, lora->output_b);
+    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
+        auto & layer = lora->layers[i];
+        write_tensor(&file, layer.attention_norm_a);
+        write_tensor(&file, layer.attention_norm_b);
+        write_tensor(&file, layer.wq_a);
+        write_tensor(&file, layer.wq_b);
+        write_tensor(&file, layer.wk_a);
+        write_tensor(&file, layer.wk_b);
+        write_tensor(&file, layer.wv_a);
+        write_tensor(&file, layer.wv_b);
+        write_tensor(&file, layer.wo_a);
+        write_tensor(&file, layer.wo_b);
+        write_tensor(&file, layer.ffn_norm_a);
+        write_tensor(&file, layer.ffn_norm_b);
+        write_tensor(&file, layer.w1_a);
+        write_tensor(&file, layer.w1_b);
+        write_tensor(&file, layer.w2_a);
+        write_tensor(&file, layer.w2_b);
+        write_tensor(&file, layer.w3_a);
+        write_tensor(&file, layer.w3_b);
     }
 }

@@ -1869,26 +2039,36 @@ float cosine_decay_restart(int decay_steps, const float minimum, int step, float
 }

 struct train_params {
-    const char * fn_vocab_model;
+    const char * fn_model_base;
     const char * fn_train_data;
     const char * fn_checkpoint_in;
     const char * fn_checkpoint_out;
-    const char * fn_model_out;
+    const char * fn_lora_out;

     uint32_t seed;

     int n_ctx;
-    int n_embd;
-    int n_mult;
-    int n_head;
-    int n_layer;
-    int
n_rotmax; - int n_threads; int n_batch; int n_examples; int n_predict; + int32_t lora_r; + int32_t lora_alpha; + + int n_rank_attention_norm; + int n_rank_wq; + int n_rank_wk; + int n_rank_wv; + int n_rank_wo; + int n_rank_ffn_norm; + int n_rank_w1; + int n_rank_w2; + int n_rank_w3; + int n_rank_tok_embeddings; + int n_rank_norm; + int n_rank_output; + int print_info_interval; int print_details_interval; @@ -1920,33 +2100,43 @@ struct train_params { float adam_gclip; float adam_eps_f; - int mem_model_gb; + int mem_lora_gb; int mem_compute_gb; int mem_compute0_gb; }; struct train_params get_default_train_params() { struct train_params params; - params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; + params.fn_model_base = ""; params.fn_train_data = "shakespeare.txt"; params.fn_checkpoint_in = "checkpoint.bin"; params.fn_checkpoint_out = "checkpoint.bin"; - params.fn_model_out = "ggml-checkpoint-f32.bin"; + params.fn_lora_out = "ggml-lora-f32.bin"; params.seed = -1; params.n_ctx = 128; - params.n_embd = 256; - params.n_mult = 256; - params.n_head = 8; - params.n_layer = 16; - params.n_rotmax = 64; - params.n_threads = 6; params.n_batch = 8; params.n_examples = 1; params.n_predict = 1024; + params.lora_alpha = 100; + params.lora_r = 100; + + params.n_rank_attention_norm = 1; + params.n_rank_wq = 4; + params.n_rank_wk = 4; + params.n_rank_wv = 4; + params.n_rank_wo = 4; + params.n_rank_ffn_norm = 1; + params.n_rank_w1 = 4; + params.n_rank_w2 = 4; + params.n_rank_w3 = 4; + params.n_rank_tok_embeddings = 4; + params.n_rank_norm = 1; + params.n_rank_output = 4; + params.print_info_interval = 1; params.print_details_interval = 2; @@ -1978,9 +2168,9 @@ struct train_params get_default_train_params() { params.adam_gclip = 1.0f; params.adam_eps_f = 0.0f; - params.mem_model_gb = 2; - params.mem_compute_gb = 24; - params.mem_compute0_gb = 8; + params.mem_lora_gb = 2; + params.mem_compute_gb = 24; + params.mem_compute0_gb = 8; return params; } @@ -1989,22 +2179,30 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); + fprintf(stderr, " --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base); fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); - fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); + fprintf(stderr, " --lora-out FNAME path to save llama lora (default '%s')\n", params->fn_lora_out); fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); - fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); - fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. 
(default %d)\n", params->n_mult); - fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); - fprintf(stderr, " --layer N Number of layers for new models (default %d)\n", params->n_layer); - fprintf(stderr, " --rotmax N Maximal number Rope dimensions for new models (default %d)\n", params->n_rotmax); fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); fprintf(stderr, " --predict N Number of tokens to generate after training (default %d)\n", params->n_predict); + fprintf(stderr, " --lora-alpha N LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha); + fprintf(stderr, " --lora-r N LORA r : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_r); + fprintf(stderr, " --rank-att-norm N LORA rank for attention norm tensor (default %d)\n", params->n_rank_attention_norm); + fprintf(stderr, " --rank-ffn-norm N LORA rank for feed-forward norm tensor (default %d)\n", params->n_rank_ffn_norm); + fprintf(stderr, " --rank-out-norm N LORA rank for output norm tensor (default %d)\n", params->n_rank_norm); + fprintf(stderr, " --rank-tok-embd N LORA rank for token embeddings tensor (default %d)\n", params->n_rank_tok_embeddings); + fprintf(stderr, " --rank-out N LORA rank for output tensor (default %d)\n", params->n_rank_output); + fprintf(stderr, " --rank-wq N LORA rank for wq tensor (default %d)\n", params->n_rank_wq); + fprintf(stderr, " --rank-wk N LORA rank for wk tensor (default %d)\n", params->n_rank_wk); + fprintf(stderr, " --rank-wv N LORA rank for wv tensor (default %d)\n", params->n_rank_wv); + fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor (default %d)\n", params->n_rank_w1); + fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor (default %d)\n", params->n_rank_w2); + fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor (default %d)\n", params->n_rank_w3); fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); fprintf(stderr, " --print-details-interval N Print details during training each N examples (default %d)\n", params->print_details_interval); fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off"); @@ -2035,7 +2233,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); - fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); + fprintf(stderr, " --mem-lora N Memory to allocate for LORA and cache in gigabytes. (default %d)\n", params->mem_lora_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. 
(default %d)\n", params->mem_compute0_gb); fprintf(stderr, "\n"); @@ -2053,12 +2251,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { std::replace(arg.begin(), arg.end(), '_', '-'); } - if (arg == "--vocab-model") { + if (arg == "--model-base") { if (++i >= argc) { invalid_param = true; break; } - params->fn_vocab_model = argv[i]; + params->fn_model_base = argv[i]; } else if (arg == "--train-data") { if (++i >= argc) { invalid_param = true; @@ -2077,12 +2275,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->fn_checkpoint_out = argv[i]; - } else if (arg == "--model-out") { + } else if (arg == "--lora-out") { if (++i >= argc) { invalid_param = true; break; } - params->fn_model_out = argv[i]; + params->fn_lora_out = argv[i]; } else if (arg == "-s" || arg == "--seed") { if (++i >= argc) { invalid_param = true; @@ -2095,60 +2293,108 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_ctx = std::stoi(argv[i]); - } else if (arg == "--embd") { + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_batch = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--examples") { if (++i >= argc) { invalid_param = true; break; } - params->n_embd = std::stoi(argv[i]); - } else if (arg == "--mult") { + params->n_examples = std::stoi(argv[i]); + } else if (arg == "--predict") { if (++i >= argc) { invalid_param = true; break; } - params->n_mult = std::stoi(argv[i]); - } else if (arg == "--head") { + params->n_predict = std::stoi(argv[i]); + } else if (arg == "--lora-alpha") { if (++i >= argc) { invalid_param = true; break; } - params->n_head = std::stoi(argv[i]); - } else if (arg == "--layer") { + params->lora_alpha = std::stoi(argv[i]); + } else if (arg == "--lora-r") { if (++i >= argc) { invalid_param = true; break; } - params->n_layer = std::stoi(argv[i]); - } else if (arg == "--rotmax") { + params->lora_r = std::stoi(argv[i]); + } else if (arg == "--rank-att-norm") { if (++i >= argc) { invalid_param = true; break; } - params->n_rotmax = std::stoi(argv[i]); - } else if (arg == "-t" || arg == "--threads") { + params->n_rank_attention_norm = std::stoi(argv[i]); + } else if (arg == "--rank-ffn-norm") { if (++i >= argc) { invalid_param = true; break; } - params->n_threads = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch") { + params->n_rank_ffn_norm = std::stoi(argv[i]); + } else if (arg == "--rank-out-norm") { if (++i >= argc) { invalid_param = true; break; } - params->n_batch = std::stoi(argv[i]); - } else if (arg == "-n" || arg == "--examples") { + params->n_rank_norm = std::stoi(argv[i]); + } else if (arg == "--rank-tok-embd") { if (++i >= argc) { invalid_param = true; break; } - params->n_examples = std::stoi(argv[i]); - } else if (arg == "--predict") { + params->n_rank_tok_embeddings = std::stoi(argv[i]); + } else if (arg == "--rank-out") { if (++i >= argc) { invalid_param = true; break; } - params->n_predict = std::stoi(argv[i]); + params->n_rank_output = std::stoi(argv[i]); + } else if (arg == "--rank-wq") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_wq = std::stoi(argv[i]); + } else if (arg == "--rank-wk") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_wk = 
std::stoi(argv[i]); + } else if (arg == "--rank-wv") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_wv = std::stoi(argv[i]); + } else if (arg == "--rank-w1") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_w1 = std::stoi(argv[i]); + } else if (arg == "--rank-w2") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_w2 = std::stoi(argv[i]); + } else if (arg == "--rank-w3") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rank_w3 = std::stoi(argv[i]); } else if (arg == "--print-info-interval") { if (++i >= argc) { invalid_param = true; @@ -2285,12 +2531,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->lbfgs_n_iter = std::stoi(argv[i]); - } else if (arg == "--mem-model") { + } else if (arg == "--mem-lora") { if (++i >= argc) { invalid_param = true; break; } - params->mem_model_gb = std::stoi(argv[i]); + params->mem_lora_gb = std::stoi(argv[i]); } else if (arg == "--mem-compute") { if (++i >= argc) { invalid_param = true; @@ -2390,9 +2636,9 @@ int main(int argc, char ** argv) { srand(params.seed); struct llama_context_params llama_params = llama_context_default_params(); - llama_params.vocab_only = true; + llama_params.vocab_only = false; - struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); + struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_params); struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); struct llama_vocab vocab; @@ -2422,15 +2668,23 @@ int main(int argc, char ** argv) { printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); struct my_llama_model model; - model.hparams.n_vocab = llama_n_vocab(lctx); - model.hparams.n_ctx = params.n_ctx; - model.hparams.n_embd = params.n_embd; - model.hparams.n_mult = params.n_mult; - model.hparams.n_head = params.n_head; - model.hparams.n_layer = params.n_layer; - model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); - - print_params(&model.hparams); + init_model(lmodel, &model, params.n_ctx); + + struct my_llama_model lora; + lora.hparams.n_rank_attention_norm = params.n_rank_attention_norm; + lora.hparams.n_rank_wq = params.n_rank_wq; + lora.hparams.n_rank_wk = params.n_rank_wk; + lora.hparams.n_rank_wv = params.n_rank_wv; + lora.hparams.n_rank_wo = params.n_rank_wo; + lora.hparams.n_rank_ffn_norm = params.n_rank_ffn_norm; + lora.hparams.n_rank_w1 = params.n_rank_w1; + lora.hparams.n_rank_w2 = params.n_rank_w2; + lora.hparams.n_rank_w3 = params.n_rank_w3; + lora.hparams.n_rank_tok_embeddings = params.n_rank_tok_embeddings; + lora.hparams.n_rank_norm = params.n_rank_norm; + lora.hparams.n_rank_output = params.n_rank_output; + + print_lora_params(&lora.hparams); std::vector token_noccurs; std::vector token_notavail; @@ -2452,18 +2706,16 @@ int main(int argc, char ** argv) { struct my_llama_kv_cache kv_self; - struct ggml_init_params lcparams; - lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); + lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_lora_gb); lcparams.mem_buffer = NULL; lcparams.no_alloc = false; - model.ctx = ggml_init(lcparams); - kv_self.ctx = model.ctx; + lora.ctx = ggml_init(lcparams); + kv_self.ctx = lora.ctx; my_llama_sampler sampler; - int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; int n_batch = params.n_batch; @@ -2497,12 
+2749,12 @@ int main(int argc, char ** argv) { opt_params_adam.max_no_improvement = params.opt_max_no_improvement; opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; - opt->ctx = model.ctx; + opt->ctx = lora.ctx; opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; printf("%s: init model\n", __func__); bool existed = load_checkpoint(&model, opt, params.fn_checkpoint_in, true); - set_param_model(&model); + set_param_lora(&lora); opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; @@ -2518,8 +2770,8 @@ int main(int argc, char ** argv) { // init_kv_cache(&kv_self, &model, n_batch); init_sampler(&sampler, lctx); - printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx)); - // ggml_print_tensor_objects(model.ctx); + printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(lora.ctx)); + // ggml_print_tensor_objects(lora.ctx); // TODO: use std::vector intead of "new" size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); @@ -2612,8 +2864,8 @@ int main(int argc, char ** argv) { struct ggml_tensor * loss = NULL; struct ggml_tensor * logits = NULL; - loss = llama_build_train_graphs( - &model, alloc, ctx0, + loss = llama_build_lora_finetune_graphs( + &model, &lora, alloc, ctx0, gf, gb, gb_tmp, &logits, tokens_input, target_probs, n_tokens, n_batch, @@ -2688,11 +2940,11 @@ int main(int argc, char ** argv) { printf("%s: total training time=%f seconds\n", __func__, dd); if (params.n_examples > 0) { - save_checkpoint(&model, opt, params.fn_checkpoint_out); + save_checkpoint(&model, &lora, opt, params.fn_checkpoint_out); } - if (strlen(params.fn_model_out) > 0) { - save_as_llama_model(&vocab, &model, params.fn_model_out); + if (strlen(params.fn_lora_out) > 0) { + save_as_llama_lora(&lora, params.fn_lora_out); } { @@ -2716,9 +2968,9 @@ int main(int argc, char ** argv) { printf("[Prediction context]\n"); - struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens); - struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); - struct ggml_tensor * target_probs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); + struct ggml_tensor * tokens_input = ggml_new_tensor_1d(lora.ctx, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * target_logits = ggml_new_tensor_2d(lora.ctx, GGML_TYPE_F32, n_vocab, n_tokens); + struct ggml_tensor * target_probs = ggml_new_tensor_2d(lora.ctx, GGML_TYPE_F32, n_vocab, n_tokens); get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs); for (int i=sample_ctx; i Date: Wed, 16 Aug 2023 16:21:02 +0200 Subject: [PATCH 070/235] remove const model and layer arguments in API functions for accessing model tensors --- llama.cpp | 32 +++++++++++++------------------- llama.h | 34 +++++++++++++++------------------- 2 files changed, 28 insertions(+), 38 deletions(-) diff --git a/llama.cpp b/llama.cpp index ccec53c86434d..92a787096482e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4213,7 +4213,7 @@ int llama_get_vocab( } struct llama_layer * llama_get_layer_from_model( - const struct llama_model * model, + struct llama_model * model, int layer_idx) { if (layer_idx < 0 || layer_idx >= model->hparams.n_layer) { return NULL; @@ -4222,57 +4222,51 @@ struct llama_layer * llama_get_layer_from_model( } } -struct llama_layer * llama_get_layer( - const struct llama_context * ctx, - int layer_idx) { - return 
llama_get_layer_from_model(&ctx->model, layer_idx); -} - -struct ggml_tensor * llama_get_model_tok_embeddings(const struct llama_model * model) { +struct ggml_tensor * llama_get_model_tok_embeddings(struct llama_model * model) { return model->tok_embeddings; } -struct ggml_tensor * llama_get_model_norm(const struct llama_model * model) { +struct ggml_tensor * llama_get_model_norm(struct llama_model * model) { return model->norm; } -struct ggml_tensor * llama_get_model_output(const struct llama_model * model) { +struct ggml_tensor * llama_get_model_output(struct llama_model * model) { return model->output; } -struct ggml_tensor * llama_get_layer_attention_norm(const struct llama_layer * layer) { +struct ggml_tensor * llama_get_layer_attention_norm(struct llama_layer * layer) { return layer->attention_norm; } -struct ggml_tensor * llama_get_layer_wq(const struct llama_layer * layer) { +struct ggml_tensor * llama_get_layer_wq(struct llama_layer * layer) { return layer->wq; } -struct ggml_tensor * llama_get_layer_wk(const struct llama_layer * layer) { +struct ggml_tensor * llama_get_layer_wk(struct llama_layer * layer) { return layer->wk; } -struct ggml_tensor * llama_get_layer_wv(const struct llama_layer * layer) { +struct ggml_tensor * llama_get_layer_wv(struct llama_layer * layer) { return layer->wv; } -struct ggml_tensor * llama_get_layer_wo(const struct llama_layer * layer) { +struct ggml_tensor * llama_get_layer_wo(struct llama_layer * layer) { return layer->wo; } -struct ggml_tensor * llama_get_layer_ffn_norm(const struct llama_layer * layer) { +struct ggml_tensor * llama_get_layer_ffn_norm(struct llama_layer * layer) { return layer->ffn_norm; } -struct ggml_tensor * llama_get_layer_w1(const struct llama_layer * layer) { +struct ggml_tensor * llama_get_layer_w1(struct llama_layer * layer) { return layer->w1; } -struct ggml_tensor * llama_get_layer_w2(const struct llama_layer * layer) { +struct ggml_tensor * llama_get_layer_w2(struct llama_layer * layer) { return layer->w2; } -struct ggml_tensor * llama_get_layer_w3(const struct llama_layer * layer) { +struct ggml_tensor * llama_get_layer_w3(struct llama_layer * layer) { return layer->w3; } diff --git a/llama.h b/llama.h index e74279abac093..365bb185ff451 100644 --- a/llama.h +++ b/llama.h @@ -358,27 +358,23 @@ extern "C" { int capacity); // Get a llama layer - LLAMA_API struct llama_layer * llama_get_layer( - const struct llama_context * ctx, - int layer); - LLAMA_API struct llama_layer * llama_get_layer_from_model( - const struct llama_model * model, - int layer); + struct llama_model * model, + int layer); - LLAMA_API struct ggml_tensor * llama_get_model_tok_embeddings(const struct llama_model * model); - LLAMA_API struct ggml_tensor * llama_get_model_norm (const struct llama_model * model); - LLAMA_API struct ggml_tensor * llama_get_model_output (const struct llama_model * model); - - LLAMA_API struct ggml_tensor * llama_get_layer_attention_norm(const struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_wq (const struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_wk (const struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_wv (const struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_wo (const struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_ffn_norm (const struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_w1 (const struct llama_layer * layer); - LLAMA_API struct ggml_tensor 
* llama_get_layer_w2 (const struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_w3 (const struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_model_tok_embeddings(struct llama_model * model); + LLAMA_API struct ggml_tensor * llama_get_model_norm (struct llama_model * model); + LLAMA_API struct ggml_tensor * llama_get_model_output (struct llama_model * model); + + LLAMA_API struct ggml_tensor * llama_get_layer_attention_norm(struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_wq (struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_wk (struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_wv (struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_wo (struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_ffn_norm (struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_w1 (struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_w2 (struct llama_layer * layer); + LLAMA_API struct ggml_tensor * llama_get_layer_w3 (struct llama_layer * layer); // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row From be7e564b112baca4581b96275112cf35e59570a7 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 16 Aug 2023 16:21:43 +0200 Subject: [PATCH 071/235] bug fixes to make finetune compile automatic allocator does not work yet --- examples/finetune/finetune.cpp | 40 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index bdc717ffea5b0..d7c0f3623ee13 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -236,6 +236,8 @@ struct my_llama_model { }; struct my_llama_lora_hparams { + uint32_t lora_r = 1; + uint32_t lora_alpha = 1; uint32_t n_rank_attention_norm = 1; uint32_t n_rank_wq = 4; uint32_t n_rank_wk = 4; @@ -333,7 +335,7 @@ void print_lora_params(struct my_llama_lora_hparams * params) { printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); } -void init_model(const struct llama_model * input, struct my_llama_model * model, uint32_t n_ctx) { +void init_model(struct llama_model * input, struct my_llama_model * model, uint32_t n_ctx) { auto & hparams = model->hparams; hparams.n_vocab = llama_n_vocab_from_model(input); @@ -350,10 +352,6 @@ void init_model(const struct llama_model * input, struct my_llama_model * model, const uint32_t n_ff = get_n_ff(&hparams); - model->train_its = 0; - model->train_samples = 0; - model->train_tokens = 0; - model->tok_embeddings = llama_get_model_tok_embeddings(input); model->norm = llama_get_model_norm(input); model->output = llama_get_model_output(input); @@ -589,7 +587,7 @@ struct ggml_tensor * forward( const int n_head = hparams.n_head; const int n_rot = hparams.n_rot; - GGML_ASSERT(n_layer == lora.layers.size()); + GGML_ASSERT(n_layer == lora->layers.size()); struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); @@ -801,9 +799,7 @@ struct ggml_tensor * forward( // inpL shape [n_embd,N,1,1] inpL = ggml_mul(ctx0, ggml_repeat(ctx0, - ggml_add(ctx0, - model->norm, - lora->norm), + norm, inpL), inpL); @@ -1073,7 +1069,7 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( const int n_ff = get_n_ff(&hparams); const int rope_mode = 0; - GGML_ASSERT(n_layer == 
lora.layers.size()); + GGML_ASSERT(n_layer == lora->layers.size()); auto set_name = [](struct ggml_tensor * t, const char * n) { ggml_set_name(t, n); @@ -1117,6 +1113,7 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * wq = ggml_add(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); struct ggml_tensor * wk = ggml_add(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); struct ggml_tensor * wv = ggml_add(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); + struct ggml_tensor * wo = ggml_add(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); struct ggml_tensor * w1 = ggml_add(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b)); struct ggml_tensor * w2 = ggml_add(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b)); struct ggml_tensor * w3 = ggml_add(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b)); @@ -1878,7 +1875,7 @@ void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, write_opt_context(&file, opt); } -bool load_checkpoint(struct my_llama_lora * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename, bool init) { +bool load_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename, bool init) { struct llama_file file(filename, "rb"); uint32_t magic; @@ -1971,7 +1968,7 @@ bool load_checkpoint(struct my_llama_lora * model, struct my_llama_lora * lora, read_tensor(&file, layer.w3_b); } - read_opt_context(&file, model->ctx, opt); + read_opt_context(&file, lora->ctx, opt); } return (file.fp != NULL); @@ -2638,6 +2635,7 @@ int main(int argc, char ** argv) { struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = false; + printf("%s: model base = '%s'\n", __func__, params.fn_model_base); struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_params); struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); @@ -2670,7 +2668,9 @@ int main(int argc, char ** argv) { struct my_llama_model model; init_model(lmodel, &model, params.n_ctx); - struct my_llama_model lora; + struct my_llama_lora lora; + lora.hparams.lora_r = params.lora_r; + lora.hparams.lora_alpha = params.lora_alpha; lora.hparams.n_rank_attention_norm = params.n_rank_attention_norm; lora.hparams.n_rank_wq = params.n_rank_wq; lora.hparams.n_rank_wk = params.n_rank_wk; @@ -2753,17 +2753,17 @@ int main(int argc, char ** argv) { opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; printf("%s: init model\n", __func__); - bool existed = load_checkpoint(&model, opt, params.fn_checkpoint_in, true); + bool existed = load_checkpoint(&model, &lora, opt, params.fn_checkpoint_in, true); set_param_lora(&lora); opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; - opt->iter = model.train_its; + opt->iter = lora.train_its; printf("%s: opt iter %d\n", __func__, opt->iter); bool from_scratch = !existed; if (from_scratch) { - randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); + randomize_lora(&lora, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); } init_kv_cache(&kv_self, &model, 1); @@ -2894,9 +2894,9 @@ int main(int argc, char ** argv) { size_t used_mem_after_opt = ggml_used_mem(ctx0); int n_iter = params.use_adam ? 
params.adam_n_iter : params.lbfgs_n_iter; - model.train_its = opt->iter; - model.train_samples += n_batch * n_iter; - model.train_tokens += n_batch * n_tokens * n_iter; + lora.train_its = opt->iter; + lora.train_samples += n_batch * n_iter; + lora.train_tokens += n_batch * n_tokens * n_iter; if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); @@ -2993,7 +2993,7 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = ggml_new_graph(ctx0); int n_past = 0; - struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); + struct ggml_tensor * logits = forward(&model, &lora, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); ggml_build_forward_expand(gf, logits); ggml_graph_compute_helper(work_buffer, gf, params.n_threads); From 620275361dfd718a4f7465ab0dca2bf6e6b25789 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 16 Aug 2023 16:23:21 +0200 Subject: [PATCH 072/235] add debug prints for training memory improvements --- ggml-alloc.c | 25 ++++++++++++++++++++++++- ggml.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index ddf973daec7e4..438db4537579e 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -162,12 +162,22 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) printf("\n"); } #endif - + if ((char*)addr - (char*)alloc->data + size > alloc->max_size) { + printf("%s: op=%s name=%s max_size=%zu\n", __func__, ggml_op_name(tensor->op), ggml_get_name(tensor), (char*)addr - (char*)alloc->data + size); + } alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size); } // this is a very naive implementation, but for our case the number of free blocks should be very small static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + // static int counter = 0; + // counter++; + // if (counter > 2) { + // printf("%s: counter=%d OMIT\n", __func__, counter); + // return; + // } else { + // printf("%s: counter=%d\n", __func__, counter); + // } void * ptr = tensor->data; if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) { @@ -179,6 +189,7 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size = aligned_offset(NULL, size, alloc->alignment); + // printf("%s: free data=[%p..%p] op=%s name=%s n_free_blocks=%d\n", __func__, tensor->data, (char*) tensor->data + size, ggml_op_name(tensor->op), ggml_get_name(tensor), alloc->n_free_blocks); AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks); #ifdef GGML_ALLOCATOR_DEBUG @@ -478,11 +489,23 @@ static size_t ggml_allocator_alloc_graph_tensors_n( if (parent == NULL) { break; } + bool was_null = parent->data == NULL; allocate_node(alloc, parent); + // if (was_null) { + // printf("%s: alloc n[%02d] %d data=[%p..%p] %s %s\n", __func__, i, j, parent->data, (char*) parent->data + ggml_nbytes(parent), ggml_op_name(parent->op), ggml_get_name(parent)); + // } else { + // printf("%s: exist n[%02d] %d data=[%p..%p] %s %s\n", __func__, i, j, parent->data, (char*) parent->data + ggml_nbytes(parent), ggml_op_name(parent->op), ggml_get_name(parent)); + // } } // allocate node + bool was_null = node->data == NULL; allocate_node(alloc, node); + // if (was_null) { + // printf("%s: alloc 
node[%02d] data=[%p..%p] %s %s\n", __func__, i, node->data, (char*) node->data + ggml_nbytes(node), ggml_op_name(node->op), ggml_get_name(node));
+        // } else {
+        //     printf("%s: exist node[%02d] data=[%p..%p] %s %s\n", __func__, i, node->data, (char*) node->data + ggml_nbytes(node), ggml_op_name(node->op), ggml_get_name(node));
+        // }
 
         AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
diff --git a/ggml.c b/ggml.c
index 79098a2fccb38..030649eeff450 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16557,6 +16557,7 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
         struct ggml_tensor * grad = cgraph->grads[i];
 
         if (grad) {
+            // printf("%s: set_zero data=[%p] op=%s name=%s\n", __func__, grad->data, ggml_op_name(grad->op), ggml_get_name(grad));
             ggml_set_zero(grad);
         }
     }
@@ -17312,6 +17313,48 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g
 // ref: https://arxiv.org/pdf/1412.6980.pdf
 //
 
+uint32_t compute_data_checksum(struct ggml_tensor * tensor) {
+    const int n3 = (tensor->n_dims >= 3) ? tensor->ne[3] : 1;
+    const int n2 = (tensor->n_dims >= 2) ? tensor->ne[2] : 1;
+    const int n1 = (tensor->n_dims >= 1) ? tensor->ne[1] : 1;
+    const int n0 = (tensor->n_dims >= 0) ? tensor->ne[0] : 1;
+    const size_t nb0 = tensor->nb[0];
+    const size_t nb1 = tensor->nb[1];
+    const size_t nb2 = tensor->nb[2];
+    const size_t nb3 = tensor->nb[3];
+    const size_t nb = ggml_element_size(tensor);
+    uint32_t result = 0;
+    for (int i3 = 0; i3 < n3; ++i3) {
+        for (int i2 = 0; i2 < n2; ++i2) {
+            for (int i1 = 0; i1 < n1; ++i1) {
+                for (int i0 = 0; i0 < n0; ++i0) {
+                    char * ptr = ((char *) tensor->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+                    uint32_t val;
+                    memcpy(&val, ptr, nb);
+                    result = result ^ val;
+                    result = (((result << 1u) | ((result >> 31u) & 0x1u)) + 1u) & 0xffffffffu;
+                }
+            }
+        }
+    }
+    return result;
+}
+
+void print_data_checksums(struct ggml_cgraph * g) {
+    for (int i = 0; i < g->n_nodes; ++i) {
+        struct ggml_tensor * node = g->nodes[i];
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
+            if (node->src[j]) {
+                struct ggml_tensor * src = node->src[j];
+                uint32_t chk = compute_data_checksum(src);
+                printf("%s: node[%3d]->src[%d] chk=[%08x] data=[%p] op=%s name=%s\n", __func__, i, j, chk, src->data, ggml_op_name(src->op), ggml_get_name(src));
+            }
+        }
+        uint32_t chk = compute_data_checksum(node);
+        printf("%s: node[%3d] chk=[%08x] data=[%p] op=%s name=%s\n", __func__, i, chk, node->data, ggml_op_name(node->op), ggml_get_name(node));
+    }
+}
+
 static enum ggml_opt_result ggml_opt_adam(
         struct ggml_context * ctx,
         struct ggml_opt_context * opt,
@@ -17373,6 +17416,8 @@ static enum ggml_opt_result ggml_opt_adam(
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
 
     ggml_graph_compute(gb, &cplan);
+    print_data_checksums(gb);
+
     opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
     opt->adam.fx_best = opt->adam.fx_prev;
     if (pf) {
@@ -17434,6 +17479,8 @@ static enum ggml_opt_result ggml_opt_adam(
         const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
         int64_t i = 0;
         for (int p = 0; p < np; ++p) {
+            printf("%s: para[%3d] chk=[%08x] op=%s name=%s\n", __func__, p, compute_data_checksum(ps[p]), ggml_op_name(ps[p]->op), ggml_get_name(ps[p]));
+            printf("%s: para[%3d]->grad chk=[%08x] op=%s name=%s\n", __func__, p, compute_data_checksum(ps[p]->grad), ggml_op_name(ps[p]->grad->op), ggml_get_name(ps[p]->grad));
             const int64_t ne = ggml_nelements(ps[p]);
             const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? 
decay : 0.0) * sched; for (int64_t j = 0; j < ne; ++j) { @@ -17512,6 +17559,11 @@ static enum ggml_opt_result ggml_opt_adam( } } + print_data_checksums(gb); + for (int p = 0; p < np; ++p) { + printf("%s: para[%3d] chk=[%08x] op=%s name=%s\n", __func__, p, compute_data_checksum(ps[p]), ggml_op_name(ps[p]->op), ggml_get_name(ps[p])); + printf("%s: para[%3d]->grad chk=[%08x] op=%s name=%s\n", __func__, p, compute_data_checksum(ps[p]->grad), ggml_op_name(ps[p]->grad->op), ggml_get_name(ps[p]->grad)); + } return GGML_OPT_DID_NOT_CONVERGE; } From 0ab2507ce555d7688671c6544b61cf531a34692c Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 16 Aug 2023 16:41:20 +0200 Subject: [PATCH 073/235] fix names of lora tensors --- examples/finetune/finetune.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index d7c0f3623ee13..83ee4df026d67 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -447,12 +447,12 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) ggml_format_name(layer.ffn_norm_a, "%s.ffn_norm.weight.loraA", layers_i.c_str()); ggml_format_name(layer.ffn_norm_b, "%s.ffn_norm.weight.loraB", layers_i.c_str()); - ggml_format_name(layer.w1_a, "%s.feed_forward.w1_a.weight", layers_i.c_str()); - ggml_format_name(layer.w1_b, "%s.feed_forward.w1_b.weight", layers_i.c_str()); - ggml_format_name(layer.w2_a, "%s.feed_forward.w2_a.weight", layers_i.c_str()); - ggml_format_name(layer.w2_b, "%s.feed_forward.w2_b.weight", layers_i.c_str()); - ggml_format_name(layer.w3_a, "%s.feed_forward.w3_a.weight", layers_i.c_str()); - ggml_format_name(layer.w3_b, "%s.feed_forward.w3_b.weight", layers_i.c_str()); + ggml_format_name(layer.w1_a, "%s.feed_forward.w1.weight.loraA", layers_i.c_str()); + ggml_format_name(layer.w1_b, "%s.feed_forward.w1.weight.loraB", layers_i.c_str()); + ggml_format_name(layer.w2_a, "%s.feed_forward.w2.weight.loraA", layers_i.c_str()); + ggml_format_name(layer.w2_b, "%s.feed_forward.w2.weight.loraB", layers_i.c_str()); + ggml_format_name(layer.w3_a, "%s.feed_forward.w3.weight.loraA", layers_i.c_str()); + ggml_format_name(layer.w3_b, "%s.feed_forward.w3.weight.loraB", layers_i.c_str()); } } From 39a2d154610a743df8fec7a242e7d8abe8676071 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 16 Aug 2023 16:42:25 +0200 Subject: [PATCH 074/235] avoid stack overflow resulting from big ggml_cgraph replace stack allocation and ggml_build_forward by ggml_new_graph in combination with ggml_build_forward_expand --- llama.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 92a787096482e..35ea68075de1d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3653,9 +3653,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const ggml_set_name(r, "r_cpy"); } - struct ggml_cgraph gf = ggml_build_forward(r); + struct ggml_cgraph * gf = ggml_new_graph(lora_ctx); + ggml_build_forward_expand(gf, r); - ggml_graph_compute_helper(work_buffer, &gf, n_threads); + ggml_graph_compute_helper(work_buffer, gf, n_threads); // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); From 1151653b159a679a5ec47c177e7c8aab4b8ed2ad Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 16 Aug 2023 21:36:40 +0200 Subject: [PATCH 075/235] replace llama API functions to get model tensors by one function to get model tensor by name LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct 
llama_model * model, const char * name); --- examples/finetune/finetune.cpp | 51 ++++++++++++++++++++---------- llama.cpp | 58 ++-------------------------------- llama.h | 21 ++---------- 3 files changed, 38 insertions(+), 92 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 83ee4df026d67..05174b94065be 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -339,12 +339,12 @@ void init_model(struct llama_model * input, struct my_llama_model * model, uint3 auto & hparams = model->hparams; hparams.n_vocab = llama_n_vocab_from_model(input); - hparams.n_ctx = n_ctx; - hparams.n_embd = llama_n_embd_from_model(input); - hparams.n_mult = llama_n_mult_from_model(input); - hparams.n_head = llama_n_head_from_model(input); + hparams.n_ctx = n_ctx; + hparams.n_embd = llama_n_embd_from_model(input); + hparams.n_mult = llama_n_mult_from_model(input); + hparams.n_head = llama_n_head_from_model(input); hparams.n_layer = llama_n_layer_from_model(input); - hparams.n_rot = llama_n_rot_from_model(input); + hparams.n_rot = llama_n_rot_from_model(input); const uint32_t n_embd = hparams.n_embd; const uint32_t n_layer = hparams.n_layer; @@ -352,27 +352,44 @@ void init_model(struct llama_model * input, struct my_llama_model * model, uint3 const uint32_t n_ff = get_n_ff(&hparams); - model->tok_embeddings = llama_get_model_tok_embeddings(input); - model->norm = llama_get_model_norm(input); - model->output = llama_get_model_output(input); + model->tok_embeddings = llama_get_model_tensor(input, "tok_embeddings.weight"); + model->norm = llama_get_model_tensor(input, "norm.weight"); + model->output = llama_get_model_tensor(input, "output.weight"); model->layers.resize(n_layer); + + char name[GGML_MAX_NAME]; + for (uint32_t i = 0; i < n_layer; ++i) { struct llama_layer * ilayer = llama_get_layer_from_model(input, i); auto & layer = model->layers[i]; - layer.attention_norm = llama_get_layer_attention_norm(ilayer); + snprintf(name, GGML_MAX_NAME, "layers.%d.attention_norm.weight", i); + layer.attention_norm = llama_get_model_tensor(input, name); + + snprintf(name, GGML_MAX_NAME, "layers.%d.attention.wq.weight", i); + layer.wq = llama_get_model_tensor(input, name); + + snprintf(name, GGML_MAX_NAME, "layers.%d.attention.wk.weight", i); + layer.wk = llama_get_model_tensor(input, name); + + snprintf(name, GGML_MAX_NAME, "layers.%d.attention.wv.weight", i); + layer.wv = llama_get_model_tensor(input, name); + + snprintf(name, GGML_MAX_NAME, "layers.%d.attention.wo.weight", i); + layer.wo = llama_get_model_tensor(input, name); - layer.wq = llama_get_layer_wq(ilayer); - layer.wk = llama_get_layer_wk(ilayer); - layer.wv = llama_get_layer_wv(ilayer); - layer.wo = llama_get_layer_wo(ilayer); + snprintf(name, GGML_MAX_NAME, "layers.%d.ffn_norm.weight", i); + layer.ffn_norm = llama_get_model_tensor(input, name); - layer.ffn_norm = llama_get_layer_ffn_norm(ilayer); + snprintf(name, GGML_MAX_NAME, "layers.%d.feed_forward.w1.weight", i); + layer.w1 = llama_get_model_tensor(input, name); - layer.w1 = llama_get_layer_w1(ilayer); - layer.w2 = llama_get_layer_w2(ilayer); - layer.w3 = llama_get_layer_w3(ilayer); + snprintf(name, GGML_MAX_NAME, "layers.%d.feed_forward.w2.weight", i); + layer.w2 = llama_get_model_tensor(input, name); + + snprintf(name, GGML_MAX_NAME, "layers.%d.feed_forward.w3.weight", i); + layer.w3 = llama_get_model_tensor(input, name); } } diff --git a/llama.cpp b/llama.cpp index 35ea68075de1d..6af1e003ca40b 100644 --- a/llama.cpp +++ b/llama.cpp @@ 
-4213,62 +4213,8 @@ int llama_get_vocab( return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity); } -struct llama_layer * llama_get_layer_from_model( - struct llama_model * model, - int layer_idx) { - if (layer_idx < 0 || layer_idx >= model->hparams.n_layer) { - return NULL; - } else { - return &model->layers[layer_idx]; - } -} - -struct ggml_tensor * llama_get_model_tok_embeddings(struct llama_model * model) { - return model->tok_embeddings; -} - -struct ggml_tensor * llama_get_model_norm(struct llama_model * model) { - return model->norm; -} - -struct ggml_tensor * llama_get_model_output(struct llama_model * model) { - return model->output; -} - -struct ggml_tensor * llama_get_layer_attention_norm(struct llama_layer * layer) { - return layer->attention_norm; -} - -struct ggml_tensor * llama_get_layer_wq(struct llama_layer * layer) { - return layer->wq; -} - -struct ggml_tensor * llama_get_layer_wk(struct llama_layer * layer) { - return layer->wk; -} - -struct ggml_tensor * llama_get_layer_wv(struct llama_layer * layer) { - return layer->wv; -} - -struct ggml_tensor * llama_get_layer_wo(struct llama_layer * layer) { - return layer->wo; -} - -struct ggml_tensor * llama_get_layer_ffn_norm(struct llama_layer * layer) { - return layer->ffn_norm; -} - -struct ggml_tensor * llama_get_layer_w1(struct llama_layer * layer) { - return layer->w1; -} - -struct ggml_tensor * llama_get_layer_w2(struct llama_layer * layer) { - return layer->w2; -} - -struct ggml_tensor * llama_get_layer_w3(struct llama_layer * layer) { - return layer->w3; +struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) { + return ggml_get_tensor(model->ctx, name); } float * llama_get_logits(struct llama_context * ctx) { diff --git a/llama.h b/llama.h index 365bb185ff451..bb6c3c107ce39 100644 --- a/llama.h +++ b/llama.h @@ -69,7 +69,6 @@ extern "C" { struct llama_model; struct llama_context; - struct llama_layer; typedef int llama_token; @@ -357,24 +356,8 @@ extern "C" { float * scores, int capacity); - // Get a llama layer - LLAMA_API struct llama_layer * llama_get_layer_from_model( - struct llama_model * model, - int layer); - - LLAMA_API struct ggml_tensor * llama_get_model_tok_embeddings(struct llama_model * model); - LLAMA_API struct ggml_tensor * llama_get_model_norm (struct llama_model * model); - LLAMA_API struct ggml_tensor * llama_get_model_output (struct llama_model * model); - - LLAMA_API struct ggml_tensor * llama_get_layer_attention_norm(struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_wq (struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_wk (struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_wv (struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_wo (struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_ffn_norm (struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_w1 (struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_w2 (struct llama_layer * layer); - LLAMA_API struct ggml_tensor * llama_get_layer_w3 (struct llama_layer * layer); + // Get a llama model tensor + LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name); // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row From 79ad888768d17837e4a8fd9ae22164b652d688f9 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 16 
Aug 2023 21:56:36 +0200 Subject: [PATCH 076/235] remove unused call to not existing llama_get_layer_from_model --- examples/finetune/finetune.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 05174b94065be..8884656ab5605 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -361,7 +361,6 @@ void init_model(struct llama_model * input, struct my_llama_model * model, uint3 char name[GGML_MAX_NAME]; for (uint32_t i = 0; i < n_layer; ++i) { - struct llama_layer * ilayer = llama_get_layer_from_model(input, i); auto & layer = model->layers[i]; snprintf(name, GGML_MAX_NAME, "layers.%d.attention_norm.weight", i); From 83cb9ed4f563249c319040e2cac786f6b17658bc Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 16 Aug 2023 22:00:37 +0200 Subject: [PATCH 077/235] implement ggml_compute_forward_out_prod_q_f32 --- ggml.c | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 130 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index 030649eeff450..1e2ba6de04a91 100644 --- a/ggml.c +++ b/ggml.c @@ -10623,8 +10623,8 @@ static void ggml_compute_forward_out_prod_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); + // int64_t t0 = ggml_perf_time_us(); + // UNUSED(t0); GGML_TENSOR_BINARY_OP_LOCALS; @@ -10725,6 +10725,116 @@ static void ggml_compute_forward_out_prod_f32( //} } +static void ggml_compute_forward_out_prod_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + // int64_t t0 = ggml_perf_time_us(); + // UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 dim0 + GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); + + // dst dim0 cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod + // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) + + if (params->type == GGML_TASK_INIT) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by last three dimensions + + // total rows in dst + const int64_t nr = ne1*ne2*ne3; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i1: + // for i01: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith; + + for (int64_t ir = ir0; ir < ir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - 
i2*ne1); + + const int64_t i02 = i2; + const int64_t i03 = i3; + + //const int64_t i10 = i1; + const int64_t i12 = i2; + const int64_t i13 = i3; + + for (int64_t i01 = 0; i01 < ne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + dequantize_row_q(s0, wdata, ne0); + ggml_vec_mad_f32(ne0, d, wdata, *s1); + } + } + + //int64_t t1 = ggml_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + static void ggml_compute_forward_out_prod( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10736,10 +10846,13 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: { - GGML_ASSERT(false); // todo - // ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); + ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); } break; case GGML_TYPE_F16: { @@ -16216,7 +16329,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { n_tasks = n_threads; } break; case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: { n_tasks = n_threads; @@ -16258,6 +16370,18 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { cur = 0; } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + size_t cur = 0; + + if (ggml_is_quantized(node->src[0]->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks; + } + work_size = MAX(work_size, cur); } break; case GGML_OP_SCALE: From 83a4ad79869ee3602a60cd10edf0618ad565f31e Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 16 Aug 2023 22:05:41 +0200 Subject: [PATCH 078/235] remove trailing whitespace --- examples/finetune/finetune.cpp | 2 +- ggml.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 8884656ab5605..2e368fd9bcced 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -386,7 +386,7 @@ void init_model(struct llama_model * input, struct my_llama_model * model, uint3 snprintf(name, GGML_MAX_NAME, "layers.%d.feed_forward.w2.weight", i); layer.w2 = llama_get_model_tensor(input, name); - + snprintf(name, GGML_MAX_NAME, "layers.%d.feed_forward.w3.weight", i); layer.w3 = llama_get_model_tensor(input, name); } diff --git a/ggml.c b/ggml.c index 1e2ba6de04a91..52efe0d0f8c65 100644 --- a/ggml.c +++ b/ggml.c @@ -10815,7 +10815,7 @@ static void ggml_compute_forward_out_prod_q_f32( float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + 
i13*nb13)); float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - + dequantize_row_q(s0, wdata, ne0); ggml_vec_mad_f32(ne0, d, wdata, *s1); } From f80e245d7b5e9939792b7e4700b8a8bd5e760b48 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 16 Aug 2023 22:06:20 +0200 Subject: [PATCH 079/235] add lora finetune support on quantized base model tensors --- examples/finetune/finetune.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 2e368fd9bcced..c5f4a46658dbe 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1143,7 +1143,16 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * t08 = ggml_mul_mat (ctx, wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch); struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); struct ggml_tensor * t10 = ggml_rope_inplace (ctx, t09, n_past, n_rot, rope_mode, n_ctx); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t11 = ggml_mul_mat (ctx, t04, wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd); + + struct ggml_tensor * t11; + if (ggml_is_quantized(wv->type)) { + struct ggml_tensor * t11_1 = ggml_mul_mat (ctx, wv, t04); set_name(t11_1, "t11_1"); assert_shape_2d(t11_1, n_embd, N*n_batch); + struct ggml_tensor * t11_2 = ggml_transpose(ctx, t11_1); set_name(t11_2, "t11_2"); assert_shape_2d(t11_2, N*n_batch, n_embd); + t11 = ggml_cont (ctx, t11_2); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd); + } else { + t11 = ggml_mul_mat (ctx, t04, wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd); + } + struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); From 9198b24e4e9aa193c1ffb35121dbd1fb44ca5a55 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 16 Aug 2023 23:50:46 +0200 Subject: [PATCH 080/235] add ggml_add_cast API function this function works like ggml_add, but accepts a data type for the resulting tensor. only supported for quantized src0 input. 
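
a minimal usage sketch (illustrative only, not part of this patch; it
mirrors the add_to_f32 helper that a later patch in this series adds to
the finetune example, and the helper name here is made up):

    // combine a possibly-quantized base weight with its f32 lora delta.
    // ggml_mul_mat of the two f32 lora factors yields an f32 delta;
    // ggml_add_cast then keeps the sum in f32 instead of re-quantizing
    // it, which preserves gradient quality when the base is quantized.
    // assumes ggml.h with the ggml_add_cast declaration from this patch.
    static struct ggml_tensor * add_lora_delta(
            struct ggml_context * ctx,
            struct ggml_tensor  * base,    // e.g. layer.wq, may be quantized
            struct ggml_tensor  * lora_a,  // f32 lora factor A
            struct ggml_tensor  * lora_b)  // f32 lora factor B
    {
        struct ggml_tensor * delta = ggml_mul_mat(ctx, lora_a, lora_b);
        if (ggml_is_quantized(base->type)) {
            // ggml_add_cast currently only supports quantized src0
            return ggml_add_cast(ctx, base, delta, GGML_TYPE_F32);
        }
        GGML_ASSERT(base->type == GGML_TYPE_F32);
        return ggml_add(ctx, base, delta);
    }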
--- ggml.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- ggml.h | 6 ++++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 52efe0d0f8c65..5e3f4c199cbc7 100644 --- a/ggml.c +++ b/ggml.c @@ -5115,6 +5115,44 @@ struct ggml_tensor * ggml_add_inplace( return ggml_add_impl(ctx, a, b, true); } +// ggml_add_cast + +static struct ggml_tensor * ggml_add_cast_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type) { + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); + GGML_ASSERT(ggml_is_quantized(a->type)); // currently only supported for quantized input + + bool is_node = false; + + if (a->grad || b->grad) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne); + + result->op = GGML_OP_ADD; + result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_add_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type) { + return ggml_add_impl(ctx, a, b, false); +} + // ggml_add1 static struct ggml_tensor * ggml_add1_impl( @@ -8317,8 +8355,9 @@ static void ggml_compute_forward_add_q_f32( const int nth = params->nth; const enum ggml_type type = src0->type; + const enum ggml_type dtype = dst->type; ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; - ggml_from_float_t const quantize_row_q = type_traits[type].from_float; + ggml_from_float_t const quantize_row_q = type_traits[dtype].from_float; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); @@ -8368,7 +8407,11 @@ static void ggml_compute_forward_add_q_f32( // add src1 ggml_vec_acc_f32(ne00, wdata, src1_row); // quantize row to dst - quantize_row_q(wdata, dst_row, ne00); + if (quantize_row_q != NULL) { + quantize_row_q(wdata, dst_row, ne00); + } else { + memcpy(dst_row, wdata, ne0*nb0); + } } } diff --git a/ggml.h b/ggml.h index cc43afdde915c..02db9ad2b91d5 100644 --- a/ggml.h +++ b/ggml.h @@ -670,6 +670,12 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_add_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type); + GGML_API struct ggml_tensor * ggml_add1( struct ggml_context * ctx, struct ggml_tensor * a, From 714fec06ee8d456d8cd28944cdbfda614eec2984 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 16 Aug 2023 23:53:12 +0200 Subject: [PATCH 081/235] use ggml_add_cast in finetuning lora-applied weights will now have data type F32, which improves gradients when finetuning quantized base models --- examples/finetune/finetune.cpp | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index c5f4a46658dbe..d942f159d759b 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1099,10 +1099,19 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); + auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { + if (ggml_is_quantized(a->type)) { + // todo make sure that ggml-alloc.c cannot make it 
inplace (of tensor a) + return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); + } else { + GGML_ASSERT(a->type == GGML_TYPE_F32); + return ggml_add(ctx, a, b); + } + }; - struct ggml_tensor * tok_embeddings = ggml_add(ctx, model->tok_embeddings, ggml_mul_mat(ctx, lora->tok_embeddings_a, lora->tok_embeddings_b)); - struct ggml_tensor * norm = ggml_add(ctx, model->norm, ggml_mul_mat(ctx, lora->norm_a, lora->norm_b)); - struct ggml_tensor * output = ggml_add(ctx, model->output, ggml_mul_mat(ctx, lora->output_a, lora->output_b)); + struct ggml_tensor * tok_embeddings = add_to_f32(ctx, model->tok_embeddings, ggml_mul_mat(ctx, lora->tok_embeddings_a, lora->tok_embeddings_b)); + struct ggml_tensor * norm = add_to_f32(ctx, model->norm, ggml_mul_mat(ctx, lora->norm_a, lora->norm_b)); + struct ggml_tensor * output = add_to_f32(ctx, model->output, ggml_mul_mat(ctx, lora->output_a, lora->output_b)); struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch); struct ggml_tensor * t01 = ggml_get_rows(ctx, tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch); @@ -1124,15 +1133,15 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( struct my_llama_layer & layer = model->layers[il]; struct my_llama_lora_layer & llayer = lora->layers[il]; - struct ggml_tensor * attention_norm = ggml_add(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b)); - struct ggml_tensor * ffn_norm = ggml_add(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b)); - struct ggml_tensor * wq = ggml_add(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); - struct ggml_tensor * wk = ggml_add(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); - struct ggml_tensor * wv = ggml_add(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); - struct ggml_tensor * wo = ggml_add(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); - struct ggml_tensor * w1 = ggml_add(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b)); - struct ggml_tensor * w2 = ggml_add(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b)); - struct ggml_tensor * w3 = ggml_add(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b)); + struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b)); + struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b)); + struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); + struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); + struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); + struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); + struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b)); + struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b)); + struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b)); struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); From 
0bb897c82af4f5b1250970ca4e73b26b1ff99335 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 17 Aug 2023 23:48:30 +0200 Subject: [PATCH 082/235] bug fix: actually use result type passed to ggml_add_cast --- ggml.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 5e3f4c199cbc7..f9f523f368a34 100644 --- a/ggml.c +++ b/ggml.c @@ -5150,7 +5150,7 @@ struct ggml_tensor * ggml_add_cast( struct ggml_tensor * a, struct ggml_tensor * b, enum ggml_type type) { - return ggml_add_impl(ctx, a, b, false); + return ggml_add_cast_impl(ctx, a, b, type); } // ggml_add1 @@ -8369,7 +8369,6 @@ static void ggml_compute_forward_add_q_f32( GGML_ASSERT(nb2 <= nb3); GGML_ASSERT(ggml_is_quantized(src0->type)); - GGML_ASSERT(dst->type == src0->type); GGML_ASSERT(src1->type == GGML_TYPE_F32); // rows per thread From 44526cb261003dc7ec3b0d2ceda5dc056794cc84 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 18 Aug 2023 15:03:17 +0200 Subject: [PATCH 083/235] make sure base model tensors data cannot be used in viewable operations memory allocator would try to make lora application inplace on base model tensors. since those are memory mapped this will result in memory access violations --- examples/finetune/finetune.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index d942f159d759b..529ab5e8c1230 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1224,6 +1224,24 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( // output tensors ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + + // make sure base model tensors data cannot be used in viewable operations + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one)); + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one)); + } + // gradient tensors (will be set to zero by ggml_graph_reset) for (int i = 0; i < gf->n_nodes; ++i) { if (!gf->grads[i]) continue; From a252111b45d5b5080ab6d578ae81a3d8bbc5c1bc Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 18 Aug 2023 15:03:57 +0200 Subject: [PATCH 084/235] fix bug in ggml_out_prod which resulted in wrong n_dims of result tensors --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index f9f523f368a34..c1fc3a7f21fc8 100644 --- a/ggml.c +++ b/ggml.c @@ -5928,7 +5928,7 @@ struct ggml_tensor * ggml_out_prod( } const int64_t ne[4] = { a->ne[0], b->ne[0], a->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); 
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); result->op = GGML_OP_OUT_PROD; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; From f358204a5f271c6c84934b654688ede02ea9aad8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 18 Aug 2023 16:01:43 +0200 Subject: [PATCH 085/235] avoid keeping in memory ALL of the gradients The problem here stems from ggml_graph_reset. This function is called in the optimization function, before each graph computation, to reset the gradients to zero. This required a unique memory slot for each gradient: allocating memory from a previously freed memory location might lead to non-zero input gradients. During ggml_compute_backward the gradients are built stepwise by adding or subtracting new values, starting from an OP_NONE tensor which needs to contain zero values. This requires the graph reset. To avoid this I now remember in ggml_build_backward_expand the original OP_NONE gradient tensors in a hash table, which is passed to ggml_compute_backward. There, instead of using add (or sub or similar), I test whether the existing gradient to be changed is a zero-valued tensor by looking up its existence in the hash table. When it is such a zero-tensor it will not be modified, but replaced by the value to be added; otherwise the regular add (not inplace, the allocator will take care of this) will be used. This way none of those zero-tensor values will be necessary in the final backward graph and, more importantly, they won't need a unique memory slot just to make them zero. --- examples/finetune/finetune.cpp | 8 - ggml.c | 305 ++++++++++++++++++++------------- 2 files changed, 187 insertions(+), 126 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 529ab5e8c1230..c089056be5524 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1242,14 +1242,6 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one)); } - // gradient tensors (will be set to zero by ggml_graph_reset) - for (int i = 0; i < gf->n_nodes; ++i) { - if (!gf->grads[i]) continue; - if (gf->grads[i]->data == NULL && !ggml_is_view(gf->grads[i])) { - ggml_allocr_alloc(alloc, gf->grads[i]); - } - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, gf->grads[i], one)); - } for (int i = 0; i < checkpoints.size(); ++i) { if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { ggml_allocr_alloc(alloc, checkpoints[i]); diff --git a/ggml.c b/ggml.c index c1fc3a7f21fc8..1c9349fd833dd 100644 --- a/ggml.c +++ b/ggml.c @@ -15009,7 +15009,89 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm //////////////////////////////////////////////////////////////////////////////// -static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { +static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small"); + +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} + +static bool hash_insert(void * hash_table[], void * p) { + size_t h = hash(p); + + // linear probing + size_t i = h; + while (hash_table[i] != NULL && hash_table[i] != p) { + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // hash table is full + GGML_ASSERT(false); + } + } + + if (hash_table[i] == p) { + return true; + } + + // insert + hash_table[i] = p; + return false; +} + +static bool hash_contains(void * 
hash_table[], void * p) { + size_t h = hash(p); + + // linear probing + size_t i = h; + while (hash_table[i] != NULL && hash_table[i] != p) { + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // hash table is full + return false; + } + } + + if (hash_table[i] == p) { + return true; + } + + return false; +} + +// functions to change gradients considering the case that input a might be initial gradient with zero value + +static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return b; + } else { + return ggml_add_impl(ctx, a, b, false); + } +} + +static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return b; + } else { + return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); + } +} + +static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return ggml_repeat(ctx, b, a); + } else { + return ggml_add1_impl(ctx, a, b, false); + } +} + +static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) { + if (hash_contains(zero_table, a)) { + return ggml_neg(ctx, b); + } else { + return ggml_sub_impl(ctx, a, b, false); + } +} + +static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace, void * zero_table[]) { struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; @@ -15017,34 +15099,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_DUP: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } } break; case GGML_OP_ADD: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { - src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace); + src1->grad = ggml_add_or_set(ctx, src1->grad, tensor->grad, zero_table); } } break; case GGML_OP_ADD1: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { - src1->grad = ggml_add_impl(ctx, + src1->grad = ggml_add_or_set(ctx, src1->grad, ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean - inplace); + zero_table); } } break; case GGML_OP_ACC: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { const size_t nb1 = ((int32_t *) tensor->op_params)[0]; @@ -15061,117 +15143,117 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor nb1, nb2, nb3, offset); src1->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src1->grad, ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1->grad), - inplace); + zero_table); } } break; case GGML_OP_SUB: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, 
src0->grad, tensor->grad, zero_table); } if (src1->grad) { - src1->grad = ggml_sub_impl(ctx, src1->grad, tensor->grad, inplace); + src1->grad = ggml_sub_or_set(ctx, src1->grad, tensor->grad, zero_table); } } break; case GGML_OP_MUL: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_mul(ctx, src1, tensor->grad), - inplace); + zero_table); } if (src1->grad) { src1->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src1->grad, ggml_mul(ctx, src0, tensor->grad), - inplace); + zero_table); } } break; case GGML_OP_DIV: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_div(ctx, tensor->grad, src1), - inplace); + zero_table); } if (src1->grad) { src1->grad = - ggml_sub_impl(ctx, + ggml_sub_or_set(ctx, src1->grad, ggml_mul(ctx, tensor->grad, ggml_div(ctx, tensor, src1)), - inplace); + zero_table); } } break; case GGML_OP_SQR: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_scale(ctx, ggml_mul(ctx, src0, tensor->grad), ggml_new_f32(ctx, 2.0f)), - inplace); + zero_table); } } break; case GGML_OP_SQRT: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_scale(ctx, ggml_div(ctx, tensor->grad, tensor), ggml_new_f32(ctx, 0.5f)), - inplace); + zero_table); } } break; case GGML_OP_LOG: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_div(ctx, tensor->grad, src0), - inplace); + zero_table); } } break; case GGML_OP_SUM: { if (src0->grad) { src0->grad = - ggml_add1_impl(ctx, + ggml_add1_or_set(ctx, src0->grad, tensor->grad, - inplace); + zero_table); } } break; case GGML_OP_SUM_ROWS: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_repeat(ctx, tensor->grad, src0->grad), - inplace); + zero_table); } } break; case GGML_OP_MEAN: @@ -15183,20 +15265,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_repeat_back(ctx, tensor->grad, src0->grad), - inplace); + zero_table); } } break; case GGML_OP_REPEAT_BACK: { if (src0->grad) { // TODO: test this - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_repeat(ctx, tensor->grad, src0->grad), - inplace); + zero_table); } } break; case GGML_OP_SILU_BACK: @@ -15214,10 +15296,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor float eps; memcpy(&eps, tensor->op_params, sizeof(float)); - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_rms_norm_back(ctx, src0, tensor->grad, eps), - inplace); + zero_table); } } break; case GGML_OP_RMS_NORM_BACK: @@ -15244,16 +15326,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_out_prod(ctx, // [n,m] src1, // [n,p] tensor->grad), // [m,p] - inplace); + zero_table); } if (src1->grad) { src1->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src1->grad, // ggml_mul_mat(ctx, // [n,p] // ggml_cont(ctx, // [m,n] @@ -15267,7 +15349,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0, // [n,m] ggml_transpose(ctx, // [p,m] tensor->grad)), // [m,p] - inplace); + zero_table); } } break; case GGML_OP_OUT_PROD: @@ -15279,17 +15361,17 @@ 
static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_scale_impl(ctx, tensor->grad, src1, false), - inplace); + zero_table); } if (src1->grad) { src1->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src1->grad, ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)), - inplace); + zero_table); } } break; case GGML_OP_SET: @@ -15316,23 +15398,23 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_acc_impl(ctx, tensor->grad, ggml_neg(ctx, tensor_grad_view), nb1, nb2, nb3, offset, false), - inplace); + zero_table); } if (src1->grad) { src1->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src1->grad, ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1->grad), - inplace); + zero_table); } } break; case GGML_OP_CPY: @@ -15343,7 +15425,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // tensor = src0 * 1 + src1 * 0 if (src0->grad) { // dsrc0 = dtensor * 1 - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } if (src1->grad) { // dsrc1 = dtensor * 0 -> noop @@ -15355,7 +15437,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { GGML_ASSERT(ggml_is_contiguous(src0->grad)); GGML_ASSERT(ggml_is_contiguous(tensor->grad)); - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); } } break; case GGML_OP_RESHAPE: @@ -15363,9 +15445,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { src0->grad = - ggml_add_impl(ctx, src0->grad, - ggml_reshape(ctx, tensor->grad, src0->grad), - inplace); + ggml_add_or_set(ctx, src0->grad, + ggml_reshape(ctx, + ggml_is_contiguous(tensor->grad) + ? 
tensor->grad + : ggml_cont(ctx, tensor->grad), + src0->grad), + zero_table); } } break; case GGML_OP_VIEW: @@ -15394,7 +15480,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor nb3 = (nb3 / n0) * ng; } - src0->grad = ggml_acc_impl(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, inplace); + src0->grad = ggml_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table); } } break; case GGML_OP_PERMUTE: @@ -15412,14 +15498,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor axes_backward[axis2] = 2; axes_backward[axis3] = 3; src0->grad = - ggml_add_impl(ctx, src0->grad, + ggml_add_or_set(ctx, src0->grad, ggml_permute(ctx, tensor->grad, axes_backward[0], axes_backward[1], axes_backward[2], axes_backward[3]), - inplace); + zero_table); } } break; case GGML_OP_TRANSPOSE: @@ -15427,9 +15513,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { src0->grad = - ggml_add_impl(ctx, src0->grad, + ggml_add_or_set(ctx, src0->grad, ggml_transpose(ctx, tensor->grad), - inplace); + zero_table); } } break; case GGML_OP_GET_ROWS: @@ -15437,9 +15523,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama (only for tokenizer) if (src0->grad) { src0->grad = - ggml_add_impl(ctx, src0->grad, + ggml_add_or_set(ctx, src0->grad, ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad), - inplace); + zero_table); } if (src1->grad) { // noop @@ -15459,9 +15545,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = - ggml_add_impl(ctx, src0->grad, + ggml_add_or_set(ctx, src0->grad, ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), - inplace); + zero_table); } } break; case GGML_OP_DIAG_MASK_ZERO: @@ -15470,9 +15556,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = - ggml_add_impl(ctx, src0->grad, + ggml_add_or_set(ctx, src0->grad, ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), - inplace); + zero_table); } } break; case GGML_OP_SOFT_MAX: @@ -15480,9 +15566,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { src0->grad = - ggml_add_impl(ctx, src0->grad, + ggml_add_or_set(ctx, src0->grad, ggml_soft_max_back(ctx, tensor->grad, tensor), - inplace); + zero_table); } } break; @@ -15498,7 +15584,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; const int n_ctx = ((int32_t *) tensor->op_params)[3]; - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_rope_back(ctx, tensor->grad, @@ -15506,7 +15592,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor n_dims, mode, n_ctx), - inplace); + zero_table); } } break; case GGML_OP_ROPE_BACK: @@ -15516,7 +15602,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; const int n_ctx = ((int32_t *) tensor->op_params)[3]; - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_rope(ctx, 
tensor->grad, @@ -15524,7 +15610,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor n_dims, mode, n_ctx), - inplace); + zero_table); } } break; case GGML_OP_ALIBI: @@ -15607,10 +15693,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; } - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, grad_q, - inplace); + zero_table); } if (src1->grad) { @@ -15653,10 +15739,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; } - src1->grad = ggml_add_impl(ctx, + src1->grad = ggml_add_or_set(ctx, src1->grad, grad_k, - inplace); + zero_table); } struct ggml_tensor * opt0 = tensor->src[2]; @@ -15702,10 +15788,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; } - opt0->grad = ggml_add_impl(ctx, + opt0->grad = ggml_add_or_set(ctx, opt0->grad, grad_v, - inplace); + zero_table); } } break; case GGML_OP_FLASH_FF: @@ -15725,12 +15811,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_add_or_set(ctx, src0->grad, ggml_mul(ctx, ggml_sgn(ctx, src0), tensor->grad), - inplace); + zero_table); } } break; case GGML_UNARY_OP_SGN: @@ -15742,7 +15828,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_UNARY_OP_NEG: { if (src0->grad) { - src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_sub_or_set(ctx, src0->grad, tensor->grad, zero_table); } } break; case GGML_UNARY_OP_STEP: @@ -15762,12 +15848,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_UNARY_OP_RELU: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_mul(ctx, ggml_step(ctx, src0), tensor->grad), - inplace); + zero_table); } } break; case GGML_UNARY_OP_GELU: @@ -15782,10 +15868,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_silu_back(ctx, src0, tensor->grad), - inplace); + zero_table); } } break; default: @@ -15803,13 +15889,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_CROSS_ENTROPY_LOSS: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_cross_entropy_loss_back(ctx, src0, src1, tensor->grad), - inplace); + zero_table); } } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: @@ -15827,34 +15913,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } } -static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small"); - -static size_t hash(void * p) { - return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; -} - -static bool hash_insert(void * hash_table[], void * p) { - size_t h = hash(p); - - // linear probing - size_t i = h; - while (hash_table[i] != NULL && hash_table[i] != p) { - i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; - if (i == h) { - // hash table is full - GGML_ASSERT(false); - } - } - - if (hash_table[i] == p) { - return true; - } - - // insert - hash_table[i] = p; - return false; -} - static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { if (node->grad == NULL) { // this usually happens when we generate intermediate nodes from 
constants in the backward pass @@ -15955,12 +16013,21 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * } } + // remember original gradients which start with zero values + void ** zero_table = malloc(sizeof(void *) * GGML_GRAPH_HASHTABLE_SIZE); + memset(zero_table, 0, sizeof(void*) * GGML_GRAPH_HASHTABLE_SIZE); + for (int i = 0; i < gf->n_nodes; i++) { + if (gf->grads[i]) { + hash_insert(zero_table, gf->grads[i]); + } + } + for (int i = gf->n_nodes - 1; i >= 0; i--) { struct ggml_tensor * node = gf->nodes[i]; // because we detached the grad nodes from the original graph, we can afford inplace operations if (node->grad) { - ggml_compute_backward(ctx, node, keep); + ggml_compute_backward(ctx, node, keep, zero_table); } } @@ -15972,6 +16039,8 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * ggml_build_forward_expand(gb, node->grad); } } + + free(zero_table); } struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { @@ -17574,7 +17643,7 @@ static enum ggml_opt_result ggml_opt_adam( } // compute the function value - ggml_graph_reset (gf); + // ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); @@ -17668,7 +17737,7 @@ static enum ggml_opt_result ggml_opt_adam( callback(callback_data, &sched); } - ggml_graph_reset (gf); + // ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); ggml_graph_compute(gb, &cplan); @@ -17806,7 +17875,7 @@ static enum ggml_opt_result linesearch_backtracking( { ggml_opt_set_params(np, ps, x); - ggml_graph_reset (gf); + //ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); ggml_graph_compute(gb, cplan); @@ -17938,7 +18007,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( { ggml_opt_set_params(np, ps, x); - ggml_graph_reset (gf); + //ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); ggml_graph_compute(gb, &cplan); From 011f47f9724f6fba8d15235484f33d186739fe89 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 18 Aug 2023 16:02:46 +0200 Subject: [PATCH 086/235] remove trailing whitespace --- ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 1c9349fd833dd..b0e1a376cf1aa 100644 --- a/ggml.c +++ b/ggml.c @@ -15446,9 +15446,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { src0->grad = ggml_add_or_set(ctx, src0->grad, - ggml_reshape(ctx, + ggml_reshape(ctx, ggml_is_contiguous(tensor->grad) - ? tensor->grad + ? 
tensor->grad : ggml_cont(ctx, tensor->grad), src0->grad), zero_table); From a0c2752ba7c3ad4222d82f8ff4e9d2c5eede554d Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 18 Aug 2023 16:24:13 +0200 Subject: [PATCH 087/235] remove debug prints and function to compute tensor data hash --- ggml-alloc.c | 25 +------------------------ ggml.c | 51 --------------------------------------------------- 2 files changed, 1 insertion(+), 75 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 438db4537579e..ddf973daec7e4 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -162,22 +162,12 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) printf("\n"); } #endif - if ((char*)addr - (char*)alloc->data + size > alloc->max_size) { - printf("%s: op=%s name=%s max_size=%zu\n", __func__, ggml_op_name(tensor->op), ggml_get_name(tensor), (char*)addr - (char*)alloc->data + size); - } + alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size); } // this is a very naive implementation, but for our case the number of free blocks should be very small static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { - // static int counter = 0; - // counter++; - // if (counter > 2) { - // printf("%s: counter=%d OMIT\n", __func__, counter); - // return; - // } else { - // printf("%s: counter=%d\n", __func__, counter); - // } void * ptr = tensor->data; if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) { @@ -189,7 +179,6 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size = aligned_offset(NULL, size, alloc->alignment); - // printf("%s: free data=[%p..%p] op=%s name=%s n_free_blocks=%d\n", __func__, tensor->data, (char*) tensor->data + size, ggml_op_name(tensor->op), ggml_get_name(tensor), alloc->n_free_blocks); AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks); #ifdef GGML_ALLOCATOR_DEBUG @@ -489,23 +478,11 @@ static size_t ggml_allocator_alloc_graph_tensors_n( if (parent == NULL) { break; } - bool was_null = parent->data == NULL; allocate_node(alloc, parent); - // if (was_null) { - // printf("%s: alloc n[%02d] %d data=[%p..%p] %s %s\n", __func__, i, j, parent->data, (char*) parent->data + ggml_nbytes(parent), ggml_op_name(parent->op), ggml_get_name(parent)); - // } else { - // printf("%s: exist n[%02d] %d data=[%p..%p] %s %s\n", __func__, i, j, parent->data, (char*) parent->data + ggml_nbytes(parent), ggml_op_name(parent->op), ggml_get_name(parent)); - // } } // allocate node - bool was_null = node->data == NULL; allocate_node(alloc, node); - // if (was_null) { - // printf("%s: alloc node[%02d] data=[%p..%p] %s %s\n", __func__, i, node->data, (char*) node->data + ggml_nbytes(node), ggml_op_name(node->op), ggml_get_name(node)); - // } else { - // printf("%s: exist node[%02d] data=[%p..%p] %s %s\n", __func__, i, node->data, (char*) node->data + ggml_nbytes(node), ggml_op_name(node->op), ggml_get_name(node)); - // } AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); for (int j = 0; j < GGML_MAX_SRC; j++) { diff --git a/ggml.c b/ggml.c index b0e1a376cf1aa..90b610721df83 100644 --- a/ggml.c +++ b/ggml.c @@ -17548,48 +17548,6 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g // ref: https://arxiv.org/pdf/1412.6980.pdf // -uint32_t compute_data_checksum(struct ggml_tensor * tensor) { - const int n3 = 
(tensor->n_dims >= 3) ? tensor->ne[3] : 1; - const int n2 = (tensor->n_dims >= 2) ? tensor->ne[2] : 1; - const int n1 = (tensor->n_dims >= 1) ? tensor->ne[1] : 1; - const int n0 = (tensor->n_dims >= 0) ? tensor->ne[0] : 1; - const size_t nb0 = tensor->nb[0]; - const size_t nb1 = tensor->nb[1]; - const size_t nb2 = tensor->nb[2]; - const size_t nb3 = tensor->nb[3]; - const size_t nb = ggml_element_size(tensor); - uint32_t result = 0; - for (int i3 = 0; i3 < n3; ++i3) { - for (int i2 = 0; i2 < n2; ++i2) { - for (int i1 = 0; i1 < n1; ++i1) { - for (int i0 = 0; i0 < n0; ++i0) { - char * ptr = ((char *) tensor->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - uint32_t val; - memcpy(&val, ptr, nb); - result = result ^ val; - result = (((result << 1u) | ((result >> 31u) & 0x1u)) + 1u) & 0xffffffffu; - } - } - } - } - return result; -} - -void print_data_checksums(struct ggml_cgraph * g) { - for (int i = 0; i < g->n_nodes; ++i) { - struct ggml_tensor * node = g->nodes[i]; - for (int j = 0; j < GGML_MAX_SRC; ++j) { - if (node->src[j]) { - struct ggml_tensor * src = node->src[j]; - uint32_t chk = compute_data_checksum(src); - printf("%s: node[%3d]->src[%d] chk=[%08x] data=[%p] op=%s name=%s\n", __func__, i, j, chk, src->data, ggml_op_name(src->op), ggml_get_name(src)); - } - } - uint32_t chk = compute_data_checksum(node); - printf("%s: node[%3d] chk=[%08x] data=[%p] op=%s name=%s\n", __func__, i, chk, node->data, ggml_op_name(node->op), ggml_get_name(node)); - } -} - static enum ggml_opt_result ggml_opt_adam( struct ggml_context * ctx, struct ggml_opt_context * opt, @@ -17651,8 +17609,6 @@ static enum ggml_opt_result ggml_opt_adam( cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; ggml_graph_compute(gb, &cplan); - print_data_checksums(gb); - opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; if (pf) { @@ -17714,8 +17670,6 @@ static enum ggml_opt_result ggml_opt_adam( const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); int64_t i = 0; for (int p = 0; p < np; ++p) { - printf("%s: para[%3d] chk=[%08x] op=%s name=%s\n", __func__, p, compute_data_checksum(ps[p]), ggml_op_name(ps[p]->op), ggml_get_name(ps[p])); - printf("%s: para[%3d]->grad chk=[%08x] op=%s name=%s\n", __func__, p, compute_data_checksum(ps[p]->grad), ggml_op_name(ps[p]->grad->op), ggml_get_name(ps[p]->grad)); const int64_t ne = ggml_nelements(ps[p]); const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? 
decay : 0.0) * sched; for (int64_t j = 0; j < ne; ++j) { @@ -17794,11 +17748,6 @@ static enum ggml_opt_result ggml_opt_adam( } } - print_data_checksums(gb); - for (int p = 0; p < np; ++p) { - printf("%s: para[%3d] chk=[%08x] op=%s name=%s\n", __func__, p, compute_data_checksum(ps[p]), ggml_op_name(ps[p]->op), ggml_get_name(ps[p])); - printf("%s: para[%3d]->grad chk=[%08x] op=%s name=%s\n", __func__, p, compute_data_checksum(ps[p]->grad), ggml_op_name(ps[p]->grad->op), ggml_get_name(ps[p]->grad)); - } return GGML_OPT_DID_NOT_CONVERGE; } From 113c90f1cca8dadcf500caced8ce1d4a4a4d2cef Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 18 Aug 2023 16:24:42 +0200 Subject: [PATCH 088/235] improve optimization iteration prints --- examples/finetune/finetune.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index c089056be5524..588f1f3bbe9e5 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -2639,6 +2639,7 @@ void opt_callback(void * vdata, float * sched) { *sched = min_sched + *sched * (1.0f - min_sched); int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); + if (impr_plot > 0) impr_plot = 0; printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); if (data->shuffle_countdown < n_batch) { From 7a63d429afe96709194ff398ec8a1b280584c736 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 18 Aug 2023 17:32:31 +0200 Subject: [PATCH 089/235] adjust maximal values to support finetuning 3B models --- ggml-alloc.c | 2 +- ggml.h | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index ddf973daec7e4..16f5a9e428f3d 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -56,7 +56,7 @@ struct free_block { size_t size; }; -#define MAX_FREE_BLOCKS 128 +#define MAX_FREE_BLOCKS 256 struct ggml_allocr { void * data; diff --git a/ggml.h b/ggml.h index 02db9ad2b91d5..1400ced0c61ce 100644 --- a/ggml.h +++ b/ggml.h @@ -194,8 +194,8 @@ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this #define GGML_MAX_DIMS 4 -#define GGML_MAX_NODES 4096 -#define GGML_MAX_PARAMS 256 +#define GGML_MAX_NODES 16384 +#define GGML_MAX_PARAMS 1024 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 #define GGML_MAX_NAME 48 @@ -475,7 +475,9 @@ extern "C" { // next prime after GGML_MAX_NODES // #define GGML_GRAPH_HASHTABLE_SIZE 4099 // next prime after GGML_MAX_NODES * 2 (nodes + leafs) - #define GGML_GRAPH_HASHTABLE_SIZE 8273 + // #define GGML_GRAPH_HASHTABLE_SIZE 8273 + // #define GGML_GRAPH_HASHTABLE_SIZE 16411 + #define GGML_GRAPH_HASHTABLE_SIZE 32771 // computation graph struct ggml_cgraph { From 63cb374a990fd9926396667f01e18a4101c2c4cb Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 18 Aug 2023 19:08:15 +0200 Subject: [PATCH 090/235] change default finetune params lora_r and lora_alpha to match the n_rank parameters of 4 --- examples/finetune/finetune.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 588f1f3bbe9e5..375c8e4ba6ee2 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -2162,8 +2162,8 @@ struct train_params get_default_train_params() { params.n_examples = 1; params.n_predict = 1024; - params.lora_alpha = 100; - params.lora_r = 100; + params.lora_alpha = 4; + params.lora_r = 4; params.n_rank_attention_norm = 1; params.n_rank_wq = 
4; From 6c9864003576d86cddbda8d938fe3c7d9622ff0a Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 18 Aug 2023 20:10:04 +0200 Subject: [PATCH 091/235] bug fix: make sure finetune input gradient is allocated at begin and kept until end --- examples/finetune/finetune.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 375c8e4ba6ee2..23d9b2bfd5ca4 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1224,6 +1224,10 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( // output tensors ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + // input gradient + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); + GGML_ASSERT(t36->grad->data == NULL && !ggml_is_view(t36->grad)); + ggml_allocr_alloc(alloc, t36->grad); // make sure base model tensors data cannot be used in viewable operations ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one)); From 65b0561637511d3565c4b6f3d48dac4e28690241 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 18 Aug 2023 20:25:03 +0200 Subject: [PATCH 092/235] remove unnecessary src tensor from ggml_get_rows_back we don't need data of src[2] for computation, only to setup the correct output shape. remove dependency on src[2], so that allocator can work more freely. the computational graph is still completely determined, because the output shape is naturally included. this is similar to how ggml_reshape does it. --- ggml.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/ggml.c b/ggml.c index 90b610721df83..24f06b4ca14fd 100644 --- a/ggml.c +++ b/ggml.c @@ -6564,7 +6564,9 @@ struct ggml_tensor * ggml_get_rows_back( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = c; + // we don't need data of c for computation, only to setup the correct output shape. + // break dependency on c, so that allocator can work more freely. 
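+ // (the result tensor's shape is already fixed when it is created, so the
+ // computational graph stays fully determined without recording c as a src)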
+ //result->src[2] = c; return result; } @@ -11282,14 +11284,15 @@ static void ggml_compute_forward_get_rows_back_f32_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_are_same_shape(opt0, dst)); - GGML_ASSERT(ggml_is_contiguous(opt0)); GGML_ASSERT(ggml_is_contiguous(dst)); - ggml_compute_forward_dup_same_cont(params, opt0, dst); + // ggml_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + } if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -11315,11 +11318,8 @@ static void ggml_compute_forward_get_rows_back_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_are_same_shape(opt0, dst)); - GGML_ASSERT(ggml_is_contiguous(opt0)); GGML_ASSERT(ggml_is_contiguous(dst)); // ggml_compute_forward_dup_same_cont(params, opt0, dst); @@ -11353,16 +11353,15 @@ static void ggml_compute_forward_get_rows_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, opt0, dst); + ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_get_rows_back_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst); } break; default: { @@ -14867,7 +14866,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_GET_ROWS_BACK: { - ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_DIAG: { @@ -15524,6 +15523,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { src0->grad = ggml_add_or_set(ctx, src0->grad, + // last ggml_get_rows_back argument src0->grad is only + // necessary to setup correct output shape ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad), zero_table); } From 3e47890760e079f4857343d5f7f13b1c6a014bb9 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 18 Aug 2023 20:51:00 +0200 Subject: [PATCH 093/235] remove unnecessary src tensor from ggml_repeat & ggml_repeat_back we don't need data of src[1] for computation, only to setup the correct output shape. remove dependency on src[1], so that allocator can work more freely. the computational graph is still completely determined, because the output shape is naturally included --- ggml.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml.c b/ggml.c index 24f06b4ca14fd..930dde1f85698 100644 --- a/ggml.c +++ b/ggml.c @@ -5593,7 +5593,6 @@ struct ggml_tensor * ggml_repeat( result->op = GGML_OP_REPEAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -5621,7 +5620,6 @@ struct ggml_tensor * ggml_repeat_back( result->op = GGML_OP_REPEAT_BACK; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } From 37dfb544aa044ce4c0505ef9fd3b0d8c5e389891 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 18 Aug 2023 21:22:41 +0200 Subject: [PATCH 094/235] resolve todo allocator will only make it inplace when they are of the same type --- examples/finetune/finetune.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 23d9b2bfd5ca4..67c2aef34623e 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1101,7 +1101,6 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { if (ggml_is_quantized(a->type)) { - // todo make sure that ggml-alloc.c cannot make it inplace (of tensor a) return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); } else { GGML_ASSERT(a->type == GGML_TYPE_F32); return ggml_add(ctx, a, b); } }; From d61ed6b4316029b56d1c152ce04ab3bff96951d8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 20 Aug 2023 18:36:20 +0200 Subject: [PATCH 095/235] mixing multiple LORA adapters is now possible pass more than one '--lora FNAME' argument to apply more than one LORA. use '--lora-scaled FNAME S' when you want to specify a user-defined scale for an adapter. --- examples/common.cpp | 26 ++++++++++++++++++++++---- examples/common.h | 4 ++-- examples/server/server.cpp | 18 +++++++++++++++++- llama.cpp | 12 ++++++------ llama.h | 2 ++ 5 files changed, 49 insertions(+), 13 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 21f4a0357d422..73fd16f36d197 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -310,7 +310,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.lora_adapter = argv[i]; + params.lora_adapter.push_back({argv[i], 1.0f}); params.use_mmap = false; + } else if (arg == "--lora-scaled") { + if (++i >= argc) { + invalid_param = true; + break; + } + const char * lora_adapter = argv[i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])}); params.use_mmap = false; } else if (arg == "--lora-base") { if (++i >= argc) { invalid_param = true; break; } @@ -601,6 +613,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stdout, " --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stdout, " -m FNAME, --model FNAME\n"); fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); @@ -677,10 +690,15 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par return std::make_tuple(nullptr, nullptr); } - if (!params.lora_adapter.empty()) { + for (int i = 0; i < params.lora_adapter.size(); ++i) { + const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]); + float lora_scale = std::get<1>(params.lora_adapter[i]); int err = llama_model_apply_lora_from_file(model, - params.lora_adapter.c_str(), - params.lora_base.empty() ? NULL : params.lora_base.c_str(), + lora_adapter.c_str(), + lora_scale, + ((i > 0) || params.lora_base.empty()) ? 
NULL + : params.lora_base.c_str(), params.n_threads); if (err != 0) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); diff --git a/examples/common.h b/examples/common.h index 375bc0a3db416..6fa906c3f1e46 100644 --- a/examples/common.h +++ b/examples/common.h @@ -62,8 +62,8 @@ struct gpt_params { std::string grammar = ""; // optional BNF-like grammar to constrain sampling std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted - std::string lora_adapter = ""; // lora adapter path - std::string lora_base = ""; // base model path for the lora adapter + std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale + std::string lora_base = ""; // base model path for the lora adapter bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6f7a66da108c8..5cc5b67b1ee58 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -869,7 +869,23 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } - params.lora_adapter = argv[i]; + params.lora_adapter.push_back({argv[i], 1.0f}); params.use_mmap = false; + } + else if (arg == "--lora-scaled") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + const char * lora_adapter = argv[i]; + if (++i >= argc) + { + invalid_param = true; + break; + } + params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])}); params.use_mmap = false; } else if (arg == "--lora-base") diff --git a/llama.cpp b/llama.cpp index 6af1e003ca40b..33b7836bce13d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3401,7 +3401,7 @@ int llama_model_quantize( } } -int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { +int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads) { fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); const int64_t t_start_lora_us = ggml_time_us(); @@ -3433,7 +3433,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const int32_t lora_alpha; fin.read((char *) &lora_r, sizeof(lora_r)); fin.read((char *) &lora_alpha, sizeof(lora_alpha)); - float scaling = (float)lora_alpha / (float)lora_r; + float scaling = scale * (float)lora_alpha / (float)lora_r; fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); @@ -3682,18 +3682,18 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const return 0; } -int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { +int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads); + return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads); } catch (const std::exception & err) { fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; } } -int llama_model_apply_lora_from_file(const struct llama_model * model, const char * 
path_lora, const char * path_base_model, int n_threads) {
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
diff --git a/llama.h b/llama.h
index bb6c3c107ce39..70df37c8d8efd 100644
--- a/llama.h
+++ b/llama.h
@@ -249,6 +249,7 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
             struct llama_context * ctx,
             const char * path_lora,
+            float scale,
             const char * path_base_model,
             int n_threads),
             "please use llama_model_apply_lora_from_file instead");
@@ -256,6 +257,7 @@ extern "C" {
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
             const char * path_lora,
+            float scale,
             const char * path_base_model,
             int n_threads);

From 27c24ffa1b0fdd6e39c8528b4683433f2874fce2 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 20 Aug 2023 20:16:46 +0200
Subject: [PATCH 096/235] add option to save finetune output every N iterations

---
 examples/finetune/finetune.cpp | 66 +++++++++++++++++++++++++++++-----
 1 file changed, 58 insertions(+), 8 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 67c2aef34623e..ad5631da1b65a 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1856,8 +1856,19 @@ void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struc
     }
 }
 
-void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename) {
-    struct llama_file file(filename, "wb");
+std::string replace_str(const char * s, const char * needle, const char * replacement) {
+    std::string str = s;
+    size_t pos = str.find(needle);
+    if (pos != std::string::npos) {
+        str.replace(pos, strlen(needle), replacement);
+    }
+    return str;
+}
+
+void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename, const char * pattern_it, int iteration) {
+    std::string sit = std::to_string(iteration);
+    std::string fn = replace_str(filename, pattern_it, sit.c_str());
+    struct llama_file file(fn.c_str(), "wb");
     if (file.fp == NULL) {
         return;
     }
@@ -2021,8 +2032,10 @@ bool load_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora,
     return (file.fp != NULL);
 }
 
-void save_as_llama_lora(struct my_llama_lora * lora, const char * filename) {
-    struct llama_file file(filename, "wb");
+void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, const char * pattern_it, int iteration) {
+    std::string sit = std::to_string(iteration);
+    std::string fn = replace_str(filename, pattern_it, sit.c_str());
+    struct llama_file file(fn.c_str(), "wb");
     if (file.fp == NULL) {
         return;
     }
@@ -2088,6 +2101,9 @@ struct train_params {
     const char * fn_checkpoint_in;
     const char * fn_checkpoint_out;
     const char * fn_lora_out;
+    const char * pattern_fn_it;
+
+    int save_every;
 
     uint32_t seed;
 
@@ -2154,8 +2170,11 @@ struct train_params get_default_train_params() {
     params.fn_model_base     = "";
     params.fn_train_data     = "shakespeare.txt";
     params.fn_checkpoint_in  = "checkpoint.bin";
-    params.fn_checkpoint_out = "checkpoint.bin";
-    params.fn_lora_out       = "ggml-lora-f32.bin";
+    params.fn_checkpoint_out = "checkpoint-ITERATION.bin";
+    params.fn_lora_out       = "ggml-lora-ITERATION-f32.bin";
+    params.pattern_fn_it     = "ITERATION";
+
+    params.save_every = 10;
 
     params.seed = -1;
 
@@ -2228,6 +2247,8 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
     fprintf(stderr, "  --checkpoint-in FNAME      path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in);
     fprintf(stderr, "  --checkpoint-out FNAME     path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
     fprintf(stderr, "  --lora-out FNAME           path to save llama lora (default '%s')\n", params->fn_lora_out);
+    fprintf(stderr, "  --pattern-fn-it STR        pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it);
+    fprintf(stderr, "  --save-every N             save checkpoint and lora every N iterations. Disabled when N <= 0. (default '%s')\n", params->save_every);
     fprintf(stderr, "  -s SEED, --seed SEED       RNG seed (default: -1, use random seed for -1)\n");
     fprintf(stderr, "  -c N, --ctx N              Context size used during training (default %d)\n", params->n_ctx);
     fprintf(stderr, "  -t N, --threads N          Number of threads (default %d)\n", params->n_threads);
@@ -2325,6 +2346,18 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
                 break;
             }
             params->fn_lora_out = argv[i];
+        } else if (arg == "--pattern-fn-it") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->pattern_fn_it = argv[i];
+        } else if (arg == "--save-every") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->save_every = std::stoi(argv[i]);
         } else if (arg == "-s" || arg == "--seed") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -2614,6 +2647,9 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
 struct opt_callback_data {
     struct train_params * params;
     struct ggml_opt_context * opt;
+    struct my_llama_model * model;
+    struct my_llama_lora * lora;
+    int last_save_iter;
     llama_token * tokens_data;
     size_t tokens_size;
     int * samples_data;
@@ -2630,6 +2666,17 @@ void opt_callback(void * vdata, float * sched) {
     struct ggml_opt_context * opt = data->opt;
     int n_batch = params->n_batch;
 
+    const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
+    if (save_now) {
+        if (strlen(params->fn_checkpoint_out) > 0) {
+            save_checkpoint(data->model, data->lora, opt, params->fn_checkpoint_out, params->pattern_fn_it, opt->iter);
+        }
+        if (strlen(params->fn_lora_out) > 0) {
+            save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, opt->iter);
+        }
+        data->last_save_iter = opt->iter;
+    }
+
     *sched = (opt->iter < params->warmup)
                 ? (float) opt->iter / (float) params->warmup
                 : cosine_decay_restart(
@@ -2854,6 +2901,9 @@ int main(int argc, char ** argv) {
     struct opt_callback_data opt_cb_data;
     opt_cb_data.params = &params;
     opt_cb_data.opt = opt;
+    opt_cb_data.model = &model;
+    opt_cb_data.lora = &lora;
+    opt_cb_data.last_save_iter = opt->iter;
     opt_cb_data.tokens_data = train_tokens.data();
     opt_cb_data.tokens_size = train_tokens.size();
     opt_cb_data.samples_data = train_samples.data();
@@ -2988,11 +3038,11 @@ int main(int argc, char ** argv) {
     printf("%s: total training time=%f seconds\n", __func__, dd);
 
     if (params.n_examples > 0) {
-        save_checkpoint(&model, &lora, opt, params.fn_checkpoint_out);
+        save_checkpoint(&model, &lora, opt, params.fn_checkpoint_out, params.pattern_fn_it, opt->iter);
     }
 
     if (strlen(params.fn_lora_out) > 0) {
-        save_as_llama_lora(&lora, params.fn_lora_out);
+        save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, opt->iter);
     }
 
     {

From 8b4106ae33085da0f882da17fc95eab1b891eec2 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Mon, 21 Aug 2023 02:24:25 +0200
Subject: [PATCH 097/235] also save latest finetune output with ITERATION="LATEST" and print where files are saved

saving with LATEST makes it easier to resume training from the latest checkpoint

the string "LATEST" can be configured with command line option "--fn-latest STR"

---
 examples/finetune/finetune.cpp | 37 ++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index ad5631da1b65a..eaceb71dff61f 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1865,9 +1865,10 @@ std::string replace_str(const char * s, const char * needle, const char * replac
     return str;
 }
 
-void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename, const char * pattern_it, int iteration) {
-    std::string sit = std::to_string(iteration);
+void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename, const char * pattern_it, int iteration, const char * latest) {
+    std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest);
     std::string fn = replace_str(filename, pattern_it, sit.c_str());
+    printf("%s: saving to %s\n", __func__, fn.c_str());
     struct llama_file file(fn.c_str(), "wb");
     if (file.fp == NULL) {
         return;
@@ -2032,9 +2033,10 @@ bool load_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora,
     return (file.fp != NULL);
 }
 
-void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, const char * pattern_it, int iteration) {
-    std::string sit = std::to_string(iteration);
+void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, const char * pattern_it, int iteration, const char * latest) {
+    std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest);
     std::string fn = replace_str(filename, pattern_it, sit.c_str());
+    printf("%s: saving to %s\n", __func__, fn.c_str());
     struct llama_file file(fn.c_str(), "wb");
     if (file.fp == NULL) {
         return;
@@ -2102,6 +2104,7 @@ struct train_params {
     const char * fn_checkpoint_out;
     const char * fn_lora_out;
     const char * pattern_fn_it;
+    const char * fn_latest;
 
     int save_every;
 
@@ -2173,6 +2176,7 @@ struct train_params get_default_train_params() {
     params.fn_checkpoint_out = "checkpoint-ITERATION.bin";
     params.fn_lora_out       = "ggml-lora-ITERATION-f32.bin";
     params.pattern_fn_it     = "ITERATION";
+    params.fn_latest         = "LATEST";
 
     params.save_every = 10;
 
@@ -2248,7 +2252,8 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
     fprintf(stderr, "  --checkpoint-out FNAME     path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
     fprintf(stderr, "  --lora-out FNAME           path to save llama lora (default '%s')\n", params->fn_lora_out);
     fprintf(stderr, "  --pattern-fn-it STR        pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it);
-    fprintf(stderr, "  --save-every N             save checkpoint and lora every N iterations. Disabled when N <= 0. (default '%s')\n", params->save_every);
+    fprintf(stderr, "  --fn-latest STR            string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest);
+    fprintf(stderr, "  --save-every N             save checkpoint and lora every N iterations. Disabled when N <= 0. (default '%d')\n", params->save_every);
     fprintf(stderr, "  -s SEED, --seed SEED       RNG seed (default: -1, use random seed for -1)\n");
     fprintf(stderr, "  -c N, --ctx N              Context size used during training (default %d)\n", params->n_ctx);
     fprintf(stderr, "  -t N, --threads N          Number of threads (default %d)\n", params->n_threads);
@@ -2352,6 +2357,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
                 break;
             }
             params->pattern_fn_it = argv[i];
+        } else if (arg == "--fn-latest") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_latest = argv[i];
         } else if (arg == "--save-every") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -2669,11 +2680,13 @@ void opt_callback(void * vdata, float * sched) {
     const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
     if (save_now) {
         if (strlen(params->fn_checkpoint_out) > 0) {
-            save_checkpoint(data->model, data->lora, opt, params->fn_checkpoint_out, params->pattern_fn_it, opt->iter);
-        }
+            save_checkpoint(data->model, data->lora, opt, params->fn_checkpoint_out, params->pattern_fn_it, opt->iter, params->fn_latest);
+            save_checkpoint(data->model, data->lora, opt, params->fn_checkpoint_out, params->pattern_fn_it, -1, params->fn_latest);
+        }
         if (strlen(params->fn_lora_out) > 0) {
-            save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, opt->iter);
-        }
+            save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, opt->iter, params->fn_latest);
+            save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, -1, params->fn_latest);
+        }
         data->last_save_iter = opt->iter;
     }
 
@@ -3038,11 +3051,13 @@ int main(int argc, char ** argv) {
     printf("%s: total training time=%f seconds\n", __func__, dd);
 
     if (params.n_examples > 0) {
-        save_checkpoint(&model, &lora, opt, params.fn_checkpoint_out, params.pattern_fn_it, opt->iter);
+        save_checkpoint(&model, &lora, opt, params.fn_checkpoint_out, params.pattern_fn_it, opt->iter, params.fn_latest);
+        save_checkpoint(&model, &lora, opt, params.fn_checkpoint_out, params.pattern_fn_it, -1, params.fn_latest);
     }
 
     if (strlen(params.fn_lora_out) > 0) {
-        save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, opt->iter);
+        save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, opt->iter, params.fn_latest);
+        save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, -1, params.fn_latest);
     }
 
     {

From 77a3092c830d58a9dfac26aa0445dac57f7b4f3e Mon Sep 17 00:00:00 2001
From: xaedes
Date: Wed, 23 Aug 2023 19:34:45 +0200
Subject: [PATCH 098/235] update checkpoint train stats before saving via "--save-every"

---
 examples/finetune/finetune.cpp | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index eaceb71dff61f..f4beb59e25761 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -2676,17 +2676,23 @@ void opt_callback(void * vdata, float * sched) {
     struct train_params * params = data->params;
     struct ggml_opt_context * opt = data->opt;
     int n_batch = params->n_batch;
+    int n_ctx   = params->n_ctx;
 
     const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
     if (save_now) {
+        int new_iters = opt->iter - data->last_save_iter;
+        data->lora->train_its     += new_iters;
+        data->lora->train_samples += new_iters * n_batch;
+        data->lora->train_tokens  += new_iters * n_batch * n_ctx;
+
         if (strlen(params->fn_checkpoint_out) > 0) {
             save_checkpoint(data->model, data->lora, opt, params->fn_checkpoint_out, params->pattern_fn_it, opt->iter, params->fn_latest);
             save_checkpoint(data->model, data->lora, opt, params->fn_checkpoint_out, params->pattern_fn_it, -1, params->fn_latest);
-        }
+        }
         if (strlen(params->fn_lora_out) > 0) {
             save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, opt->iter, params->fn_latest);
             save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, -1, params->fn_latest);
-        }
+        }
         data->last_save_iter = opt->iter;
     }
 
@@ -3004,10 +3010,6 @@ int main(int argc, char ** argv) {
 
         size_t used_mem_after_opt = ggml_used_mem(ctx0);
 
-        int n_iter = params.use_adam ? params.adam_n_iter : params.lbfgs_n_iter;
-        lora.train_its = opt->iter;
-        lora.train_samples += n_batch * n_iter;
-        lora.train_tokens  += n_batch * n_tokens * n_iter;
 
         if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) {
             printf("Example %d, opt iter %d\n", ex, opt->iter);
@@ -3050,6 +3052,11 @@ int main(int argc, char ** argv) {
     double dd = (double) d * 1e-3;
     printf("%s: total training time=%f seconds\n", __func__, dd);
 
+    int new_iters = opt->iter - opt_cb_data.last_save_iter;
+    lora.train_its     += new_iters;
+    lora.train_samples += new_iters * n_batch;
+    lora.train_tokens  += new_iters * n_batch * n_tokens;
+
     if (params.n_examples > 0) {
         save_checkpoint(&model, &lora, opt, params.fn_checkpoint_out, params.pattern_fn_it, opt->iter, params.fn_latest);
         save_checkpoint(&model, &lora, opt, params.fn_checkpoint_out, params.pattern_fn_it, -1, params.fn_latest);
@@ -3060,6 +3067,8 @@ int main(int argc, char ** argv) {
         save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, -1, params.fn_latest);
     }
 
+    opt_cb_data.last_save_iter = opt->iter;
+
     {
         int n_gen = params.n_predict;
         int sample_ctx = n_tokens - n_tokens/8;

From 1a5f0a30e0eac106a08ab1c40d5b603dfecbd3f0 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Wed, 23 Aug 2023 20:00:48 +0200
Subject: [PATCH 099/235] add command line option `--rank-wo N` for rank of wo tensor

---
 examples/finetune/finetune.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index f4beb59e25761..77d0cacda5162 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -2270,6 +2270,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
     fprintf(stderr, "  --rank-wq N                LORA rank for wq tensor (default %d)\n", params->n_rank_wq);
     fprintf(stderr, "  --rank-wk N                LORA rank for wk tensor (default %d)\n", params->n_rank_wk);
     fprintf(stderr, "  --rank-wv N                LORA rank for wv tensor (default %d)\n", params->n_rank_wv);
+    fprintf(stderr, "  --rank-wo N                LORA rank for wo tensor (default %d)\n", params->n_rank_wo);
     fprintf(stderr, "  --rank-w1 N                LORA rank for w1 tensor (default %d)\n", params->n_rank_w1);
     fprintf(stderr, "  --rank-w2 N                LORA rank for w2 tensor (default %d)\n", params->n_rank_w2);
     fprintf(stderr, "  --rank-w3 N                LORA rank for w3 tensor (default %d)\n", params->n_rank_w3);
@@ -2465,6 +2466,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
                 break;
             }
             params->n_rank_wv = std::stoi(argv[i]);
+        } else if (arg == "--rank-wo") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->n_rank_wo = std::stoi(argv[i]);
         } else if (arg == "--rank-w1") {
             if (++i >= argc) {
                 invalid_param = true;

From 7df517c797872e5b235659046099b5a60daa64b4 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Wed, 23 Aug 2023 20:08:48 +0200
Subject: [PATCH 100/235] update finetune README

---
 examples/finetune/README.md | 55 +++++++++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/examples/finetune/README.md b/examples/finetune/README.md
index 726ec47c0ce4f..ea17c38d9ee3d 100644
--- a/examples/finetune/README.md
+++ b/examples/finetune/README.md
@@ -1,4 +1,4 @@
-# train-text-from-scratch
+# finetune
 
 Basic usage instructions:
 
@@ -6,17 +6,50 @@ Basic usage instructions:
 # get training data
 wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
 
-# train
-./bin/train-text-from-scratch \
-        --vocab-model ../models/ggml-vocab.bin \
-        --ctx 64 --embd 256 --head 8 --layer 16 \
-        --checkpoint-in  chk-shakespeare-256x16.bin \
-        --checkpoint-out chk-shakespeare-256x16.bin \
-        --model-out ggml-shakespeare-256x16-f32.bin \
+# finetune LORA adapter
+./bin/finetune \
+        --model-base open-llama-3b-v2-q8_0.bin \
+        --checkpoint-in  chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
+        --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
+        --model-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
         --train-data "shakespeare.txt" \
-        -t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \
-        --print-details-interval 0 --predict 16 --use-flash
+        --save-every 10 \
+        --threads 6 --adam-iter 30 --batch 4 --ctx 64 \
+        --print-details-interval 0 --predict 0 \
+        --use-checkpointing --use-alloc \
+        --mem-lora 2 --mem-compute 1 --mem-compute0 20
 
 # predict
-./bin/main -m ggml-shakespeare-256x16-f32.bin
+./bin/main -m open-llama-3b-v2-q8_0.bin --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
 ```
+
+Finetune output files will be saved every N iterations (configurable with `--save-every N`).
+The pattern "ITERATION" in the output filenames will be replaced by the iteration number, and by "LATEST" for the latest output.
+
+Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
+If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
+
+To change the amount of memory for finetuning with the memory allocator (`--use-alloc`, used by default), you can use `--mem-compute0 N` to specify the number of gigabytes.
+
+After training, text is generated using the trained LORA.
+But this text prediction is not optimized as well as it is in `main`.
+It may result in an out-of-memory crash; to disable the text prediction after training, use `--predict 0`.
+
+The LORA rank is configured for each model tensor type separately with these command line options:
+
+```bash
+  --rank-att-norm N          LORA rank for attention norm tensor (default 1)
+  --rank-ffn-norm N          LORA rank for feed-forward norm tensor (default 1)
+  --rank-out-norm N          LORA rank for output norm tensor (default 1)
+  --rank-tok-embd N          LORA rank for token embeddings tensor (default 4)
+  --rank-out N               LORA rank for output tensor (default 4)
+  --rank-wq N                LORA rank for wq tensor (default 4)
+  --rank-wk N                LORA rank for wk tensor (default 4)
+  --rank-wv N                LORA rank for wv tensor (default 4)
+  --rank-wo N                LORA rank for wo tensor (default 4)
+  --rank-w1 N                LORA rank for w1 tensor (default 4)
+  --rank-w2 N                LORA rank for w2 tensor (default 4)
+  --rank-w3 N                LORA rank for w3 tensor (default 4)
+```
+
+To see all available options use `finetune --help`.
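A note on the pattern mechanics described in the README above: every periodic save writes each output twice, once with the iteration number substituted for the "ITERATION" pattern and once with "LATEST" substituted instead, so the LATEST file always holds the most recent state. A sketch of the resulting workflow (hypothetical file names, following the defaults shown in this README):

```bash
# with --save-every 10, three saves produce e.g.:
#   chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
#   chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
#   chk-lora-open-llama-3b-v2-q8_0-shakespeare-30.bin
#   chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin   # overwritten at every save
# to resume training, point --checkpoint-in at the LATEST file:
./bin/finetune \
        --model-base open-llama-3b-v2-q8_0.bin \
        --checkpoint-in  chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
        --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
        --train-data "shakespeare.txt" \
        --save-every 10
```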
From aecc3b38904718b80a11aa94ce44577122b26907 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Tue, 29 Aug 2023 00:39:59 +0200
Subject: [PATCH 101/235] fix dump_non_result_info_yaml to output multiple lora adapters

---
 common/common.cpp | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 1a07107c3532c..8503da88abe3a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1046,7 +1046,20 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
         fprintf(stream, "   %d: %f", lb.first, lb.second);
     }
 
-    fprintf(stream, "lora: %s\n", params.lora_adapter.c_str());
+    fprintf(stream, "lora:\n");
+    for (std::tuple<std::string, float> la : params.lora_adapter) {
+        if (std::get<1>(la) != 1.0f) {
+            continue;
+        }
+        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
+    }
+    fprintf(stream, "lora_scaled:\n");
+    for (std::tuple<std::string, float> la : params.lora_adapter) {
+        if (std::get<1>(la) == 1.0f) {
+            continue;
+        }
+        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
+    }
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);

From aa8016e95d2ff1b74d74ea186bb7d9cf0e5fd829 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Tue, 29 Aug 2023 00:40:30 +0200
Subject: [PATCH 102/235] bug fix: replace GGML_TYPE_SIZE[t] by ggml_type_size(t)

---
 ggml.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index ba44c5fbe0a6a..681891c8c98dc 100644
--- a/ggml.c
+++ b/ggml.c
@@ -11418,7 +11418,7 @@ static void ggml_compute_forward_out_prod_q_f32(
     GGML_ASSERT(ne3 == ne13);
 
     // we don't support permuted src0 dim0
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
 
     // dst dim0 cannot be transposed or permuted
     GGML_ASSERT(nb0 == sizeof(float));
@@ -17507,7 +17507,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     size_t cur = 0;
 
                     if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
 
                     work_size = MAX(work_size, cur);

From daedc6f419f1a5b42f238e89364f4d15dd54e423 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Tue, 29 Aug 2023 00:40:53 +0200
Subject: [PATCH 103/235] replace llama_n_mult by llama_n_ff

---
 llama.cpp | 8 ++++----
 llama.h   | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 4a73a62ef2867..58b5a6b444053 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5572,8 +5572,8 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
-int llama_n_mult(const struct llama_context * ctx) {
-    return ctx->model.hparams.n_mult;
+int llama_n_ff(const struct llama_context * ctx) {
+    return ctx->model.hparams.n_ff;
 }
 
 int llama_n_head(const struct llama_context * ctx) {
@@ -5604,8 +5604,8 @@ int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
-int llama_model_n_mult(const struct llama_model * model) {
-    return model->hparams.n_mult;
+int llama_model_n_ff(const struct llama_model * model) {
+    return model->hparams.n_ff;
 }
 
 int llama_model_n_head(const struct llama_model * model) {
diff --git a/llama.h b/llama.h
index 8062013c1d551..16f01cc18e131 100644
--- a/llama.h
+++ b/llama.h
@@ -247,7 +247,7 @@ extern "C" {
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
-    LLAMA_API int llama_n_mult (const struct llama_context * ctx);
+    LLAMA_API int llama_n_ff   (const struct llama_context * ctx);
     LLAMA_API int llama_n_head (const struct llama_context * ctx);
     LLAMA_API int llama_n_rot  (const struct llama_context * ctx);
     LLAMA_API int llama_n_layer(const struct llama_context * ctx);
@@ -257,7 +257,7 @@ extern "C" {
     LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
     LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
     LLAMA_API int llama_model_n_embd (const struct llama_model * model);
-    LLAMA_API int llama_model_n_mult (const struct llama_model * model);
+    LLAMA_API int llama_model_n_ff   (const struct llama_model * model);
     LLAMA_API int llama_model_n_head (const struct llama_model * model);
     LLAMA_API int llama_model_n_rot  (const struct llama_model * model);
     LLAMA_API int llama_model_n_layer(const struct llama_model * model);

From 5ce92aed37e520fe1282b3d16cbe892e3f3b9ef3 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Tue, 29 Aug 2023 00:41:19 +0200
Subject: [PATCH 104/235] finetune bug fixes to compile with merged in code from master

---
 examples/finetune/finetune.cpp | 108 ++++++++++++++++-----------------
 1 file changed, 54 insertions(+), 54 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 77d0cacda5162..38fba3b8d620a 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -17,8 +17,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
-
 struct random_normal_distribution {
     std::mt19937 gen;
     std::normal_distribution<float> rd;
@@ -195,11 +193,13 @@ struct my_llama_hparams {
     uint32_t n_vocab = 32000;
     uint32_t n_ctx   = 512;   // this is provided as user input?
     uint32_t n_embd  = 4096;
-    uint32_t n_mult  = 4;
+    uint32_t n_ff    = 11008;
     uint32_t n_head  = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot   = 64;
 
+    float f_rms_norm_eps = 1e-5f;
+
     bool operator!=(const my_llama_hparams& other) const {
         return memcmp(this, &other, sizeof(other));
     }
@@ -304,18 +304,12 @@ struct my_llama_lora {
     uint32_t train_tokens = 0;
 };
 
-uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
-    const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
-    return n_ff;
-}
-
 void print_params(struct my_llama_hparams * params) {
     printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
     printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
     printf("%s: n_embd:  %u\n", __func__, params->n_embd);
-    printf("%s: n_mult:  %u\n", __func__, params->n_mult);
+    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
     printf("%s: n_head:  %u\n", __func__, params->n_head);
-    printf("%s: n_ff:    %u\n", __func__, get_n_ff(params));
     printf("%s: n_layer: %u\n", __func__, params->n_layer);
     printf("%s: n_rot:   %u\n", __func__, params->n_rot);
 }
@@ -338,19 +332,18 @@ void print_lora_params(struct my_llama_lora_hparams * params) {
 void init_model(struct llama_model * input, struct my_llama_model * model, uint32_t n_ctx) {
     auto & hparams = model->hparams;
 
-    hparams.n_vocab = llama_n_vocab_from_model(input);
+    hparams.n_vocab = llama_model_n_vocab(input);
     hparams.n_ctx   = n_ctx;
-    hparams.n_embd  = llama_n_embd_from_model(input);
-    hparams.n_mult  = llama_n_mult_from_model(input);
-    hparams.n_head  = llama_n_head_from_model(input);
-    hparams.n_layer = llama_n_layer_from_model(input);
-    hparams.n_rot   = llama_n_rot_from_model(input);
+    hparams.n_embd  = llama_model_n_embd(input);
+    hparams.n_ff    = llama_model_n_ff(input);
+    hparams.n_head  = llama_model_n_head(input);
+    hparams.n_layer = llama_model_n_layer(input);
+    hparams.n_rot   = llama_model_n_rot(input);
 
     const uint32_t n_embd  = hparams.n_embd;
     const uint32_t n_layer = hparams.n_layer;
     const uint32_t n_vocab = hparams.n_vocab;
-
-    const uint32_t n_ff = get_n_ff(&hparams);
+    const uint32_t n_ff    = hparams.n_ff;
 
     model->tok_embeddings = llama_get_model_tensor(input, "tok_embeddings.weight");
     model->norm           = llama_get_model_tensor(input, "norm.weight");
@@ -398,7 +391,7 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora)
     const uint32_t n_embd  = model->hparams.n_embd;
     const uint32_t n_layer = model->hparams.n_layer;
     const uint32_t n_vocab = model->hparams.n_vocab;
-    const uint32_t n_ff    = get_n_ff(&model->hparams);
+    const uint32_t n_ff    = model->hparams.n_ff;
 
     struct ggml_context * ctx = lora->ctx;
 
@@ -603,6 +596,8 @@ struct ggml_tensor * forward(
     const int n_head  = hparams.n_head;
     const int n_rot   = hparams.n_rot;
 
+    const float rms_norm_eps = hparams.f_rms_norm_eps;
+
     GGML_ASSERT(n_layer == lora->layers.size());
 
     struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -1082,7 +1077,8 @@ struct ggml_tensor * llama_build_lora_finetune_graphs(
     const int n_layer = hparams.n_layer;
     const int n_head  = hparams.n_head;
     const int n_rot   = hparams.n_rot;
-    const int n_ff    = get_n_ff(&hparams);
+    const int n_ff    = hparams.n_ff;
+    const float rms_norm_eps = hparams.f_rms_norm_eps;
     const int rope_mode = 0;
 
     GGML_ASSERT(n_layer == lora->layers.size());
@@ -1317,7 +1313,7 @@ void print_matrix(struct ggml_tensor * probs) {
 
 
 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
+    printf("%s", llama_token_get_text(ctx, token));
 }
 
 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -1351,7 +1347,7 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
     }
 }
 
-void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab  = target_logits->ne[0];
 
@@ -1360,7 +1356,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
     ggml_set_f32(target_logits, -1.0f/n_vocab);
     ggml_set_f32(target_probs, 0.0f);
 
-    ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
+    ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx));
     for (int i=1; i<n_tokens+1; ++i) {
         int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
         set_f32_2d(target_logits, token, i-1, +1.0f);
@@ -1374,7 +1370,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
     }
 }
 
-void get_example_targets_batch(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
     GGML_ASSERT(tokens_input->n_dims  == 2);
     GGML_ASSERT(target_logits->n_dims == 3);
     GGML_ASSERT(target_probs->n_dims  == 3);
@@ -1394,7 +1390,7 @@ void get_example_targets_batch(const int * train_samples, size_t n_train_samples
         // printf("%s: sample_idx=%zu sample=%zu\n", __func__, sample_idx, sample);
         GGML_ASSERT(sample+n_tokens-1 < n_train_data);
 
-        set_i32_2d(tokens_input, 0, k, llama_token_bos());
+        set_i32_2d(tokens_input, 0, k, llama_token_bos(lctx));
         for (int i=1; i<n_tokens+1; ++i) {
             int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
             set_f32_3d(target_logits, token, i-1, k, +1.0f);
@@ -1426,7 +1422,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
         for (int i = 0; i < n_tokens; ++i) {
-            std::string s = llama_token_to_str(lctx, out[i]);
+            std::string s = llama_token_get_text(lctx, out[i]);
             int len = s.length();
             if (in >= end) {
                 printf("%s: unexpected end of original text.\n", __func__);
@@ -1617,7 +1613,7 @@ void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx)
     sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau;
 }
 
-llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) {
+llama_token sample(struct llama_context * lctx, struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) {
     GGML_ASSERT(sampler->ctx != NULL);
 
     struct llama_context * ctx = sampler->ctx;
@@ -1638,7 +1634,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
     const auto params = sampler->params;
 
     // Apply penalties
-    const float nl_logit = logits[llama_token_nl()];
+    const float nl_logit = logits[llama_token_nl(lctx)];
 
     const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);
 
@@ -1657,7 +1653,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
         params.presence_penalty);
 
     if (!params.penalize_nl) {
-        logits[llama_token_nl()] = nl_logit;
+        logits[llama_token_nl(lctx)] = nl_logit;
     }
 
     llama_token token = 0;
@@ -1884,7 +1880,7 @@ void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora,
     file.write_u32(lora->train_tokens);
     file.write_u32(model->hparams.n_vocab);
     file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
+    //file.write_u32(model->hparams.n_mult);
     file.write_u32(model->hparams.n_head);
     file.write_u32(model->hparams.n_layer);
     file.write_u32(model->hparams.n_rot);
@@ -1961,7 +1957,7 @@ bool load_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora,
     uint32_t n_rot   = file.read_u32();
     GGML_ASSERT(n_vocab == model->hparams.n_vocab);
     GGML_ASSERT(n_embd  == model->hparams.n_embd);
-    GGML_ASSERT(n_mult  == model->hparams.n_mult);
+    //GGML_ASSERT(n_mult  == model->hparams.n_mult);
     GGML_ASSERT(n_head  == model->hparams.n_head);
    GGML_ASSERT(n_layer == model->hparams.n_layer);
     GGML_ASSERT(n_rot   == model->hparams.n_rot);
@@ -2042,8 +2038,9 @@ void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, cons
         return;
     }
 
+    uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
     // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC_GGLA);   // magic
+    file.write_u32(LLAMA_FILE_MAGIC_LORA);   // magic
     file.write_u32(1); // version
     // write_hparams
     file.write_u32(lora->hparams.lora_r);
@@ -2667,6 +2664,7 @@ struct opt_callback_data {
     struct ggml_opt_context * opt;
     struct my_llama_model * model;
     struct my_llama_lora * lora;
+    struct llama_context * lctx;
     int last_save_iter;
     llama_token * tokens_data;
     size_t tokens_size;
@@ -2728,6 +2726,7 @@ void opt_callback(void * vdata, float * sched) {
         }
 
         get_example_targets_batch(
+            data->lctx,
             data->samples_data,
            data->samples_size,
             data->tokens_data,
@@ -2760,24 +2759,24 @@ int main(int argc, char ** argv) {
     struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_params);
     struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
 
-    struct llama_vocab vocab;
-    {
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
-        vocab.id_to_token.resize(n_vocab);
-        for (int i=0; i<n_vocab; ++i) {
-            std::string tok = std::string(strings[i]);
-            vocab.id_to_token[i].tok   = tok;
-            vocab.id_to_token[i].score = scores[i];
-            vocab.token_to_id.emplace(tok, i);
-        }
-    }
+    //struct llama_vocab vocab;
+    //{
+    //    std::vector<const char *> strings;
+    //    std::vector<float> scores;
+    //    int n_vocab = llama_n_vocab(lctx);
+    //    strings.resize(n_vocab, NULL);
+    //    scores.resize(n_vocab, 0);
+    //    n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
+    //    GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+    //    vocab.id_to_token.resize(n_vocab);
+    //    for (int i=0; i<n_vocab; ++i) {
+    //        std::string tok = std::string(strings[i]);
+    //        vocab.id_to_token[i].tok   = tok;
+    //        vocab.id_to_token[i].score = scores[i];
+    //        vocab.token_to_id.emplace(tok, i);
+    //    }
+    //}
 
     printf("%s: tokenize training data\n", __func__);
     std::vector<llama_token> train_tokens;
@@ -2911,7 +2910,7 @@ int main(int argc, char ** argv) {
     std::vector<int> train_samples;
     train_samples.push_back(0);
     for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) {
-        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl())) {
+        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) {
             train_samples.push_back(i);
         }
     }
@@ -2929,6 +2928,7 @@ int main(int argc, char ** argv) {
     opt_cb_data.opt = opt;
     opt_cb_data.model = &model;
     opt_cb_data.lora = &lora;
+    opt_cb_data.lctx = lctx;
     opt_cb_data.last_save_iter = opt->iter;
     opt_cb_data.tokens_data = train_tokens.data();
     opt_cb_data.tokens_size = train_tokens.size();
@@ -3031,7 +3031,7 @@ int main(int argc, char ** argv) {
             for (int i=0; i<n_batch; ++i) {
                 for (int k=0; k<n_tokens; ++k) {
-                    llama_token token = sample(&sampler,
+                    llama_token token = sample(lctx, &sampler,
                         (float *) ((char *) logits->data + i*logits->nb[2] + k*logits->nb[1]),
                         (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]),
                         k);
@@ -3101,7 +3101,7 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * target_logits = ggml_new_tensor_2d(lora.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
         struct ggml_tensor * target_probs  = ggml_new_tensor_2d(lora.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
 
-        get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
+        get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
         for (int i=sample_ctx; i<n_tokens; ++i) {
             ggml_set_i32_1d(tokens_input, i, n_vocab/2);
         }
@@ -3110,7 +3110,7 @@ int main(int argc, char ** argv) {
-            llama_token token = sample(&sampler,
+            llama_token token = sample(lctx, &sampler,
                 (float *) ((char *) logits->data + (sample_ctx-1)*logits->nb[1]),
                 (llama_token *) tokens_input->data,
                 sample_ctx-1);

From 271c0300de977c1d4bb0ccada5421823be2dd0c1 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Tue, 29 Aug 2023 00:50:59 +0200
Subject: [PATCH 105/235] remove prediction related code to reduce duplicated code with main

use main instead

---
 examples/finetune/finetune.cpp | 615 +--------------------------------
 1 file changed, 2 insertions(+), 613 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 38fba3b8d620a..0e14c1ae3db9b 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -61,17 +61,6 @@ float frand_uniform(struct random_uniform_distribution * rnd) {
     return rnd->rd(rnd->gen);
 }
 
-void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
     float scale = 1.0f; // xavier
     switch (tensor->n_dims) {
@@ -165,17 +154,6 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
     return tensor;
 }
 
-struct my_llama_kv_cache {
-    struct ggml_context * ctx = NULL;
-
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    // llama_ctx_buffer buf;
-
-    int n; // number of tokens currently in the cache
-};
-
 struct llama_vocab {
     using id = int32_t;
     using token = std::string;
@@ -540,293 +518,6 @@ void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std
     }
 }
 
-bool init_kv_cache(struct my_llama_kv_cache* cache, struct my_llama_model * model, int n_batch) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_ctx   = hparams.n_ctx;
-    const uint32_t n_embd  = hparams.n_embd;
-    const uint32_t n_layer = hparams.n_layer;
-
-    const int64_t n_mem      = n_layer*n_ctx*n_batch;
-    const int64_t n_elements = n_embd*n_mem;
-
-    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-
-    // struct ggml_init_params params;
-    // params.mem_size   = cache.buf.size;
-    // params.mem_buffer = cache.buf.addr;
-    // params.no_alloc   = false;
-    if (!cache->ctx) {
-        struct ggml_init_params params;
-        params.mem_size   = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024;
-        params.mem_buffer = NULL;
-        params.no_alloc   = false;
-
-        cache->ctx = ggml_init(params);
-
-        if (!cache->ctx) {
-            fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
-            return false;
-        }
-    }
-
-    cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
-    cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
-
-    return true;
-}
-
-struct ggml_tensor * forward(
-    struct my_llama_model    * model,
-    struct my_llama_lora     * lora,
-    struct my_llama_kv_cache * cache,
-    struct ggml_context      * ctx0,
-    struct ggml_cgraph       * gf,
-    struct ggml_tensor       * tokens_input,
-    const int                  n_tokens,
-    const int                  n_past) {
-
-    const int N = n_tokens;
-
-    struct my_llama_kv_cache& kv_self = *cache;
-    const auto & hparams = model->hparams;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_head  = hparams.n_head;
-    const int n_rot   = hparams.n_rot;
-
-    const float rms_norm_eps = hparams.f_rms_norm_eps;
-
-    GGML_ASSERT(n_layer == lora->layers.size());
-
-    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens));
-
-    struct ggml_tensor * kc = kv_self.k;
-    struct ggml_tensor * vc = kv_self.v;
-
-    struct ggml_tensor * tok_embeddings = ggml_add(ctx0, model->tok_embeddings, ggml_mul_mat(ctx0, lora->tok_embeddings_a, lora->tok_embeddings_b));
-    struct ggml_tensor * norm   = ggml_add(ctx0, model->norm,   ggml_mul_mat(ctx0, lora->norm_a,   lora->norm_b));
-    struct ggml_tensor * output = ggml_add(ctx0, model->output, ggml_mul_mat(ctx0, lora->output_a, lora->output_b));
-
-
-    // inpL shape [n_embd,N,1,1]
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, tok_embeddings, tokens);
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * inpSA = inpL;
-
-        struct ggml_tensor * cur;
-
-        // lctx.use_buf(ctx0, 0);
-        struct ggml_tensor * attention_norm = ggml_add(ctx0, model->layers[il].attention_norm, ggml_mul_mat(ctx0, lora->layers[il].attention_norm_a, lora->layers[il].attention_norm_b));
-        struct ggml_tensor * ffn_norm = ggml_add(ctx0, model->layers[il].ffn_norm, ggml_mul_mat(ctx0, lora->layers[il].ffn_norm_a, lora->layers[il].ffn_norm_b));
-        struct ggml_tensor * wq = ggml_add(ctx0, model->layers[il].wq, ggml_mul_mat(ctx0, lora->layers[il].wq_a, lora->layers[il].wq_b));
-        struct ggml_tensor * wk = ggml_add(ctx0, model->layers[il].wk, ggml_mul_mat(ctx0, lora->layers[il].wk_a, lora->layers[il].wk_b));
-        struct ggml_tensor * wv = ggml_add(ctx0, model->layers[il].wv, ggml_mul_mat(ctx0, lora->layers[il].wv_a, lora->layers[il].wv_b));
-        struct ggml_tensor * wo = ggml_add(ctx0, model->layers[il].wo, ggml_mul_mat(ctx0, lora->layers[il].wo_a, lora->layers[il].wo_b));
-        struct ggml_tensor * w1 = ggml_add(ctx0, model->layers[il].w1, ggml_mul_mat(ctx0, lora->layers[il].w1_a, lora->layers[il].w1_b));
-        struct ggml_tensor * w2 = ggml_add(ctx0, model->layers[il].w2, ggml_mul_mat(ctx0, lora->layers[il].w2_a, lora->layers[il].w2_b));
-        struct ggml_tensor * w3 = ggml_add(ctx0, model->layers[il].w3, ggml_mul_mat(ctx0, lora->layers[il].w3_a, lora->layers[il].w3_b));
-
-        // norm
-        {
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                ggml_repeat(ctx0,
-                    attention_norm,
-                    cur),
-                cur);
-        }
-
-        // self-attention
-        {
-            // compute Q and K and RoPE them
-            // wq   shape [n_embd, n_embd, 1, 1]
-            // wk   shape [n_embd, n_embd, 1, 1]
-            // Qcur shape [n_embd/n_head, n_head, N, 1]
-            // Kcur shape [n_embd/n_head, n_head, N, 1]
-
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
-
-            // store key and value to memory
-            {
-                // compute the transposed [N, n_embd] V matrix
-                // wv   shape [n_embd, n_embd, 1, 1]
-                // Vcur shape [n_embd, N, 1, 1]
-                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, wv, cur), n_embd, N)));
-
-                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
-                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
-                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
-
-                /* {
-                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                            (   n_ctx)*ggml_element_size(kv_self.v),
-                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
-
-                    // important: storing RoPE-ed version of K in the KV cache!
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-                } //*/
-
-                kc = ggml_set_1d_inplace(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                vc = ggml_set_2d_inplace(ctx0, vc, Vcur, (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
-            }
-
-            // Qcur shape [n_embd/n_head, n_head, N, 1]
-            // Q    shape [n_embd/n_head, N, n_head, 1]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-
-            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-            // K         shape [n_embd/n_head, n_past + N, n_head, 1]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // K * Q
-            // KQ shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // KQ_masked shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // KQ_soft_max shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // split cached V into n_head heads
-            //// V shape [n_past + N, n_embd/n_head, n_head, 1]
-            // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1]
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, vc,
-                        n_past + N, n_embd/n_head, n_head,
-                        n_ctx*ggml_element_size(vc),
-                        n_ctx*ggml_element_size(vc)*n_embd/n_head,
-                        il*n_ctx*ggml_element_size(vc)*n_embd);
-
-            // KQV shape [n_embd/n_head, N, n_head, 1]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // KQV_merged shape [n_embd/n_head, n_head, N, 1]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            // KQV_merged shape
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N);
-            // cur = ggml_cpy(ctx0,
-            //         KQV_merged,
-            //         ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection (no bias)
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    wo,
-                    cur);
-        }
-
-        // lctx.use_buf(ctx0, 1);
-
-        // inpFF shape [n_embd,N,1,1]
-        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
-
-        // feed-forward network
-        {
-            // norm
-            {
-                // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
-
-                // cur = ffn_norm*cur
-                // cur shape [n_embd,N,1,1]
-                cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0,
-                            ffn_norm,
-                            cur),
-                        cur);
-            }
-
-            // tmp shape [n_ff,N,1,1]
-            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
-                    w3,
-                    cur);
-
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    w1,
-                    cur);
-
-            // SILU activation
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_silu(ctx0, cur);
-
-            // cur shape [n_ff,N,1,1]
-            cur = ggml_mul(ctx0, cur, tmp);
-
-            // cur shape [n_embd,N,1,1]
-            cur = ggml_mul_mat(ctx0,
-                    w2,
-                    cur);
-        }
-
-        // cur shape [n_embd,N,1,1]
-        cur = ggml_add(ctx0, cur, inpFF);
-
-        // input for next layer
-        // inpL shape [n_embd,N,1,1]
-        inpL = cur;
-    }
-
-    // norm
-    {
-
-        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
-
-        // inpL = norm*inpL
-        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0,
-                        norm,
-                        inpL),
-                    inpL);
-
-        //embeddings = inpL;
-    }
-
-    // lm_head
-    // inpL shape [n_vocab,N,1,1]
-    inpL = ggml_mul_mat(ctx0, output, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-
-    return inpL;
-}
-
 void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
     GGML_ASSERT(tensor->n_dims == 1);
     GGML_ASSERT(tensor->ne[0] == ne0);
@@ -1292,61 +983,6 @@ int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
     return *ptr;
 }
 
-void print_row(struct ggml_tensor * probs, int i) {
-    for (int k = 0; k < probs->ne[0]; ++k) {
-        float p = get_f32_2d(probs, k, i);
-        printf(" %.2f", p);
-    }
-    printf("\n");
-}
-
-void print_matrix(struct ggml_tensor * probs) {
-    assert(probs->n_dims == 2);
-    for (int i = 0; i < probs->ne[1]; ++i) {
-        for (int k = 0; k < probs->ne[0]; ++k) {
-            float p = get_f32_2d(probs, k, i);
-            printf(" %.2f", p);
-        }
-        printf("\n");
-    }
-}
-
-
-void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_get_text(ctx, token));
-}
-
-void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
-    for (int i=0; i<tokens->ne[0]; ++i) {
-        int token = ggml_get_i32_1d(tokens, i);
-        print_token(ctx, token);
-    }
-}
-
-void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) {
-    for (int i1=0; i1<tokens->ne[1]; ++i1) {
-        //int num_newline = 0;
-        for (int i0=0; i0<tokens->ne[0]; ++i0) {
-            int token = get_i32_2d(tokens, i0, i1);
-            print_token(ctx, token);
-            // bool isnl = (token == llama_token_nl());
-            // if (isnl) {
-            //     ++num_newline;
-            // }
-            // if (isnl) {
-            //     if (num_newline < 2) {
-            //         print_token(ctx, token);
-            //     } else {
-            //         printf("\\n");
-            //     }
-            // } else {
-            //     print_token(ctx, token);
-            // }
        }
-        printf("\n--\n");
-    }
-}
-
 void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab  = target_logits->ne[0];
@@ -1402,19 +1038,6 @@ void get_example_targets_batch(struct llama_context* lctx, const int * train_sam
     }
 }
 
-
-void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs, int n_shift) {
-    int n_tokens = tokens_input->ne[0];
-    int n_vocab  = target_logits->ne[0];
-    for (int i=0; i<n_tokens-n_shift; ++i) {
-        ggml_set_i32_1d(tokens_input, i, ggml_get_i32_1d(tokens_input, i + n_shift));
-        for (int k=0; k<n_vocab; ++k) {
-            ggml_set_f32_2d(target_logits, k, i, ggml_get_f32_2d(target_logits, k, i + n_shift));
-            ggml_set_f32_2d(target_probs,  k, i, ggml_get_f32_2d(target_probs,  k, i + n_shift));
-        }
-    }
-}
-
@@ -1529,93 +1064,6 @@ void get_example_targets_batch(struct llama_context* lctx, const int * train_sam
-struct my_llama_sampler {
-    struct llama_context * ctx = NULL;
-    my_llama_sampler_params params;
-
-    int n_vocab = 0;
-    int n_ctx = 0;
-
-    float mirostat_mu;
-
-    std::vector<llama_token_data> candidates;
-    llama_token_data_array candidates_p;
-
-};
-
-void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx) {
-    sampler->ctx = ctx;
-    sampler->n_vocab = llama_n_vocab(sampler->ctx);
-    sampler->n_ctx   = llama_n_ctx(sampler->ctx);
-    sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau;
-}
-
-llama_token sample(struct llama_context * lctx, struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) {
-    GGML_ASSERT(sampler->ctx != NULL);
-
-    struct llama_context * ctx = sampler->ctx;
-
-    sampler->candidates.resize(sampler->n_vocab);
-    for (llama_token token_id = 0; token_id < sampler->n_vocab; ++token_id) {
-        sampler->candidates[token_id].id = token_id;
-        sampler->candidates[token_id].logit = logits[token_id];
-        sampler->candidates[token_id].p = 0.0;
-    }
-
-    llama_token_data_array * candidates_p = & sampler->candidates_p;
-
-    candidates_p->data   = sampler->candidates.data();
-    candidates_p->size   = sampler->candidates.size();
-    candidates_p->sorted = false;
-
-    const auto params = sampler->params;
-
-    // Apply penalties
-    const float nl_logit = logits[llama_token_nl(lctx)];
-
-    const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);
-
-    llama_sample_repetition_penalty(
-        ctx,
-        candidates_p,
-        last_tokens + n_last_tokens - n_last,
-        n_last,
-        params.repeat_penalty);
-    llama_sample_frequency_and_presence_penalties(
-        ctx,
-        candidates_p,
-        last_tokens + n_last_tokens - n_last,
-        n_last,
-        params.frequency_penalty,
-        params.presence_penalty);
-
-    if (!params.penalize_nl) {
-        logits[llama_token_nl(lctx)] = nl_logit;
-    }
-
-    llama_token token = 0;
-    if (params.temp <= 0) {
-        // Greedy sampling
-        token = llama_sample_token_greedy(ctx, candidates_p);
-    } else {
-        if (params.mirostat == 1) {
-            int mirostat_m = 100;
-            llama_sample_temperature(ctx, candidates_p, params.temp);
-            token = llama_sample_token_mirostat(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, mirostat_m, &sampler->mirostat_mu);
-        } else if (params.mirostat == 2) {
-            llama_sample_temperature(ctx, candidates_p, params.temp);
-            token = llama_sample_token_mirostat_v2(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, &sampler->mirostat_mu);
-        } else {
-            // Temperature sampling
-            llama_sample_top_k        (ctx, candidates_p, params.top_k, 1);
-            llama_sample_tail_free    (ctx, candidates_p, params.tfs_z, 1);
-            llama_sample_typical      (ctx, candidates_p, params.typical_p, 1);
-
-            llama_sample_top_p        (ctx, candidates_p, params.top_p, 1);
-            llama_sample_temperature  (ctx, candidates_p, params.temp);
-            token = llama_sample_token(ctx, candidates_p);
-        }
-    }
-    return token;
-}
-
 void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
     if (tensor == NULL) {
         file->write_u32(0);
@@ -2111,7 +1628,6 @@ struct train_params {
     int n_threads;
     int n_batch;
     int n_examples;
-    int n_predict;
 
     int32_t lora_r;
     int32_t lora_alpha;
@@ -2130,7 +1646,6 @@ struct train_params {
     int n_rank_output;
 
     int print_info_interval;
-    int print_details_interval;
 
     bool samples_start_after_nl;
     bool use_adam;
@@ -2183,7 +1698,6 @@ struct train_params get_default_train_params() {
     params.n_threads  = 6;
     params.n_batch    = 8;
     params.n_examples = 1;
-    params.n_predict  = 1024;
 
     params.lora_alpha = 4;
     params.lora_r     = 4;
@@ -2202,7 +1716,6 @@ struct train_params get_default_train_params() {
     params.n_rank_output = 4;
 
     params.print_info_interval    = 1;
-    params.print_details_interval = 2;
 
     params.samples_start_after_nl = false;
     params.use_adam               = true;
@@ -2256,7 +1769,6 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
     fprintf(stderr, "  -t N, --threads N          Number of threads (default %d)\n", params->n_threads);
     fprintf(stderr, "  -b N, --batch N            Parallel batch size (default %d)\n", params->n_batch);
     fprintf(stderr, "  -n N, --examples N         Number of examples to train (default %d)\n", params->n_examples);
-    fprintf(stderr, "  --predict N                Number of tokens to generate after training (default %d)\n", params->n_predict);
     fprintf(stderr, "  --lora-alpha N             LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha);
     fprintf(stderr, "  --lora-r N                 LORA r : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_r);
     fprintf(stderr, "  --rank-att-norm N          LORA rank for attention norm tensor (default %d)\n", params->n_rank_attention_norm);
@@ -2272,7 +1784,6 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
     fprintf(stderr, "  --rank-w2 N                LORA rank for w2 tensor (default %d)\n", params->n_rank_w2);
     fprintf(stderr, "  --rank-w3 N                LORA rank for w3 tensor (default %d)\n", params->n_rank_w3);
     fprintf(stderr, "  --print-info-interval N    Print infos during training each N examples (default %d)\n", params->print_info_interval);
-    fprintf(stderr, "  --print-details-interval N Print details during training each N examples (default %d)\n", params->print_details_interval);
     fprintf(stderr, "  --samples-after-nl         Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off");
     fprintf(stderr, "  --use-lbfgs                Use LBFGS optimizer instead of default Adam\n");
     fprintf(stderr, "  --use-adam                 Use Adam optimizer (default)\n");
@@ -2301,7 +1812,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
     fprintf(stderr, "  --adam-beta2 N             AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
     fprintf(stderr, "  --adam-gclip N             AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
     fprintf(stderr, "  --lbfgs-iter N             Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter);
-    fprintf(stderr, "  --mem-lora N               Memory to allocate for LORA and cache in gigabytes. (default %d)\n", params->mem_lora_gb);
+    fprintf(stderr, "  --mem-lora N               Memory to allocate for LORA in gigabytes. (default %d)\n", params->mem_lora_gb);
     fprintf(stderr, "  --mem-compute N            Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb);
     fprintf(stderr, "  --mem-compute0 N           Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb);
     fprintf(stderr, "\n");
@@ -2397,12 +1908,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
                 break;
             }
             params->n_examples = std::stoi(argv[i]);
-        } else if (arg == "--predict") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_predict = std::stoi(argv[i]);
         } else if (arg == "--lora-alpha") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -2493,12 +1998,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) {
                 break;
            }
             params->print_info_interval = std::stoi(argv[i]);
-        } else if (arg == "--print-details-interval") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->print_details_interval = std::stoi(argv[i]);
         } else if (arg == "--samples-after-nl") {
             params->samples_start_after_nl = true;
         } else if (arg == "--use-lbfgs") {
@@ -2824,17 +2323,12 @@ int main(int argc, char ** argv) {
     }
     printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens);
 
-    struct my_llama_kv_cache kv_self;
-
     struct ggml_init_params lcparams;
     lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_lora_gb);
     lcparams.mem_buffer = NULL;
     lcparams.no_alloc   = false;
 
     lora.ctx = ggml_init(lcparams);
-    kv_self.ctx = lora.ctx;
-
-    my_llama_sampler sampler;
 
     int n_tokens = model.hparams.n_ctx;
     int n_vocab  = model.hparams.n_vocab;
@@ -2886,11 +2380,7 @@ int main(int argc, char ** argv) {
         randomize_lora(&lora, params.seed, 0.0f, 1.0f, -1.0f, +1.0f);
     }
 
-    init_kv_cache(&kv_self, &model, 1);
-    // init_kv_cache(&kv_self, &model, n_batch);
-    init_sampler(&sampler, lctx);
-
-    printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(lora.ctx));
+    printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx));
     // ggml_print_tensor_objects(lora.ctx);
 
     // TODO: use std::vector instead of "new"
@@ -2919,8 +2409,6 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
     }
 
-    std::vector<uint8_t> work_buffer;
-
     printf("%s: begin training\n", __func__);
 
     struct opt_callback_data opt_cb_data;
@@ -2959,8 +2447,6 @@ int main(int argc, char ** argv) {
 
         ggml_set_no_alloc(ctx0, false);
 
         // don't use alloc for input tensors, so we can safely fill them with data
-        struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
-        //struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
         struct ggml_tensor * tokens_input  = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
         struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
         struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
@@ -3026,31 +2512,6 @@ int main(int argc, char ** argv) {
             printf("used_mem_after_opt:  %zu bytes\n", used_mem_after_opt);
         }
 
-        if (params.print_details_interval > 0 && ex % params.print_details_interval == 0) {
-            // set_logits_masked(logits, token_notavail, -1e9);
-            for (int i=0; i<n_batch; ++i) {
-                for (int k=0; k<n_tokens; ++k) {
-                    llama_token token = sample(lctx, &sampler,
-                        (float *) ((char *) logits->data + i*logits->nb[2] + k*logits->nb[1]),
-                        (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]),
-                        k);
-                    * ((int32_t *) ((char *) after_opt_best_samples->data + i*after_opt_best_samples->nb[1] + k*after_opt_best_samples->nb[0])) = token;
-                }
-            }
-
-            // printf("probabilities after optimization:\n");
-            // print_matrix(after_opt_probs);
-            printf("Example:\n---\n");
-            print_tokens_batch(lctx, tokens_input);
-            printf("\n---\n");
-
-            // printf("best samples after optimization:\n---\n");
-            printf("samples after optimization:\n---\n");
-            print_tokens_batch(lctx, after_opt_best_samples);
-            printf("\n---\n");
-        }
-
         ggml_free(ctx0);
     }
@@ -3076,78 +2537,6 @@ int main(int argc, char ** argv) {
 
     opt_cb_data.last_save_iter = opt->iter;
 
-    {
-        int n_gen = params.n_predict;
-        int sample_ctx = n_tokens - n_tokens/8;
-
-        // use defaults from common.h
-        sampler.params.top_k             = 40;
-        sampler.params.top_p             = 0.95f;
-        sampler.params.tfs_z             = 1.00f;
-        sampler.params.typical_p         = 1.00f;
-        sampler.params.temp              = 0.8f;
-        sampler.params.repeat_penalty    = 1.1f;
-        sampler.params.repeat_last_n     = 64;
-        sampler.params.frequency_penalty = 0.0f;
-        sampler.params.presence_penalty  = 0.0f;
-        sampler.params.mirostat          = 0;
-        sampler.params.mirostat_tau      = 5.00f;
-        sampler.params.mirostat_eta      = 0.10f;
-        init_sampler(&sampler, lctx);
-
-        printf("[Prediction context]\n");
-
-        struct ggml_tensor * tokens_input  = ggml_new_tensor_1d(lora.ctx, GGML_TYPE_I32, n_tokens);
-        struct ggml_tensor * target_logits = ggml_new_tensor_2d(lora.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
-        struct ggml_tensor * target_probs  = ggml_new_tensor_2d(lora.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
-
-        get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
-        for (int i=sample_ctx; i<n_tokens; ++i) {
-            ggml_set_i32_1d(tokens_input, i, n_vocab/2);
-        }
-
-        for (int i=0; i<n_gen; ++i) {
-            llama_token token = sample(lctx, &sampler,
-                (float *) ((char *) logits->data + (sample_ctx-1)*logits->nb[1]),
-                (llama_token *) tokens_input->data,
-                sample_ctx-1);
-            //int token = ggml_get_i32_1d(best_samples, sample_ctx-1);
-
-            // print_row(probs, sample_at);
-            print_token(lctx, token);
-
-            lshift_examples(tokens_input, target_logits, target_probs, 1);
-            ggml_set_i32_1d(tokens_input, 0, 0);
-            ggml_set_i32_1d(tokens_input, sample_ctx-1, token);
-
-            ggml_free(ctx0);
-        }
-    }
-
     if (alloc) {
         ggml_allocr_free(alloc);
     }

From 9a28bce29a1d491f717d84822b075689fdcd8d34 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Tue, 29 Aug 2023 00:56:44 +0200
Subject: [PATCH 106/235] reduce large memory overhead in train-text-from-scratch

all gradients had to be pinned so that graph_reset works correctly.
this is no longer necessary with the changes to ggml_compute_backward introduced in this PR.
--- .../train-text-from-scratch.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 6fe85d419618f..5ad17e6b8cc1d 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -793,15 +793,7 @@ struct ggml_tensor * llama_build_train_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); GGML_ASSERT(t36->grad->data == NULL && !ggml_is_view(t36->grad)); ggml_allocr_alloc(alloc, t36->grad); - // gradient tensors (will be set to zero by ggml_graph_reset) - // pinning these produces large unnecessary memory overhead, which will be resolved by PR 2632 - for (int i = 0; i < gf->n_nodes; ++i) { - if (!gf->grads[i]) continue; - if (gf->grads[i]->data == NULL && !ggml_is_view(gf->grads[i])) { - ggml_allocr_alloc(alloc, gf->grads[i]); - } - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, gf->grads[i], one)); - } + // allocating checkpoints in one block to reduce memory fragmentation // note: they will be freed in reverse order for (int i = 0; i < (int) checkpoints.size(); ++i) { From 49af7fbe126e3f2580e0809d2f02903342435aa0 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 00:57:28 +0200 Subject: [PATCH 107/235] add comment explaining why finetune checkpoints are allocated in one block --- examples/finetune/finetune.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 0e14c1ae3db9b..1f38d4b9865b1 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -932,6 +932,8 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one)); } + // allocating checkpoints in one block to reduce memory fragmentation + // note: they will be freed in reverse order for (int i = 0; i < checkpoints.size(); ++i) { if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { ggml_allocr_alloc(alloc, checkpoints[i]); From 007280c82fccd24cd7b5560c0094c02a3c32475c Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 01:00:58 +0200 Subject: [PATCH 108/235] make default value of float member a float literal --- .../train-text-from-scratch/train-text-from-scratch.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 5ad17e6b8cc1d..fcfe3a6f7b645 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -164,8 +164,8 @@ struct my_llama_hparams { uint32_t n_rot = 64; uint32_t n_ff = 11008; - // float f_norm_eps = 1e-5; // falcon - float f_norm_rms_eps = 1e-5; // llama + // float f_norm_eps = 1e-5f; // falcon + float f_norm_rms_eps = 1e-5f; // llama float rope_freq_base = 10000.0f; float rope_freq_scale = 1.0f; @@ -1572,7 +1572,7 @@ struct train_params get_default_train_params() { params.n_batch = 8; params.n_examples = 1; - params.f_norm_rms_eps = 1e-5; + params.f_norm_rms_eps = 1e-5f; params.rope_freq_base = 10000.0f; params.rope_freq_scale = 1.0f; From 1faee64db9ccb58c6320f9a29c99c8767e71958b Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 01:09:35 +0200 Subject: [PATCH 109/235] handle rms_norm and rope parameters the same as in 
train-text-from-scratch --- examples/finetune/finetune.cpp | 61 ++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 1f38d4b9865b1..08edd4bb9e504 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -176,8 +176,6 @@ struct my_llama_hparams { uint32_t n_layer = 32; uint32_t n_rot = 64; - float f_rms_norm_eps = 1e-5f; - bool operator!=(const my_llama_hparams& other) const { return memcmp(this, &other, sizeof(other)); } @@ -229,6 +227,12 @@ struct my_llama_lora_hparams { uint32_t n_rank_norm = 1; uint32_t n_rank_output = 4; + // float f_norm_eps = 1e-5f; // falcon + float f_norm_rms_eps = 1e-5f; // llama + + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; + bool operator!=(const my_llama_lora_hparams& other) const { return memcmp(this, &other, sizeof(other)); } @@ -769,8 +773,9 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( const int n_head = hparams.n_head; const int n_rot = hparams.n_rot; const int n_ff = hparams.n_ff; - const float rms_norm_eps = hparams.f_rms_norm_eps; - const int rope_mode = 0; + const float rms_norm_eps = lora->hparams.f_norm_rms_eps; + const float rope_freq_base = lora->hparams.rope_freq_base; + const float rope_freq_scale = lora->hparams.rope_freq_scale; GGML_ASSERT(n_layer == lora->layers.size()); @@ -781,6 +786,18 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( } }; + // rope has so many parameters that we make a custom function for it + auto rope = [ctx, n_rot, n_ctx, rope_freq_base, rope_freq_scale] + (struct ggml_tensor * t) -> struct ggml_tensor * { + // not capturing these, to silence warnings + const int n_past = 0; + const int rope_mode = 0; + + return ggml_rope_custom(ctx, + t, n_past, n_rot, rope_mode, n_ctx, + rope_freq_base, rope_freq_scale); + }; + set_name(tokens_input, "tokens_input"); set_name(targets, "targets"); @@ -834,10 +851,10 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); struct ggml_tensor * t05 = ggml_mul_mat (ctx, wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t07 = ggml_rope_inplace (ctx, t06, n_past, n_rot, rope_mode, n_ctx); set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); struct ggml_tensor * t08 = ggml_mul_mat (ctx, wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch); struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t10 = ggml_rope_inplace (ctx, t09, n_past, n_rot, rope_mode, n_ctx); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); struct ggml_tensor * t11; if (ggml_is_quantized(wv->type)) { @@ -1631,6 +1648,10 @@ struct train_params { int n_batch; int n_examples; + float f_norm_rms_eps; + float rope_freq_base; + float rope_freq_scale; + int32_t lora_r; int32_t
lora_alpha; @@ -1701,6 +1722,10 @@ struct train_params get_default_train_params() { params.n_batch = 8; params.n_examples = 1; + params.f_norm_rms_eps = 1e-5f; + params.rope_freq_base = 10000.0f; + params.rope_freq_scale = 1.0f; + params.lora_alpha = 4; params.lora_r = 4; @@ -1771,6 +1796,9 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); + fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); + fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); + fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); fprintf(stderr, " --lora-alpha N LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha); fprintf(stderr, " --lora-r N LORA r : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_r); fprintf(stderr, " --rank-att-norm N LORA rank for attention norm tensor (default %d)\n", params->n_rank_attention_norm); @@ -1910,6 +1938,24 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_examples = std::stoi(argv[i]); + } else if (arg == "--norm-rms-eps") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->f_norm_rms_eps = std::stof(argv[i]); + } else if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->rope_freq_base = std::stof(argv[i]); + } else if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->rope_freq_scale = std::stof(argv[i]); } else if (arg == "--lora-alpha") { if (++i >= argc) { invalid_param = true; @@ -2290,6 +2336,9 @@ int main(int argc, char ** argv) { init_model(lmodel, &model, params.n_ctx); struct my_llama_lora lora; + lora.hparams.f_norm_rms_eps = params.f_norm_rms_eps; + lora.hparams.rope_freq_base = params.rope_freq_base; + lora.hparams.rope_freq_scale = params.rope_freq_scale; lora.hparams.lora_r = params.lora_r; lora.hparams.lora_alpha = params.lora_alpha; lora.hparams.n_rank_attention_norm = params.n_rank_attention_norm; From a3b45298f11bf30f7c3b075a210ad36ed5e70190 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 01:12:51 +0200 Subject: [PATCH 110/235] remove unused code --- examples/finetune/finetune.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 08edd4bb9e504..0e8a816a17ecc 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -568,7 +568,6 @@ static size_t hash_find(void * hash_table[], void * p) { } static bool hash_insert(void * hash_table[], void * p) { - size_t h = hash(p); size_t i = hash_find(hash_table, p); GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full @@ -592,7 +591,6 @@ struct hash_map { void * keys[GGML_GRAPH_HASHTABLE_SIZE]; void * vals[GGML_GRAPH_HASHTABLE_SIZE]; }; -static const size_t HASH_MAP_SIZE = sizeof(struct hash_map); struct hash_map * new_hash_map() { struct hash_map * result = new struct hash_map; From ca97583f0b5c403ee09f9d34c15ebfb94441945d Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 01:19:45 +0200 Subject: [PATCH 111/235] remove vocab 
related code as it is unnecessary --- examples/finetune/finetune.cpp | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 0e8a816a17ecc..09cfc6bf3d937 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -154,19 +154,6 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc return tensor; } -struct llama_vocab { - using id = int32_t; - using token = std::string; - - struct token_score { - token tok; - float score; - }; - - std::unordered_map<token, id> token_to_id; - std::vector<token_score> id_to_token; -}; - struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; // this is provided as user input? @@ -2304,25 +2291,6 @@ int main(int argc, char ** argv) { struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_params); struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); - //struct llama_vocab vocab; - //{ - // std::vector<const char *> strings; - // std::vector<float> scores; - // int n_vocab = llama_n_vocab(lctx); - // strings.resize(n_vocab, NULL); - // scores.resize(n_vocab, 0); - // n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); - // GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); - // vocab.id_to_token.resize(n_vocab); - // for (int i=0; i<n_vocab; ++i) { - // std::string tok = std::string(strings[i]); - // float score = scores[i]; - // vocab.id_to_token[i].tok = tok; - // vocab.id_to_token[i].score = score; - // vocab.token_to_id.emplace(tok, i); - // } - //} printf("%s: tokenize training data\n", __func__); std::vector<llama_token> train_tokens; if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { From e030f7b2c5e68066c99f1d716c91146e659ead1c Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 01:27:28 +0200 Subject: [PATCH 112/235] add LLM_KV_TRAINING_TYPE to train-text-from-scratch checkpoints so that they can be differentiated from lora finetune checkpoints --- .../convert-train-checkpoint-to-gguf.py | 12 ++++++++---- .../train-text-from-scratch.cpp | 16 ++++++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index 01b3ee92a5a0c..773bf1262b268 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -44,10 +44,13 @@ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s" LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y" -LLM_KV_TRAINING_FILE_VERSION = "training.file_version" -LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" -LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" -LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" +LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model" +LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora" +LLM_KV_TRAINING_TYPE = "training.type" +LLM_KV_TRAINING_FILE_VERSION = "training.file_version" +LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" +LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" +LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" class Tensor: def __init__(self, dtype='f', ne=None): @@ -457,6 +460,7 @@ def save_gguf(self, gguf_writer): gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32) gguf_writer.add_layer_norm_rms_eps(1e-5) gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0) + gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL) gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its) gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples) gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT,
self.train_tokens) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index fcfe3a6f7b645..020440a62ec1d 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -246,10 +246,13 @@ const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.m const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; -const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; -const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; -const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; -const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; +const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; +const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; +const char * LLM_KV_TRAINING_TYPE = "training.type"; +const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; +const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; +const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; +const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; // gguf constants (sync with gguf.py) @@ -1431,6 +1434,10 @@ void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_gg GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); GGML_ASSERT(file_version == 0); + std::string train_type = LLM_KV_TRAINING_TYPE_TRAIN_MODEL; + GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); + GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_TRAIN_MODEL); + GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); @@ -1442,6 +1449,7 @@ void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_mode save_llama_model_gguf(fctx, fn_vocab_model, model); gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0); + gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL); gguf_set_val_u32(fctx, LLM_KV_TRAINING_ITERATION_COUNT, model->train_its); gguf_set_val_u32(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, model->train_samples); gguf_set_val_u32(fctx, LLM_KV_TRAINING_TOKEN_COUNT, model->train_tokens); From ecb1b20c857cf806d0fb539025599b58fcf7461a Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 01:40:02 +0200 Subject: [PATCH 113/235] add gguf constants and load/save functions from train-text-from-scratch --- examples/finetune/finetune.cpp | 322 ++++++++++++++++++++++++++++++++- 1 file changed, 313 insertions(+), 9 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 09cfc6bf3d937..3ac3cf5bab748 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -273,6 +273,77 @@ struct my_llama_lora { uint32_t train_tokens = 0; }; +// gguf constants +const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; +const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; +const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; +const char * LLM_KV_OPTIMIZER_FILE_VERSION = 
"optimizer.file_version"; +const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; +const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; +const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; +const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; +const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; +const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; +const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; +const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; +const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; +const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; + +const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; +const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; +const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; + +const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; + +const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; +const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; +const char * LLM_KV_TRAINING_TYPE = "training.type"; +const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; +const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; +const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; +const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; + +// gguf constants (sync with gguf.py) + +const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; +const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; + +const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; +const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; +const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; +const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; +const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; +const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; +const char * LLM_KV_ROPE_DIMENSION_COUNT = 
"%s.rope.dimension_count"; +const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp +const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; + +const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; +const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; +const char * LLM_TENSOR_OUTPUT = "output"; +const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; +const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; +const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; +const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; +const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; +const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; +const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; +const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; +const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; + void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %u\n", __func__, params->n_vocab); printf("%s: n_ctx: %u\n", __func__, params->n_ctx); @@ -1203,6 +1274,248 @@ void shuffle_ints(int * begin, int * end) { }); } +std::string replace_str(const char * s, const char * needle, const char * replacement) { + std::string str = s; + size_t pos = str.find(needle); + if (pos != std::string::npos) { + str.replace(pos, strlen(needle), replacement); + } + return str; +} + +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ + } \ +} + +bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { + GGML_ASSERT(a != NULL); + GGML_ASSERT(b != NULL); + GGML_ASSERT(a->type == b->type); + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); + + return true; +} + +void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { + if (dst == NULL) { + return; + } + struct ggml_tensor * t = ggml_get_tensor(ctx, name); + GGML_ASSERT(are_same_layout(dst, t)); + memcpy(dst->data, t->data, ggml_nbytes(t)); + + if (strlen(ggml_get_name(dst)) == 0) { + ggml_set_name(dst, name); + } +} + +void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { + // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read + + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); + GGML_ASSERT(file_version == 0); + + GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); + GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); + GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); + + uint64_t nx; + GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); + opt->nx = (size_t) nx; + + // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know + + std::string opt_type; + GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, 
LLM_KV_OPTIMIZER_TYPE); + if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { + opt->params.type = GGML_OPT_ADAM; + + GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); + GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); + + GGML_ASSERT(opt->ctx != NULL); + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { + opt->params.type = GGML_OPT_LBFGS; + + GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); + GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); + GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); + GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); + GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); + GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); + + GGML_ASSERT(opt->ctx != NULL); + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + read_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + read_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + read_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + read_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + read_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + read_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + read_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); + read_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); + read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); + read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); + } else { + throw std::runtime_error("unknown optimizer type\n"); + } +} + +void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); + gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); + gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); + gguf_set_val_f32(fctx, 
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); + + ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + if (opt->adam.pf) { + ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } + + gguf_add_tensor(fctx, opt->adam.m); + gguf_add_tensor(fctx, opt->adam.v); + if (opt->adam.pf) { + gguf_add_tensor(fctx, opt->adam.pf); + } + } break; + case GGML_OPT_LBFGS: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement); + + ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + if (opt->lbfgs.pf) { + ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + } + ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); + ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); + ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); + ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); + + gguf_add_tensor(fctx, opt->lbfgs.x); + gguf_add_tensor(fctx, opt->lbfgs.xp); + gguf_add_tensor(fctx, opt->lbfgs.g); + gguf_add_tensor(fctx, opt->lbfgs.gp); + gguf_add_tensor(fctx, opt->lbfgs.d); + if (opt->lbfgs.pf) { + gguf_add_tensor(fctx, opt->lbfgs.pf); + } + gguf_add_tensor(fctx, opt->lbfgs.lmal); + gguf_add_tensor(fctx, opt->lbfgs.lmys); + gguf_add_tensor(fctx, opt->lbfgs.lms); + gguf_add_tensor(fctx, opt->lbfgs.lmy); + } break; + } +} + +void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) { + // TODO +} + +void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) { + // TODO +} + +void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { + load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora); + + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); + GGML_ASSERT(file_version == 0); + + std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA; + GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); + GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA); + + GGUF_GET_KEY(fctx, lora->train_its, 
gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, lora->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, lora->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); + + load_opt_context_gguf(fctx, f_ggml_ctx, opt); +} + +void save_checkpoint_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { + save_llama_lora_gguf(fctx, model, lora); + + gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0); + gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_ITERATION_COUNT, lora->train_its); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, lora->train_samples); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_TOKEN_COUNT, lora->train_tokens); + + save_opt_context_gguf(fctx, opt); +} + +bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { + struct ggml_context * f_ggml_ctx; + struct gguf_init_params params; + params.no_alloc = false; + params.ctx = &f_ggml_ctx; + struct gguf_context * fctx = gguf_init_from_file(filename, params); + if (fctx == NULL) { + return false; + } + + load_checkpoint_gguf(fctx, f_ggml_ctx, model, lora, opt); + + return true; +} + +void save_checkpoint_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * pattern_it, int iteration, const char * latest) { + std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest); + std::string fn = replace_str(filename, pattern_it, sit.c_str()); + printf("%s: saving to %s\n", __func__, fn.c_str()); + struct gguf_context * fctx = gguf_init_empty(); + + save_checkpoint_gguf(fctx, model, lora, opt); + + // write file + const bool only_meta = false; + gguf_write_to_file(fctx, fn.c_str(), only_meta); + gguf_free(fctx); +} + void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { if (tensor == NULL) { file->write_u32(0); @@ -1373,15 +1686,6 @@ void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struc } } -std::string replace_str(const char * s, const char * needle, const char * replacement) { - std::string str = s; - size_t pos = str.find(needle); - if (pos != std::string::npos) { - str.replace(pos, strlen(needle), replacement); - } - return str; -} - void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename, const char * pattern_it, int iteration, const char * latest) { std::string sit = (iteration >= 0) ? 
std::to_string(iteration) : std::string(latest); std::string fn = replace_str(filename, pattern_it, sit.c_str()); From 0564f4ed1f95dc3a701300475293b0a75195667d Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 18:20:39 +0200 Subject: [PATCH 114/235] add load & save lora finetune checkpoints via gguf --- examples/finetune/finetune.cpp | 928 +++++++++++++++++++++------------ 1 file changed, 597 insertions(+), 331 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 3ac3cf5bab748..586e88f76283b 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -17,6 +17,40 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif + +uint32_t compute_data_checksum(struct ggml_tensor * tensor) { + const int n3 = (tensor->n_dims >= 3) ? tensor->ne[3] : 1; + const int n2 = (tensor->n_dims >= 2) ? tensor->ne[2] : 1; + const int n1 = (tensor->n_dims >= 1) ? tensor->ne[1] : 1; + const int n0 = (tensor->n_dims >= 0) ? tensor->ne[0] : 1; + const size_t nb0 = tensor->nb[0]; + const size_t nb1 = tensor->nb[1]; + const size_t nb2 = tensor->nb[2]; + const size_t nb3 = tensor->nb[3]; + const size_t nb = ggml_element_size(tensor); + uint32_t result = 0; + for (int i3 = 0; i3 < n3; ++i3) { + for (int i2 = 0; i2 < n2; ++i2) { + for (int i1 = 0; i1 < n1; ++i1) { + for (int i0 = 0; i0 < n0; ++i0) { + char * ptr = ((char *) tensor->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + uint32_t val; + memcpy(&val, ptr, nb); + result = result ^ val; + result = (((result << 1u) | ((result >> 31u) & 0x1u)) + 1u) & 0xffffffffu; + } + } + } + } + return result; +} + +void print_data_checksum(struct ggml_tensor * tensor) { + uint32_t chk = compute_data_checksum(tensor); + printf("%s: chk=[%08x] data=[%p] name=%s\n", __func__, chk, tensor->data, ggml_get_name(tensor)); +} + + struct random_normal_distribution { std::mt19937 gen; std::normal_distribution<float> rd; @@ -316,6 +350,19 @@ const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; +const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"; +const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"; +const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"; +const char * LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"; +const char * LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"; +const char * LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"; +const char * LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"; +const char * LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"; +const char * LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"; +const char * LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"; +const char * LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"; +const char * LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"; + // gguf constants (sync with gguf.py) const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; @@ -372,6 +419,19 @@ void print_lora_params(struct my_llama_lora_hparams * params) { void init_model(struct llama_model * input, struct my_llama_model * model, uint32_t n_ctx) { auto & hparams = model->hparams; + std::vector<char> tn_buf; + tn_buf.resize(GGML_MAX_NAME); + auto tn = [&tn_buf](const char * key) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); + return tn_buf.data(); + }; + auto tni = [&tn_buf](const char * key, int bid) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), key, bid); + std::string s = tn_buf.data(); + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); + return tn_buf.data(); + }; + hparams.n_vocab = llama_model_n_vocab(input); hparams.n_ctx = n_ctx; hparams.n_embd = llama_model_n_embd(input); @@ -385,43 +445,24 @@ void init_model(struct llama_model * input, struct my_llama_model * model, uint3 const uint32_t n_vocab = hparams.n_vocab; const uint32_t n_ff = hparams.n_ff; - model->tok_embeddings = llama_get_model_tensor(input, "tok_embeddings.weight"); - model->norm = llama_get_model_tensor(input, "norm.weight"); - model->output = llama_get_model_tensor(input, "output.weight"); + model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD)); + model->norm = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM)); + model->output = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT)); model->layers.resize(n_layer); - char name[GGML_MAX_NAME]; - for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - snprintf(name, GGML_MAX_NAME, "layers.%d.attention_norm.weight", i); - layer.attention_norm = llama_get_model_tensor(input, name); - - snprintf(name, GGML_MAX_NAME, "layers.%d.attention.wq.weight", i); - layer.wq = llama_get_model_tensor(input, name); - - snprintf(name, GGML_MAX_NAME, "layers.%d.attention.wk.weight", i); - layer.wk = llama_get_model_tensor(input, name); - - snprintf(name, GGML_MAX_NAME, "layers.%d.attention.wv.weight", i); - layer.wv = llama_get_model_tensor(input, name); - - snprintf(name, GGML_MAX_NAME, "layers.%d.attention.wo.weight", i); - layer.wo = llama_get_model_tensor(input, name); - - snprintf(name, GGML_MAX_NAME, "layers.%d.ffn_norm.weight", i); - layer.ffn_norm = llama_get_model_tensor(input, name); - - snprintf(name, GGML_MAX_NAME, "layers.%d.feed_forward.w1.weight", i); - layer.w1 = llama_get_model_tensor(input, name); - - snprintf(name, GGML_MAX_NAME, "layers.%d.feed_forward.w2.weight", i); - layer.w2 = llama_get_model_tensor(input, name); - - snprintf(name, GGML_MAX_NAME, "layers.%d.feed_forward.w3.weight", i); - layer.w3 = llama_get_model_tensor(input, name); + layer.attention_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_NORM, i)); + layer.wq = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_Q, i)); + layer.wk = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_K, i)); + layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i)); + layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i)); + layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i)); + layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); + layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); + layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); } } @@ -439,6 +480,19 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) lora->train_samples = 0; lora->train_tokens = 0; + std::vector<char> tn_buf; + tn_buf.resize(GGML_MAX_NAME); + auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix); + return tn_buf.data(); + }; + auto tni = [&tn_buf](const char * key, const char * suffix, int bid) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), key, bid); + std::string s =
tn_buf.data(); + snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix); + return tn_buf.data(); + }; + lora->tok_embeddings_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_embd); lora->tok_embeddings_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_vocab); lora->norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, n_embd); @@ -446,12 +500,12 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) lora->output_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_embd); lora->output_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_vocab); - ggml_set_name(lora->tok_embeddings_a, "tok_embeddings.weight.loraA"); - ggml_set_name(lora->tok_embeddings_b, "tok_embeddings.weight.loraB"); - ggml_set_name(lora->norm_a, "norm.weight.loraA"); - ggml_set_name(lora->norm_b, "norm.weight.loraB"); - ggml_set_name(lora->output_a, "output.weight.loraA"); - ggml_set_name(lora->output_b, "output.weight.loraB"); + ggml_set_name(lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_a")); + ggml_set_name(lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_b")); + ggml_set_name(lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_a")); + ggml_set_name(lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_b")); + ggml_set_name(lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.lora_a")); + ggml_set_name(lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.lora_b")); lora->layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { @@ -481,27 +535,24 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd); layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff); - ggml_format_name(layer.attention_norm_a, "%s.attention_norm.weight.loraA", layers_i.c_str()); - ggml_format_name(layer.attention_norm_b, "%s.attention_norm.weight.loraB", layers_i.c_str()); - - ggml_format_name(layer.wq_a, "%s.attention.wq.weight.loraA", layers_i.c_str()); - ggml_format_name(layer.wq_b, "%s.attention.wq.weight.loraB", layers_i.c_str()); - ggml_format_name(layer.wk_a, "%s.attention.wk.weight.loraA", layers_i.c_str()); - ggml_format_name(layer.wk_b, "%s.attention.wk.weight.loraB", layers_i.c_str()); - ggml_format_name(layer.wv_a, "%s.attention.wv.weight.loraA", layers_i.c_str()); - ggml_format_name(layer.wv_b, "%s.attention.wv.weight.loraB", layers_i.c_str()); - ggml_format_name(layer.wo_a, "%s.attention.wo.weight.loraA", layers_i.c_str()); - ggml_format_name(layer.wo_b, "%s.attention.wo.weight.loraB", layers_i.c_str()); - - ggml_format_name(layer.ffn_norm_a, "%s.ffn_norm.weight.loraA", layers_i.c_str()); - ggml_format_name(layer.ffn_norm_b, "%s.ffn_norm.weight.loraB", layers_i.c_str()); - - ggml_format_name(layer.w1_a, "%s.feed_forward.w1.weight.loraA", layers_i.c_str()); - ggml_format_name(layer.w1_b, "%s.feed_forward.w1.weight.loraB", layers_i.c_str()); - ggml_format_name(layer.w2_a, "%s.feed_forward.w2.weight.loraA", layers_i.c_str()); - ggml_format_name(layer.w2_b, "%s.feed_forward.w2.weight.loraB", layers_i.c_str()); - ggml_format_name(layer.w3_a, "%s.feed_forward.w3.weight.loraA", layers_i.c_str()); - ggml_format_name(layer.w3_b, "%s.feed_forward.w3.weight.loraB", layers_i.c_str()); + ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i)); + ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, 
".weight.lora_b", i)); + ggml_set_name(layer.wq_a, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_a", i)); + ggml_set_name(layer.wq_b, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_b", i)); + ggml_set_name(layer.wk_a, tni(LLM_TENSOR_ATTN_K, ".weight.lora_a", i)); + ggml_set_name(layer.wk_b, tni(LLM_TENSOR_ATTN_K, ".weight.lora_b", i)); + ggml_set_name(layer.wv_a, tni(LLM_TENSOR_ATTN_V, ".weight.lora_a", i)); + ggml_set_name(layer.wv_b, tni(LLM_TENSOR_ATTN_V, ".weight.lora_b", i)); + ggml_set_name(layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_a", i)); + ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i)); + ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); + ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); + ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); + ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); + ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); + ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); } } @@ -1450,14 +1501,182 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * } void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) { - // TODO + // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read + + std::string arch; + + std::vector keybuf; + keybuf.resize(512); + auto kv = [&arch, &keybuf](const char * key) -> const char * { + snprintf(keybuf.data(), keybuf.size(), key, arch.c_str()); + return keybuf.data(); + }; + + GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); + GGML_ASSERT(arch == "llama"); + + uint32_t ftype_u; + GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE); + GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); + + // n_ctx was not saved in earlier checkpoint file version, so we make it optional here + GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); + + GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); + GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); + GGUF_GET_KEY(fctx, model->hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); + GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); + + model->hparams.n_rot = model->hparams.n_embd / model->hparams.n_head; + GGUF_GET_KEY(fctx, model->hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); + + float rope_freq_scale = 1.0f; + GGUF_GET_KEY(fctx, lora->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + GGUF_GET_KEY(fctx, lora->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + if (rope_freq_scale != 1.0f) { + lora->hparams.rope_freq_scale = 1.0f / rope_freq_scale; + } + + 
GGUF_GET_KEY(fctx, lora->hparams.n_rank_tok_embeddings, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_output, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_attention_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_wq, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_Q); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_wk, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_K); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); + + init_lora(model, lora); + + read_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a)); + read_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b)); + read_tensor_by_name(lora->norm_a, f_ggml_ctx, ggml_get_name(lora->norm_a)); + read_tensor_by_name(lora->norm_b, f_ggml_ctx, ggml_get_name(lora->norm_b)); + read_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a)); + read_tensor_by_name(lora->output_b, f_ggml_ctx, ggml_get_name(lora->output_b)); + + print_data_checksum(lora->tok_embeddings_a); + print_data_checksum(lora->tok_embeddings_b); + print_data_checksum(lora->norm_a); + print_data_checksum(lora->norm_b); + print_data_checksum(lora->output_a); + print_data_checksum(lora->output_b); + + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + read_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a)); + read_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b)); + read_tensor_by_name(layer.wq_a, f_ggml_ctx, ggml_get_name(layer.wq_a)); + read_tensor_by_name(layer.wq_b, f_ggml_ctx, ggml_get_name(layer.wq_b)); + read_tensor_by_name(layer.wk_a, f_ggml_ctx, ggml_get_name(layer.wk_a)); + read_tensor_by_name(layer.wk_b, f_ggml_ctx, ggml_get_name(layer.wk_b)); + read_tensor_by_name(layer.wv_a, f_ggml_ctx, ggml_get_name(layer.wv_a)); + read_tensor_by_name(layer.wv_b, f_ggml_ctx, ggml_get_name(layer.wv_b)); + read_tensor_by_name(layer.wo_a, f_ggml_ctx, ggml_get_name(layer.wo_a)); + read_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); + read_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); + read_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); + read_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a)); + read_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b)); + 
read_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a)); + read_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b)); + read_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a)); + read_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b)); + + print_data_checksum(layer.attention_norm_a); + print_data_checksum(layer.attention_norm_b); + print_data_checksum(layer.wq_a); + print_data_checksum(layer.wq_b); + print_data_checksum(layer.wk_a); + print_data_checksum(layer.wk_b); + print_data_checksum(layer.wv_a); + print_data_checksum(layer.wv_b); + print_data_checksum(layer.wo_a); + print_data_checksum(layer.wo_b); + print_data_checksum(layer.ffn_norm_a); + print_data_checksum(layer.ffn_norm_b); + print_data_checksum(layer.w1_a); + print_data_checksum(layer.w1_b); + print_data_checksum(layer.w2_a); + print_data_checksum(layer.w2_b); + print_data_checksum(layer.w3_a); + print_data_checksum(layer.w3_b); + } } void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) { - // TODO -} + const char * arch = "llama"; + enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; + + std::vector<char> keybuf; + keybuf.resize(512); + auto kv = [arch, &keybuf](const char * key) -> const char * { + snprintf(keybuf.data(), keybuf.size(), key, arch); + return keybuf.data(); + }; -void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { + gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); + gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); + + gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx); + gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd); + gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff); + gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head); + gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer); + gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_rot); + gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), lora->hparams.f_norm_rms_eps); + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), lora->hparams.rope_freq_base); + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), lora->hparams.rope_freq_scale); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, lora->hparams.n_rank_tok_embeddings); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, lora->hparams.n_rank_norm); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT, lora->hparams.n_rank_output); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, lora->hparams.n_rank_attention_norm); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_Q, lora->hparams.n_rank_wq); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_K, lora->hparams.n_rank_wk); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3); + + gguf_add_tensor(fctx, lora->tok_embeddings_a); +
gguf_add_tensor(fctx, lora->tok_embeddings_b); + gguf_add_tensor(fctx, lora->norm_a); + gguf_add_tensor(fctx, lora->norm_b); + gguf_add_tensor(fctx, lora->output_a); + gguf_add_tensor(fctx, lora->output_b); + + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + + gguf_add_tensor(fctx, layer.attention_norm_a); + gguf_add_tensor(fctx, layer.attention_norm_b); + gguf_add_tensor(fctx, layer.wq_a); + gguf_add_tensor(fctx, layer.wq_b); + gguf_add_tensor(fctx, layer.wk_a); + gguf_add_tensor(fctx, layer.wk_b); + gguf_add_tensor(fctx, layer.wv_a); + gguf_add_tensor(fctx, layer.wv_b); + gguf_add_tensor(fctx, layer.wo_a); + gguf_add_tensor(fctx, layer.wo_b); + gguf_add_tensor(fctx, layer.ffn_norm_a); + gguf_add_tensor(fctx, layer.ffn_norm_b); + gguf_add_tensor(fctx, layer.w1_a); + gguf_add_tensor(fctx, layer.w1_b); + gguf_add_tensor(fctx, layer.w2_a); + gguf_add_tensor(fctx, layer.w2_b); + gguf_add_tensor(fctx, layer.w3_a); + gguf_add_tensor(fctx, layer.w3_b); + } +} + +void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora); uint32_t file_version; @@ -1475,7 +1694,7 @@ void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_gg load_opt_context_gguf(fctx, f_ggml_ctx, opt); } -void save_checkpoint_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { +void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { save_llama_lora_gguf(fctx, model, lora); gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0); @@ -1487,7 +1706,7 @@ void save_checkpoint_gguf(struct gguf_context * fctx, struct my_llama_model * mo save_opt_context_gguf(fctx, opt); } -bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { +bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { struct ggml_context * f_ggml_ctx; struct gguf_init_params params; params.no_alloc = false; @@ -1497,18 +1716,18 @@ bool load_checkpoint_file(const char * filename, struct my_llama_model * model, return false; } - load_checkpoint_gguf(fctx, f_ggml_ctx, model, lora, opt); + load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, opt); return true; } -void save_checkpoint_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * pattern_it, int iteration, const char * latest) { +void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * pattern_it, int iteration, const char * latest) { std::string sit = (iteration >= 0) ? 
std::to_string(iteration) : std::string(latest); std::string fn = replace_str(filename, pattern_it, sit.c_str()); printf("%s: saving to %s\n", __func__, fn.c_str()); struct gguf_context * fctx = gguf_init_empty(); - save_checkpoint_gguf(fctx, model, lora, opt); + save_checkpoint_lora_gguf(fctx, model, lora, opt); // write file const bool only_meta = false; gguf_write_to_file(fctx, fn.c_str(), only_meta); gguf_free(fctx); } -void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { +void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) { if (tensor == NULL) { file->write_u32(0); file->write_u32(0); @@ -1524,7 +1743,9 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { file->seek((0-file->tell()) & 31, SEEK_CUR); return; } - const char * name = ggml_get_name(tensor); + if (name == NULL) { + name = ggml_get_name(tensor); + } uint32_t name_len = strlen(name); uint32_t nd = tensor->n_dims; uint32_t ne[4] = { (uint32_t)tensor->ne[0], @@ -1540,45 +1761,26 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { file->write_raw(tensor->data, ggml_nbytes(tensor)); } -void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - int32_t nd = file->read_u32(); - GGML_ASSERT(nd == tensor->n_dims); - - uint32_t name_len = file->read_u32(); - enum ggml_type type = (enum ggml_type) file->read_u32(); - GGML_ASSERT(type == tensor->type); - - uint32_t ne[4]; - file->read_raw(ne, sizeof(ne[0]) * nd); - for (int i=0; i<nd; ++i) { - GGML_ASSERT(ne[i] == tensor->ne[i]); - } - - std::string name = file->read_string(name_len); - GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); - - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->read_raw(tensor->data, ggml_nbytes(tensor)); -} - -void skip_tensor(struct llama_file * file) { - int32_t nd = file->read_u32(); - - uint32_t name_len = file->read_u32(); - enum ggml_type type = (enum ggml_type) file->read_u32(); - - uint32_t ne[4] = { 1, 1, 1, 1 }; - - file->read_raw(ne, sizeof(ne[0]) * nd); - - std::string name = file->read_string(name_len); - - file->seek(-file->tell() & 31, SEEK_CUR); - - size_t nelements = ne[0]*ne[1]*ne[2]*ne[3]; - size_t nbytes = nelements*ggml_type_size(type)/ggml_blck_size(type); - file->seek(nbytes, SEEK_CUR); -} +// void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { +// int32_t nd = file->read_u32(); +// GGML_ASSERT(nd == tensor->n_dims); +// uint32_t name_len = file->read_u32(); +// enum ggml_type type = (enum ggml_type) file->read_u32(); +// GGML_ASSERT(type == tensor->type); +// uint32_t ne[4]; +// file->read_raw(ne, sizeof(ne[0]) * nd); +// for (int i=0; i<nd; ++i) { +// GGML_ASSERT(ne[i] == tensor->ne[i]); +// } +// std::string name = file->read_string(name_len); +// GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); +// file->seek((0-file->tell()) & 31, SEEK_CUR); +// file->read_raw(tensor->data, ggml_nbytes(tensor)); +// } void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { const uint32_t version = 1; @@ -1595,9 +1797,9 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { GGML_ASSERT(opt->adam.m != NULL); GGML_ASSERT(opt->adam.v != NULL); - write_tensor(file, opt->adam.m); - write_tensor(file, opt->adam.v); - write_tensor(file, opt->adam.pf); + write_tensor(file, opt->adam.m, NULL); + write_tensor(file, opt->adam.v, NULL); + write_tensor(file, opt->adam.pf, NULL); file->write_raw(&opt->adam.fx_best,
sizeof(opt->adam.fx_best)); file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); @@ -1605,16 +1807,16 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) case GGML_OPT_LBFGS: { GGML_ASSERT(opt->lbfgs.x != NULL); - write_tensor(file, opt->lbfgs.x); - write_tensor(file, opt->lbfgs.xp); - write_tensor(file, opt->lbfgs.g); - write_tensor(file, opt->lbfgs.gp); - write_tensor(file, opt->lbfgs.d); - write_tensor(file, opt->lbfgs.pf); - write_tensor(file, opt->lbfgs.lmal); - write_tensor(file, opt->lbfgs.lmys); - write_tensor(file, opt->lbfgs.lms); - write_tensor(file, opt->lbfgs.lmy); + write_tensor(file, opt->lbfgs.x, NULL); + write_tensor(file, opt->lbfgs.xp, NULL); + write_tensor(file, opt->lbfgs.g, NULL); + write_tensor(file, opt->lbfgs.gp, NULL); + write_tensor(file, opt->lbfgs.d, NULL); + write_tensor(file, opt->lbfgs.pf, NULL); + write_tensor(file, opt->lbfgs.lmal, NULL); + write_tensor(file, opt->lbfgs.lmys, NULL); + write_tensor(file, opt->lbfgs.lms, NULL); + write_tensor(file, opt->lbfgs.lmy, NULL); file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); @@ -1625,66 +1827,66 @@ void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) } } -void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - opt->params.past = (int) file->read_u32(); - opt->params.lbfgs.m = (int) file->read_u32(); - file->read_raw(&opt->nx, sizeof(opt->nx)); - ggml_opt_init(ctx, opt, opt->params, opt->nx); - - file->read_raw(&opt->iter, sizeof(opt->iter)); - opt->just_initialized = (bool) file->read_u32(); - - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - read_tensor(file, opt->adam.m); - read_tensor(file, opt->adam.v); - if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } - file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; - case GGML_OPT_LBFGS: - { - GGML_ASSERT(opt->lbfgs.x != NULL); - read_tensor(file, opt->lbfgs.x); - read_tensor(file, opt->lbfgs.xp); - read_tensor(file, opt->lbfgs.g); - read_tensor(file, opt->lbfgs.gp); - read_tensor(file, opt->lbfgs.d); - if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } - read_tensor(file, opt->lbfgs.lmal); - read_tensor(file, opt->lbfgs.lmys); - read_tensor(file, opt->lbfgs.lms); - read_tensor(file, opt->lbfgs.lmy); - file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} - -void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - uint32_t version = file->read_u32(); - printf("%s: opt context version %u\n", __func__, version); - switch (version) { - case 0: - { - GGML_ASSERT(false); // not supported in finetune - } break; - case 1: - { - read_opt_context_v1(file, ctx, opt); - } break; - default: - { - fprintf(stderr, "%s: unknown version %u\n", __func__, version); - } - } -} 
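
A side note on the `(0-file->tell()) & 31` seeks that recur in the tensor I/O above: tensor records in this binary checkpoint format are padded to 32-byte alignment, and that expression yields exactly the number of padding bytes to skip. A minimal standalone check of the identity (an illustration, not part of the patch; assumes only the C standard library):

```c
#include <assert.h>
#include <stddef.h>

int main(void) {
    for (size_t pos = 0; pos < 256; ++pos) {
        // same expression as in write_tensor/read_tensor above; with an
        // unsigned offset, (0 - pos) & 31 is the distance to the next
        // multiple of 32, and 0 when pos is already aligned
        size_t pad = (0 - pos) & 31;
        assert(pad < 32 && (pos + pad) % 32 == 0);
    }
    return 0;
}
```

The removed `skip_tensor` spelled the same computation as `-file->tell() & 31`; since `tell()` returns an unsigned offset, both forms agree.
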
+// void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { +// opt->params.past = (int) file->read_u32(); +// opt->params.lbfgs.m = (int) file->read_u32(); +// file->read_raw(&opt->nx, sizeof(opt->nx)); +// ggml_opt_init(ctx, opt, opt->params, opt->nx); + +// file->read_raw(&opt->iter, sizeof(opt->iter)); +// opt->just_initialized = (bool) file->read_u32(); + +// switch (opt->params.type) { +// case GGML_OPT_ADAM: +// { +// read_tensor(file, opt->adam.m); +// read_tensor(file, opt->adam.v); +// if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } +// file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); +// file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); +// file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); +// } break; +// case GGML_OPT_LBFGS: +// { +// GGML_ASSERT(opt->lbfgs.x != NULL); +// read_tensor(file, opt->lbfgs.x); +// read_tensor(file, opt->lbfgs.xp); +// read_tensor(file, opt->lbfgs.g); +// read_tensor(file, opt->lbfgs.gp); +// read_tensor(file, opt->lbfgs.d); +// if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } +// read_tensor(file, opt->lbfgs.lmal); +// read_tensor(file, opt->lbfgs.lmys); +// read_tensor(file, opt->lbfgs.lms); +// read_tensor(file, opt->lbfgs.lmy); +// file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); +// file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); +// file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); +// file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); +// file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); +// file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); +// } break; +// } +// } + +// void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { +// uint32_t version = file->read_u32(); +// printf("%s: opt context version %u\n", __func__, version); +// switch (version) { +// case 0: +// { +// GGML_ASSERT(false); // not supported in finetune +// } break; +// case 1: +// { +// read_opt_context_v1(file, ctx, opt); +// } break; +// default: +// { +// fprintf(stderr, "%s: unknown version %u\n", __func__, version); +// } +// } +// } void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename, const char * pattern_it, int iteration, const char * latest) { std::string sit = (iteration >= 0) ? 
std::to_string(iteration) : std::string(latest);
@@ -1695,6 +1897,14 @@ void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora,
         return;
     }

+    std::vector<char> tn_buf;
+    tn_buf.resize(GGML_MAX_NAME);
+
+    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
+        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
+        return tn_buf.data();
+    };
+
     const uint32_t magic   = 'ggcl';
     const uint32_t version = 0;

@@ -1705,7 +1915,7 @@ void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora,
     file.write_u32(lora->train_tokens);
     file.write_u32(model->hparams.n_vocab);
     file.write_u32(model->hparams.n_embd);
-    //file.write_u32(model->hparams.n_mult);
+    file.write_u32(4 /*model->hparams.n_mult*/);
     file.write_u32(model->hparams.n_head);
     file.write_u32(model->hparams.n_layer);
     file.write_u32(model->hparams.n_rot);
@@ -1722,137 +1932,163 @@ void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora,
     file.write_u32(lora->hparams.n_rank_norm);
     file.write_u32(lora->hparams.n_rank_output);

-    write_tensor(&file, lora->tok_embeddings_a);
-    write_tensor(&file, lora->tok_embeddings_b);
-    write_tensor(&file, lora->norm_a);
-    write_tensor(&file, lora->norm_b);
-    write_tensor(&file, lora->output_a);
-    write_tensor(&file, lora->output_b);
+    write_tensor(&file, lora->tok_embeddings_a, "tok_embeddings.weight.loraA");
+    write_tensor(&file, lora->tok_embeddings_b, "tok_embeddings.weight.loraB");
+    write_tensor(&file, lora->norm_a, "norm.weight.loraA");
+    write_tensor(&file, lora->norm_b, "norm.weight.loraB");
+    write_tensor(&file, lora->output_a, "output.weight.loraA");
+    write_tensor(&file, lora->output_b, "output.weight.loraB");
+
+    print_data_checksum(lora->tok_embeddings_a);
+    print_data_checksum(lora->tok_embeddings_b);
+    print_data_checksum(lora->norm_a);
+    print_data_checksum(lora->norm_b);
+    print_data_checksum(lora->output_a);
+    print_data_checksum(lora->output_b);

     for (uint32_t i = 0; i < lora->layers.size(); ++i) {
         auto & layer = lora->layers[i];

-        write_tensor(&file, layer.attention_norm_a);
-        write_tensor(&file, layer.attention_norm_b);
-        write_tensor(&file, layer.wq_a);
-        write_tensor(&file, layer.wq_b);
-        write_tensor(&file, layer.wk_a);
-        write_tensor(&file, layer.wk_b);
-        write_tensor(&file, layer.wv_a);
-        write_tensor(&file, layer.wv_b);
-        write_tensor(&file, layer.wo_a);
-        write_tensor(&file, layer.wo_b);
-        write_tensor(&file, layer.ffn_norm_a);
-        write_tensor(&file, layer.ffn_norm_b);
-        write_tensor(&file, layer.w1_a);
-        write_tensor(&file, layer.w1_b);
-        write_tensor(&file, layer.w2_a);
-        write_tensor(&file, layer.w2_b);
-        write_tensor(&file, layer.w3_a);
-        write_tensor(&file, layer.w3_b);
+        write_tensor(&file, layer.attention_norm_a, tni("%d.attention_norm.weight.loraA", i));
+        write_tensor(&file, layer.attention_norm_b, tni("%d.attention_norm.weight.loraB", i));
+        write_tensor(&file, layer.wq_a, tni("%d.attention.wq.weight.loraA", i));
+        write_tensor(&file, layer.wq_b, tni("%d.attention.wq.weight.loraB", i));
+        write_tensor(&file, layer.wk_a, tni("%d.attention.wk.weight.loraA", i));
+        write_tensor(&file, layer.wk_b, tni("%d.attention.wk.weight.loraB", i));
+        write_tensor(&file, layer.wv_a, tni("%d.attention.wv.weight.loraA", i));
+        write_tensor(&file, layer.wv_b, tni("%d.attention.wv.weight.loraB", i));
+        write_tensor(&file, layer.wo_a, tni("%d.attention.wo.weight.loraA", i));
+        write_tensor(&file, layer.wo_b, tni("%d.attention.wo.weight.loraB", i));
+        write_tensor(&file, layer.ffn_norm_a, tni("%d.ffn_norm.weight.loraA", i));
+        
write_tensor(&file, layer.ffn_norm_b, tni("%d.ffn_norm.weight.loraB", i)); + write_tensor(&file, layer.w1_a, tni("%d.feed_forward.w1.weight.loraA", i)); + write_tensor(&file, layer.w1_b, tni("%d.feed_forward.w1.weight.loraB", i)); + write_tensor(&file, layer.w2_a, tni("%d.feed_forward.w2.weight.loraA", i)); + write_tensor(&file, layer.w2_b, tni("%d.feed_forward.w2.weight.loraB", i)); + write_tensor(&file, layer.w3_a, tni("%d.feed_forward.w3.weight.loraA", i)); + write_tensor(&file, layer.w3_b, tni("%d.feed_forward.w3.weight.loraB", i)); + + print_data_checksum(layer.attention_norm_a); + print_data_checksum(layer.attention_norm_b); + print_data_checksum(layer.wq_a); + print_data_checksum(layer.wq_b); + print_data_checksum(layer.wk_a); + print_data_checksum(layer.wk_b); + print_data_checksum(layer.wv_a); + print_data_checksum(layer.wv_b); + print_data_checksum(layer.wo_a); + print_data_checksum(layer.wo_b); + print_data_checksum(layer.ffn_norm_a); + print_data_checksum(layer.ffn_norm_b); + print_data_checksum(layer.w1_a); + print_data_checksum(layer.w1_b); + print_data_checksum(layer.w2_a); + print_data_checksum(layer.w2_b); + print_data_checksum(layer.w3_a); + print_data_checksum(layer.w3_b); } write_opt_context(&file, opt); } -bool load_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename, bool init) { - struct llama_file file(filename, "rb"); - - uint32_t magic; - uint32_t version; - - uint32_t train_its = 0; - uint32_t train_samples = 0; - uint32_t train_tokens = 0; - - if (file.fp) { - printf("%s: Loading model from '%s'.\n", __func__, filename); - magic = file.read_u32(); - GGML_ASSERT(magic == 'ggcl'); - version = file.read_u32(); - GGML_ASSERT(version == 0); - train_its = file.read_u32(); - train_samples = file.read_u32(); - train_tokens = file.read_u32(); - uint32_t n_vocab = file.read_u32(); - uint32_t n_embd = file.read_u32(); - uint32_t n_mult = file.read_u32(); - uint32_t n_head = file.read_u32(); - uint32_t n_layer = file.read_u32(); - uint32_t n_rot = file.read_u32(); - GGML_ASSERT(n_vocab == model->hparams.n_vocab); - GGML_ASSERT(n_embd == model->hparams.n_embd); - //GGML_ASSERT(n_mult == model->hparams.n_mult); - GGML_ASSERT(n_head == model->hparams.n_head); - GGML_ASSERT(n_layer == model->hparams.n_layer); - GGML_ASSERT(n_rot == model->hparams.n_rot); - lora->hparams.n_rank_attention_norm = file.read_u32(); - lora->hparams.n_rank_wq = file.read_u32(); - lora->hparams.n_rank_wk = file.read_u32(); - lora->hparams.n_rank_wv = file.read_u32(); - lora->hparams.n_rank_wo = file.read_u32(); - lora->hparams.n_rank_ffn_norm = file.read_u32(); - lora->hparams.n_rank_w1 = file.read_u32(); - lora->hparams.n_rank_w2 = file.read_u32(); - lora->hparams.n_rank_w3 = file.read_u32(); - lora->hparams.n_rank_tok_embeddings = file.read_u32(); - lora->hparams.n_rank_norm = file.read_u32(); - lora->hparams.n_rank_output = file.read_u32(); - - print_params(&model->hparams); - print_lora_params(&lora->hparams); - } - - if (init) { - init_lora(model, lora); - } - - if (file.fp) { - lora->train_its = train_its; - lora->train_samples = train_samples; - lora->train_tokens = train_tokens; - } - - printf("%s: Training iterations: %u.\n", __func__, lora->train_its); - printf("%s: Training samples: %u.\n", __func__, lora->train_samples); - printf("%s: Training tokens: %u.\n", __func__, lora->train_tokens); - - if (file.fp) { - read_tensor(&file, lora->tok_embeddings_a); - read_tensor(&file, lora->tok_embeddings_b); - read_tensor(&file, 
lora->norm_a); - read_tensor(&file, lora->norm_b); - read_tensor(&file, lora->output_a); - read_tensor(&file, lora->output_b); - - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - - read_tensor(&file, layer.attention_norm_a); - read_tensor(&file, layer.attention_norm_b); - read_tensor(&file, layer.wq_a); - read_tensor(&file, layer.wq_b); - read_tensor(&file, layer.wk_a); - read_tensor(&file, layer.wk_b); - read_tensor(&file, layer.wv_a); - read_tensor(&file, layer.wv_b); - read_tensor(&file, layer.wo_a); - read_tensor(&file, layer.wo_b); - read_tensor(&file, layer.ffn_norm_a); - read_tensor(&file, layer.ffn_norm_b); - read_tensor(&file, layer.w1_a); - read_tensor(&file, layer.w1_b); - read_tensor(&file, layer.w2_a); - read_tensor(&file, layer.w2_b); - read_tensor(&file, layer.w3_a); - read_tensor(&file, layer.w3_b); - } - - read_opt_context(&file, lora->ctx, opt); - } - - return (file.fp != NULL); -} +// bool load_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename, bool init) { +// struct llama_file file(filename, "rb"); + +// uint32_t magic; +// uint32_t version; + +// uint32_t train_its = 0; +// uint32_t train_samples = 0; +// uint32_t train_tokens = 0; + +// if (file.fp) { +// printf("%s: Loading model from '%s'.\n", __func__, filename); +// magic = file.read_u32(); +// GGML_ASSERT(magic == 'ggcl'); +// version = file.read_u32(); +// GGML_ASSERT(version == 0); +// train_its = file.read_u32(); +// train_samples = file.read_u32(); +// train_tokens = file.read_u32(); +// uint32_t n_vocab = file.read_u32(); +// uint32_t n_embd = file.read_u32(); +// uint32_t n_mult = file.read_u32(); +// uint32_t n_head = file.read_u32(); +// uint32_t n_layer = file.read_u32(); +// uint32_t n_rot = file.read_u32(); +// GGML_ASSERT(n_vocab == model->hparams.n_vocab); +// GGML_ASSERT(n_embd == model->hparams.n_embd); +// //GGML_ASSERT(n_mult == model->hparams.n_mult); +// GGML_ASSERT(n_head == model->hparams.n_head); +// GGML_ASSERT(n_layer == model->hparams.n_layer); +// GGML_ASSERT(n_rot == model->hparams.n_rot); +// lora->hparams.n_rank_attention_norm = file.read_u32(); +// lora->hparams.n_rank_wq = file.read_u32(); +// lora->hparams.n_rank_wk = file.read_u32(); +// lora->hparams.n_rank_wv = file.read_u32(); +// lora->hparams.n_rank_wo = file.read_u32(); +// lora->hparams.n_rank_ffn_norm = file.read_u32(); +// lora->hparams.n_rank_w1 = file.read_u32(); +// lora->hparams.n_rank_w2 = file.read_u32(); +// lora->hparams.n_rank_w3 = file.read_u32(); +// lora->hparams.n_rank_tok_embeddings = file.read_u32(); +// lora->hparams.n_rank_norm = file.read_u32(); +// lora->hparams.n_rank_output = file.read_u32(); + +// print_params(&model->hparams); +// print_lora_params(&lora->hparams); +// } + +// if (init) { +// init_lora(model, lora); +// } + +// if (file.fp) { +// lora->train_its = train_its; +// lora->train_samples = train_samples; +// lora->train_tokens = train_tokens; +// } + +// printf("%s: Training iterations: %u.\n", __func__, lora->train_its); +// printf("%s: Training samples: %u.\n", __func__, lora->train_samples); +// printf("%s: Training tokens: %u.\n", __func__, lora->train_tokens); + +// if (file.fp) { +// read_tensor(&file, lora->tok_embeddings_a); +// read_tensor(&file, lora->tok_embeddings_b); +// read_tensor(&file, lora->norm_a); +// read_tensor(&file, lora->norm_b); +// read_tensor(&file, lora->output_a); +// read_tensor(&file, lora->output_b); + +// for (uint32_t i = 0; i < 
lora->layers.size(); ++i) {
+//         auto & layer = lora->layers[i];

+//         read_tensor(&file, layer.attention_norm_a);
+//         read_tensor(&file, layer.attention_norm_b);
+//         read_tensor(&file, layer.wq_a);
+//         read_tensor(&file, layer.wq_b);
+//         read_tensor(&file, layer.wk_a);
+//         read_tensor(&file, layer.wk_b);
+//         read_tensor(&file, layer.wv_a);
+//         read_tensor(&file, layer.wv_b);
+//         read_tensor(&file, layer.wo_a);
+//         read_tensor(&file, layer.wo_b);
+//         read_tensor(&file, layer.ffn_norm_a);
+//         read_tensor(&file, layer.ffn_norm_b);
+//         read_tensor(&file, layer.w1_a);
+//         read_tensor(&file, layer.w1_b);
+//         read_tensor(&file, layer.w2_a);
+//         read_tensor(&file, layer.w2_b);
+//         read_tensor(&file, layer.w3_a);
+//         read_tensor(&file, layer.w3_b);
+//     }

+//     read_opt_context(&file, lora->ctx, opt);
+// }

+// return (file.fp != NULL);
+// }

 void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, const char * pattern_it, int iteration, const char * latest) {
     std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest);
@@ -1863,6 +2099,21 @@ void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, cons
         return;
     }

+    std::vector<char> tn_buf;
+    tn_buf.resize(GGML_MAX_NAME);
+
+    auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * {
+        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix);
+        return tn_buf.data();
+    };
+
+    auto tni = [&tn_buf](const char * key, int bid, const char * suffix) -> const char * {
+        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
+        std::string s = tn_buf.data();
+        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix);
+        return tn_buf.data();
+    };
+
     uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
     // write_magic
     file.write_u32(LLAMA_FILE_MAGIC_LORA);   // magic
@@ -1871,32 +2122,32 @@ void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, cons
     file.write_u32(lora->hparams.lora_r);
     file.write_u32(lora->hparams.lora_alpha);
     // write tensors
-    write_tensor(&file, lora->tok_embeddings_a);
-    write_tensor(&file, lora->tok_embeddings_b);
-    write_tensor(&file, lora->norm_a);
-    write_tensor(&file, lora->norm_b);
-    write_tensor(&file, lora->output_a);
-    write_tensor(&file, lora->output_b);
+    write_tensor(&file, lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraA"));
+    write_tensor(&file, lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraB"));
+    write_tensor(&file, lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraA"));
+    write_tensor(&file, lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraB"));
+    write_tensor(&file, lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.loraA"));
+    write_tensor(&file, lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.loraB"));
     for (uint32_t i = 0; i < lora->layers.size(); ++i) {
         auto & layer = lora->layers[i];
-        write_tensor(&file, layer.attention_norm_a);
-        write_tensor(&file, layer.attention_norm_b);
-        write_tensor(&file, layer.wq_a);
-        write_tensor(&file, layer.wq_b);
-        write_tensor(&file, layer.wk_a);
-        write_tensor(&file, layer.wk_b);
-        write_tensor(&file, layer.wv_a);
-        write_tensor(&file, layer.wv_b);
-        write_tensor(&file, layer.wo_a);
-        write_tensor(&file, layer.wo_b);
-        write_tensor(&file, layer.ffn_norm_a);
-        write_tensor(&file, layer.ffn_norm_b);
-        write_tensor(&file, layer.w1_a);
-        write_tensor(&file, layer.w1_b);
-        write_tensor(&file, layer.w2_a);
-        write_tensor(&file, layer.w2_b);
-        write_tensor(&file, layer.w3_a);
-        write_tensor(&file, layer.w3_b);
+        write_tensor(&file, 
layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraA")); + write_tensor(&file, layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraB")); + write_tensor(&file, layer.wq_a, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraA")); + write_tensor(&file, layer.wq_b, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraB")); + write_tensor(&file, layer.wk_a, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraA")); + write_tensor(&file, layer.wk_b, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraB")); + write_tensor(&file, layer.wv_a, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraA")); + write_tensor(&file, layer.wv_b, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraB")); + write_tensor(&file, layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraA")); + write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); + write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); + write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); + write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); + write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); + write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); + write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); } } @@ -2527,8 +2778,15 @@ void opt_callback(void * vdata, float * sched) { data->lora->train_tokens += new_iters * n_batch * n_ctx; if (strlen(params->fn_checkpoint_out) > 0) { - save_checkpoint(data->model, data->lora, opt, params->fn_checkpoint_out, params->pattern_fn_it, opt->iter, params->fn_latest); - save_checkpoint(data->model, data->lora, opt, params->fn_checkpoint_out, params->pattern_fn_it, -1, params->fn_latest); + save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, opt, params->pattern_fn_it, opt->iter, params->fn_latest); + save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, opt, params->pattern_fn_it, -1, params->fn_latest); + std::string fn_chk_old = params->fn_checkpoint_out; + fn_chk_old = fn_chk_old + std::string(".old.bin"); + save_checkpoint(data->model, data->lora, opt, fn_chk_old.c_str(), params->pattern_fn_it, opt->iter, params->fn_latest); + save_checkpoint(data->model, data->lora, opt, fn_chk_old.c_str(), params->pattern_fn_it, -1, params->fn_latest); + + // save_checkpoint(data->model, data->lora, opt, params->fn_checkpoint_out, params->pattern_fn_it, opt->iter, params->fn_latest); + // save_checkpoint(data->model, data->lora, opt, params->fn_checkpoint_out, params->pattern_fn_it, -1, params->fn_latest); } if (strlen(params->fn_lora_out) > 0) { save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, opt->iter, params->fn_latest); @@ -2624,8 +2882,6 @@ int main(int argc, char ** argv) { lora.hparams.n_rank_norm = params.n_rank_norm; lora.hparams.n_rank_output = params.n_rank_output; - print_lora_params(&lora.hparams); - std::vector token_noccurs; std::vector token_notavail; token_noccurs.resize(model.hparams.n_vocab, 0); @@ -2688,8 +2944,14 @@ int main(int argc, char ** argv) { opt->params = params.use_adam ? 
opt_params_adam : opt_params_lbfgs; printf("%s: init model\n", __func__); - bool existed = load_checkpoint(&model, &lora, opt, params.fn_checkpoint_in, true); + // bool existed = load_checkpoint(&model, &lora, opt, params.fn_checkpoint_in, true); + bool existed = load_checkpoint_lora_file(params.fn_checkpoint_in, &model, &lora, opt); + if (!existed) { + init_lora(&model, &lora); + } set_param_lora(&lora); + print_params(&model.hparams); + print_lora_params(&lora.hparams); opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; @@ -2847,8 +3109,12 @@ int main(int argc, char ** argv) { lora.train_tokens += new_iters * n_batch * n_tokens; if (params.n_examples > 0) { - save_checkpoint(&model, &lora, opt, params.fn_checkpoint_out, params.pattern_fn_it, opt->iter, params.fn_latest); - save_checkpoint(&model, &lora, opt, params.fn_checkpoint_out, params.pattern_fn_it, -1, params.fn_latest); + save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, opt, params.pattern_fn_it, opt->iter, params.fn_latest); + save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, opt, params.pattern_fn_it, -1, params.fn_latest); + std::string fn_chk_old = params.fn_checkpoint_out; + fn_chk_old = fn_chk_old + std::string(".old.bin"); + save_checkpoint(&model, &lora, opt, fn_chk_old.c_str(), params.pattern_fn_it, opt->iter, params.fn_latest); + save_checkpoint(&model, &lora, opt, fn_chk_old.c_str(), params.pattern_fn_it, -1, params.fn_latest); } if (strlen(params.fn_lora_out) > 0) { From 6134ad4de72bdd536440df1b1bf15c9ab340c7ef Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 18:24:06 +0200 Subject: [PATCH 115/235] add python script to convert old finetune checkpoint files to gguf --- .../convert-finetune-checkpoint-to-gguf.py | 489 ++++++++++++++++++ 1 file changed, 489 insertions(+) create mode 100644 examples/finetune/convert-finetune-checkpoint-to-gguf.py diff --git a/examples/finetune/convert-finetune-checkpoint-to-gguf.py b/examples/finetune/convert-finetune-checkpoint-to-gguf.py new file mode 100644 index 0000000000000..4b9fa7c53b04d --- /dev/null +++ b/examples/finetune/convert-finetune-checkpoint-to-gguf.py @@ -0,0 +1,489 @@ +#!/usr/bin/env python3 +# finetune checkpoint --> gguf conversion + +import argparse +import gguf +import os +import struct +import sys +import numpy as np +from pathlib import Path + +# gguf constants +LLM_KV_OPTIMIZER_TYPE = "optimizer.type" +LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" +LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs" +LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version" +LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count" +LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count" +LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count" +LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized" +LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss" +LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss" +LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count" +LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count" +LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end" +LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = 
"optimizer.lbfgs.no_improvement_count" + +LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments" +LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments" +LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values" + +LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters" +LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters" +LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients" +LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients" +LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction" +LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y" + +LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model" +LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora" +LLM_KV_TRAINING_TYPE = "training.type" +LLM_KV_TRAINING_FILE_VERSION = "training.file_version" +LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" +LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" +LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" + +LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd" +LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm" +LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output" +LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm" +LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q" +LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k" +LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v" +LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output" +LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm" +LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate" +LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down" +LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up" + +class Tensor: + def __init__(self, dtype='f', ne=None): + if ne is None: + ne = [] + self.dtype = dtype + self.ne = ne + self.nbytes = 0 + if self.dtype == 'f': + if len(self.ne) == 0: + self.nbytes = 0 + else: + self.nbytes = int(np.product(self.ne)) * 4 + else: + raise ValueError(f"Unhandled data type '{self.dtype}'") + + def load(self, data, offset): + nd = struct.unpack(' 0 else []) + + self.lbfgs_x = Tensor('f', [self.nx]) + self.lbfgs_xp = Tensor('f', [self.nx]) + self.lbfgs_g = Tensor('f', [self.nx]) + self.lbfgs_gp = Tensor('f', [self.nx]) + self.lbfgs_d = Tensor('f', [self.nx]) + self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) + self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) + self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m]) + + # forgot to save type in version 1: + # guess self.type from number of remaining bytes + size_type_0 = 12 + sum([t.max_storage_size() for t in + [self.adam_m, self.adam_v] + +([self.adam_pf] if (self.past > 0) else [])]) + size_type_1 = 24 + sum([t.max_storage_size() for t in + [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, + self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, + self.lbfgs_lmal, self.lbfgs_lmys, + self.lbfgs_lms, 
self.lbfgs_lmy]
+            +([self.lbfgs_pf] if (self.past > 0) else [])])
+
+        # due to alignment padding the size might not be exact,
+        # but the difference in size for both types is significant,
+        # so we can just use whichever is closest
+        remaining = len(data) - offset
+        if abs(remaining - size_type_0) < abs(remaining - size_type_1):
+            self.type = 0
+        else:
+            self.type = 1
+
+        if self.type == 0:
+            offset = self.adam_m.load(data, offset)
+            offset = self.adam_v.load(data, offset)
+            offset = self.adam_pf.load(data, offset)
+
+            self.adam_fx_best = struct.unpack(' 0:
+            self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
+
+        elif self.type == 1:
+            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
+            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
+            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
+            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
+            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
+            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
+            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
+            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
+
+            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
+            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
+            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
+            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
+            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
+            if self.past > 0:
+                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
+            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
+            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
+            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
+            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
+        else:
+            raise ValueError('Unknown optimizer type')
+
+class LoraParams:
+    def __init__(self):
+        pass
+
+    def load(self, data, offset):
+        self.n_rank_attention_norm = struct.unpack('
Date: Tue, 29 Aug 2023 18:30:16 +0200
Subject: [PATCH 116/235] remove old checkpoint save & load code

---
 examples/finetune/finetune.cpp | 352 +--------------------------------
 1 file changed, 4 insertions(+), 348 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 586e88f76283b..5413efa8d2566 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1761,335 +1761,6 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const c
     file->write_raw(tensor->data, ggml_nbytes(tensor));
 }

-// void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
-//     int32_t nd = file->read_u32();
-//     GGML_ASSERT(nd == tensor->n_dims);
-
-//     uint32_t name_len = file->read_u32();
-//     enum ggml_type type = (enum ggml_type) file->read_u32();
-//     GGML_ASSERT(type == tensor->type);
-
-//     uint32_t ne[4];
-//     file->read_raw(ne, sizeof(ne[0]) * nd);
-//     for (int i=0; i<nd; ++i) {
-//         GGML_ASSERT(ne[i] == tensor->ne[i]);
-//     }
-
-//     std::string name = file->read_string(name_len);
-//     GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);
-
-// 
file->seek((0-file->tell()) & 31, SEEK_CUR); -// file->read_raw(tensor->data, ggml_nbytes(tensor)); -// } - -void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { - const uint32_t version = 1; - GGML_ASSERT(opt->nx >= 0); - GGML_ASSERT(opt->iter >= 0); - file->write_u32(version); - file->write_u32(opt->params.past); - file->write_u32(opt->params.lbfgs.m); - file->write_raw(&opt->nx, sizeof(opt->nx)); - file->write_raw(&opt->iter, sizeof(opt->iter)); - file->write_u32((uint32_t) opt->just_initialized); - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - GGML_ASSERT(opt->adam.m != NULL); - GGML_ASSERT(opt->adam.v != NULL); - write_tensor(file, opt->adam.m, NULL); - write_tensor(file, opt->adam.v, NULL); - write_tensor(file, opt->adam.pf, NULL); - file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; - case GGML_OPT_LBFGS: - { - GGML_ASSERT(opt->lbfgs.x != NULL); - write_tensor(file, opt->lbfgs.x, NULL); - write_tensor(file, opt->lbfgs.xp, NULL); - write_tensor(file, opt->lbfgs.g, NULL); - write_tensor(file, opt->lbfgs.gp, NULL); - write_tensor(file, opt->lbfgs.d, NULL); - write_tensor(file, opt->lbfgs.pf, NULL); - write_tensor(file, opt->lbfgs.lmal, NULL); - write_tensor(file, opt->lbfgs.lmys, NULL); - write_tensor(file, opt->lbfgs.lms, NULL); - write_tensor(file, opt->lbfgs.lmy, NULL); - file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} - -// void read_opt_context_v1(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { -// opt->params.past = (int) file->read_u32(); -// opt->params.lbfgs.m = (int) file->read_u32(); -// file->read_raw(&opt->nx, sizeof(opt->nx)); -// ggml_opt_init(ctx, opt, opt->params, opt->nx); - -// file->read_raw(&opt->iter, sizeof(opt->iter)); -// opt->just_initialized = (bool) file->read_u32(); - -// switch (opt->params.type) { -// case GGML_OPT_ADAM: -// { -// read_tensor(file, opt->adam.m); -// read_tensor(file, opt->adam.v); -// if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } -// file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); -// file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); -// file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); -// } break; -// case GGML_OPT_LBFGS: -// { -// GGML_ASSERT(opt->lbfgs.x != NULL); -// read_tensor(file, opt->lbfgs.x); -// read_tensor(file, opt->lbfgs.xp); -// read_tensor(file, opt->lbfgs.g); -// read_tensor(file, opt->lbfgs.gp); -// read_tensor(file, opt->lbfgs.d); -// if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } -// read_tensor(file, opt->lbfgs.lmal); -// read_tensor(file, opt->lbfgs.lmys); -// read_tensor(file, opt->lbfgs.lms); -// read_tensor(file, opt->lbfgs.lmy); -// file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); -// file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); -// file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); -// file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); -// file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); -// 
file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); -// } break; -// } -// } - -// void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { -// uint32_t version = file->read_u32(); -// printf("%s: opt context version %u\n", __func__, version); -// switch (version) { -// case 0: -// { -// GGML_ASSERT(false); // not supported in finetune -// } break; -// case 1: -// { -// read_opt_context_v1(file, ctx, opt); -// } break; -// default: -// { -// fprintf(stderr, "%s: unknown version %u\n", __func__, version); -// } -// } -// } - -void save_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename, const char * pattern_it, int iteration, const char * latest) { - std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest); - std::string fn = replace_str(filename, pattern_it, sit.c_str()); - printf("%s: saving to %s\n", __func__, fn.c_str()); - struct llama_file file(fn.c_str(), "wb"); - if (file.fp == NULL) { - return; - } - - std::vector tn_buf; - tn_buf.resize(GGML_MAX_NAME); - - auto tni = [&tn_buf](const char * key, int bid) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), key, bid); - return tn_buf.data(); - }; - - const uint32_t magic = 'ggcl'; - const uint32_t version = 0; - - file.write_u32(magic); - file.write_u32(version); - file.write_u32(lora->train_its); - file.write_u32(lora->train_samples); - file.write_u32(lora->train_tokens); - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(4 /*model->hparams.n_mult*/); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - file.write_u32(model->hparams.n_rot); - file.write_u32(lora->hparams.n_rank_attention_norm); - file.write_u32(lora->hparams.n_rank_wq); - file.write_u32(lora->hparams.n_rank_wk); - file.write_u32(lora->hparams.n_rank_wv); - file.write_u32(lora->hparams.n_rank_wo); - file.write_u32(lora->hparams.n_rank_ffn_norm); - file.write_u32(lora->hparams.n_rank_w1); - file.write_u32(lora->hparams.n_rank_w2); - file.write_u32(lora->hparams.n_rank_w3); - file.write_u32(lora->hparams.n_rank_tok_embeddings); - file.write_u32(lora->hparams.n_rank_norm); - file.write_u32(lora->hparams.n_rank_output); - - write_tensor(&file, lora->tok_embeddings_a, "tok_embeddings.weight.loraA"); - write_tensor(&file, lora->tok_embeddings_b, "tok_embeddings.weight.loraB"); - write_tensor(&file, lora->norm_a, "norm.weight.loraA"); - write_tensor(&file, lora->norm_b, "norm.weight.loraB"); - write_tensor(&file, lora->output_a, "output.weight.loraA"); - write_tensor(&file, lora->output_b, "output.weight.loraB"); - - print_data_checksum(lora->tok_embeddings_a); - print_data_checksum(lora->tok_embeddings_b); - print_data_checksum(lora->norm_a); - print_data_checksum(lora->norm_b); - print_data_checksum(lora->output_a); - print_data_checksum(lora->output_b); - - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - - write_tensor(&file, layer.attention_norm_a, tni("%d.attention_norm.weight.loraA", i)); - write_tensor(&file, layer.attention_norm_b, tni("%d.attention_norm.weight.loraB", i)); - write_tensor(&file, layer.wq_a, tni("%d.attention.wq.weight.loraA", i)); - write_tensor(&file, layer.wq_b, tni("%d.attention.wq.weight.loraB", i)); - write_tensor(&file, layer.wk_a, tni("%d.attention.wk.weight.loraA", i)); - write_tensor(&file, layer.wk_b, 
tni("%d.attention.wk.weight.loraB", i)); - write_tensor(&file, layer.wv_a, tni("%d.attention.wv.weight.loraA", i)); - write_tensor(&file, layer.wv_b, tni("%d.attention.wv.weight.loraB", i)); - write_tensor(&file, layer.wo_a, tni("%d.attention.wo.weight.loraA", i)); - write_tensor(&file, layer.wo_b, tni("%d.attention.wo.weight.loraB", i)); - write_tensor(&file, layer.ffn_norm_a, tni("%d.ffn_norm.weight.loraA", i)); - write_tensor(&file, layer.ffn_norm_b, tni("%d.ffn_norm.weight.loraB", i)); - write_tensor(&file, layer.w1_a, tni("%d.feed_forward.w1.weight.loraA", i)); - write_tensor(&file, layer.w1_b, tni("%d.feed_forward.w1.weight.loraB", i)); - write_tensor(&file, layer.w2_a, tni("%d.feed_forward.w2.weight.loraA", i)); - write_tensor(&file, layer.w2_b, tni("%d.feed_forward.w2.weight.loraB", i)); - write_tensor(&file, layer.w3_a, tni("%d.feed_forward.w3.weight.loraA", i)); - write_tensor(&file, layer.w3_b, tni("%d.feed_forward.w3.weight.loraB", i)); - - print_data_checksum(layer.attention_norm_a); - print_data_checksum(layer.attention_norm_b); - print_data_checksum(layer.wq_a); - print_data_checksum(layer.wq_b); - print_data_checksum(layer.wk_a); - print_data_checksum(layer.wk_b); - print_data_checksum(layer.wv_a); - print_data_checksum(layer.wv_b); - print_data_checksum(layer.wo_a); - print_data_checksum(layer.wo_b); - print_data_checksum(layer.ffn_norm_a); - print_data_checksum(layer.ffn_norm_b); - print_data_checksum(layer.w1_a); - print_data_checksum(layer.w1_b); - print_data_checksum(layer.w2_a); - print_data_checksum(layer.w2_b); - print_data_checksum(layer.w3_a); - print_data_checksum(layer.w3_b); - } - - write_opt_context(&file, opt); -} - -// bool load_checkpoint(struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * filename, bool init) { -// struct llama_file file(filename, "rb"); - -// uint32_t magic; -// uint32_t version; - -// uint32_t train_its = 0; -// uint32_t train_samples = 0; -// uint32_t train_tokens = 0; - -// if (file.fp) { -// printf("%s: Loading model from '%s'.\n", __func__, filename); -// magic = file.read_u32(); -// GGML_ASSERT(magic == 'ggcl'); -// version = file.read_u32(); -// GGML_ASSERT(version == 0); -// train_its = file.read_u32(); -// train_samples = file.read_u32(); -// train_tokens = file.read_u32(); -// uint32_t n_vocab = file.read_u32(); -// uint32_t n_embd = file.read_u32(); -// uint32_t n_mult = file.read_u32(); -// uint32_t n_head = file.read_u32(); -// uint32_t n_layer = file.read_u32(); -// uint32_t n_rot = file.read_u32(); -// GGML_ASSERT(n_vocab == model->hparams.n_vocab); -// GGML_ASSERT(n_embd == model->hparams.n_embd); -// //GGML_ASSERT(n_mult == model->hparams.n_mult); -// GGML_ASSERT(n_head == model->hparams.n_head); -// GGML_ASSERT(n_layer == model->hparams.n_layer); -// GGML_ASSERT(n_rot == model->hparams.n_rot); -// lora->hparams.n_rank_attention_norm = file.read_u32(); -// lora->hparams.n_rank_wq = file.read_u32(); -// lora->hparams.n_rank_wk = file.read_u32(); -// lora->hparams.n_rank_wv = file.read_u32(); -// lora->hparams.n_rank_wo = file.read_u32(); -// lora->hparams.n_rank_ffn_norm = file.read_u32(); -// lora->hparams.n_rank_w1 = file.read_u32(); -// lora->hparams.n_rank_w2 = file.read_u32(); -// lora->hparams.n_rank_w3 = file.read_u32(); -// lora->hparams.n_rank_tok_embeddings = file.read_u32(); -// lora->hparams.n_rank_norm = file.read_u32(); -// lora->hparams.n_rank_output = file.read_u32(); - -// print_params(&model->hparams); -// print_lora_params(&lora->hparams); -// } - -// 
if (init) { -// init_lora(model, lora); -// } - -// if (file.fp) { -// lora->train_its = train_its; -// lora->train_samples = train_samples; -// lora->train_tokens = train_tokens; -// } - -// printf("%s: Training iterations: %u.\n", __func__, lora->train_its); -// printf("%s: Training samples: %u.\n", __func__, lora->train_samples); -// printf("%s: Training tokens: %u.\n", __func__, lora->train_tokens); - -// if (file.fp) { -// read_tensor(&file, lora->tok_embeddings_a); -// read_tensor(&file, lora->tok_embeddings_b); -// read_tensor(&file, lora->norm_a); -// read_tensor(&file, lora->norm_b); -// read_tensor(&file, lora->output_a); -// read_tensor(&file, lora->output_b); - -// for (uint32_t i = 0; i < lora->layers.size(); ++i) { -// auto & layer = lora->layers[i]; - -// read_tensor(&file, layer.attention_norm_a); -// read_tensor(&file, layer.attention_norm_b); -// read_tensor(&file, layer.wq_a); -// read_tensor(&file, layer.wq_b); -// read_tensor(&file, layer.wk_a); -// read_tensor(&file, layer.wk_b); -// read_tensor(&file, layer.wv_a); -// read_tensor(&file, layer.wv_b); -// read_tensor(&file, layer.wo_a); -// read_tensor(&file, layer.wo_b); -// read_tensor(&file, layer.ffn_norm_a); -// read_tensor(&file, layer.ffn_norm_b); -// read_tensor(&file, layer.w1_a); -// read_tensor(&file, layer.w1_b); -// read_tensor(&file, layer.w2_a); -// read_tensor(&file, layer.w2_b); -// read_tensor(&file, layer.w3_a); -// read_tensor(&file, layer.w3_b); -// } - -// read_opt_context(&file, lora->ctx, opt); -// } - -// return (file.fp != NULL); -// } - void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, const char * pattern_it, int iteration, const char * latest) { std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest); std::string fn = replace_str(filename, pattern_it, sit.c_str()); @@ -2247,9 +1918,9 @@ struct train_params get_default_train_params() { struct train_params params; params.fn_model_base = ""; params.fn_train_data = "shakespeare.txt"; - params.fn_checkpoint_in = "checkpoint.bin"; - params.fn_checkpoint_out = "checkpoint-ITERATION.bin"; - params.fn_lora_out = "ggml-lora-ITERATION-f32.bin"; + params.fn_checkpoint_in = "checkpoint.gguf"; + params.fn_checkpoint_out = "checkpoint-ITERATION.gguf"; + params.fn_lora_out = "ggml-lora-ITERATION-f32.gguf"; params.pattern_fn_it = "ITERATION"; params.fn_latest = "LATEST"; @@ -2780,13 +2451,6 @@ void opt_callback(void * vdata, float * sched) { if (strlen(params->fn_checkpoint_out) > 0) { save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, opt, params->pattern_fn_it, opt->iter, params->fn_latest); save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, opt, params->pattern_fn_it, -1, params->fn_latest); - std::string fn_chk_old = params->fn_checkpoint_out; - fn_chk_old = fn_chk_old + std::string(".old.bin"); - save_checkpoint(data->model, data->lora, opt, fn_chk_old.c_str(), params->pattern_fn_it, opt->iter, params->fn_latest); - save_checkpoint(data->model, data->lora, opt, fn_chk_old.c_str(), params->pattern_fn_it, -1, params->fn_latest); - - // save_checkpoint(data->model, data->lora, opt, params->fn_checkpoint_out, params->pattern_fn_it, opt->iter, params->fn_latest); - // save_checkpoint(data->model, data->lora, opt, params->fn_checkpoint_out, params->pattern_fn_it, -1, params->fn_latest); } if (strlen(params->fn_lora_out) > 0) { save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, opt->iter, params->fn_latest); @@ -2948,6 +2612,7 
@@ int main(int argc, char ** argv) { bool existed = load_checkpoint_lora_file(params.fn_checkpoint_in, &model, &lora, opt); if (!existed) { init_lora(&model, &lora); + randomize_lora(&lora, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); } set_param_lora(&lora); print_params(&model.hparams); @@ -2958,11 +2623,6 @@ int main(int argc, char ** argv) { opt->iter = lora.train_its; printf("%s: opt iter %d\n", __func__, opt->iter); - bool from_scratch = !existed; - if (from_scratch) { - randomize_lora(&lora, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); - } - printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx)); // ggml_print_tensor_objects(lora.ctx); @@ -3111,10 +2771,6 @@ int main(int argc, char ** argv) { if (params.n_examples > 0) { save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, opt, params.pattern_fn_it, opt->iter, params.fn_latest); save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, opt, params.pattern_fn_it, -1, params.fn_latest); - std::string fn_chk_old = params.fn_checkpoint_out; - fn_chk_old = fn_chk_old + std::string(".old.bin"); - save_checkpoint(&model, &lora, opt, fn_chk_old.c_str(), params.pattern_fn_it, opt->iter, params.fn_latest); - save_checkpoint(&model, &lora, opt, fn_chk_old.c_str(), params.pattern_fn_it, -1, params.fn_latest); } if (strlen(params.fn_lora_out) > 0) { From ebff3a14c329ca76860bce253c3d94e79a70f89a Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 18:31:20 +0200 Subject: [PATCH 117/235] remove code to print data checksums which was used to verify correctness of new gguf code --- examples/finetune/finetune.cpp | 60 ---------------------------------- 1 file changed, 60 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 5413efa8d2566..e8efb1db951fb 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -17,40 +17,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif - -uint32_t compute_data_checksum(struct ggml_tensor * tensor) { - const int n3 = (tensor->n_dims >= 3) ? tensor->ne[3] : 1; - const int n2 = (tensor->n_dims >= 2) ? tensor->ne[2] : 1; - const int n1 = (tensor->n_dims >= 1) ? tensor->ne[1] : 1; - const int n0 = (tensor->n_dims >= 0) ? 
tensor->ne[0] : 1;
-    const size_t nb0 = tensor->nb[0];
-    const size_t nb1 = tensor->nb[1];
-    const size_t nb2 = tensor->nb[2];
-    const size_t nb3 = tensor->nb[3];
-    const size_t nb  = ggml_element_size(tensor);
-    uint32_t result = 0;
-    for (int i3 = 0; i3 < n3; ++i3) {
-        for (int i2 = 0; i2 < n2; ++i2) {
-            for (int i1 = 0; i1 < n1; ++i1) {
-                for (int i0 = 0; i0 < n0; ++i0) {
-                    char * ptr = ((char *) tensor->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
-                    uint32_t val;
-                    memcpy(&val, ptr, nb);
-                    result = result ^ val;
-                    result = (((result << 1u) | ((result >> 31u) & 0x1u)) + 1u) & 0xffffffffu;
-                }
-            }
-        }
-    }
-    return result;
-}
-
-void print_data_checksum(struct ggml_tensor * tensor) {
-    uint32_t chk = compute_data_checksum(tensor);
-    printf("%s: chk=[%08x] data=[%p] name=%s\n", __func__, chk, tensor->data, ggml_get_name(tensor));
-}
-
-
 struct random_normal_distribution {
     std::mt19937 gen;
     std::normal_distribution<float> rd;
@@ -1560,13 +1526,6 @@ void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_gg
     read_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a));
     read_tensor_by_name(lora->output_b, f_ggml_ctx, ggml_get_name(lora->output_b));

-    print_data_checksum(lora->tok_embeddings_a);
-    print_data_checksum(lora->tok_embeddings_b);
-    print_data_checksum(lora->norm_a);
-    print_data_checksum(lora->norm_b);
-    print_data_checksum(lora->output_a);
-    print_data_checksum(lora->output_b);
-
     for (uint32_t i = 0; i < lora->layers.size(); ++i) {
         auto & layer = lora->layers[i];
         read_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a));
@@ -1587,25 +1546,6 @@ void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_gg
         read_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b));
         read_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a));
         read_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b));
-
-        print_data_checksum(layer.attention_norm_a);
-        print_data_checksum(layer.attention_norm_b);
-        print_data_checksum(layer.wq_a);
-        print_data_checksum(layer.wq_b);
-        print_data_checksum(layer.wk_a);
-        print_data_checksum(layer.wk_b);
-        print_data_checksum(layer.wv_a);
-        print_data_checksum(layer.wv_b);
-        print_data_checksum(layer.wo_a);
-        print_data_checksum(layer.wo_b);
-        print_data_checksum(layer.ffn_norm_a);
-        print_data_checksum(layer.ffn_norm_b);
-        print_data_checksum(layer.w1_a);
-        print_data_checksum(layer.w1_b);
-        print_data_checksum(layer.w2_a);
-        print_data_checksum(layer.w2_b);
-        print_data_checksum(layer.w3_a);
-        print_data_checksum(layer.w3_b);
     }
 }

From 5813ac832fdf0df09b0519a8f38434bd790f50b1 Mon Sep 17 00:00:00 2001
From: xaedes 
Date: Tue, 29 Aug 2023 19:21:09 +0200
Subject: [PATCH 118/235] omit tokenization when training is disabled, only
 save llama lora adapter

training can be disabled by passing '-n 0' to finetune

---
 examples/finetune/finetune.cpp | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index e8efb1db951fb..48b44366ceb3e 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -2457,12 +2457,14 @@ int main(int argc, char ** argv) {
     struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_params);
     struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);

-    printf("%s: tokenize training data\n", __func__);
     std::vector<llama_token> train_tokens;
-    if (tokenize_file(lctx, 
params.fn_train_data, train_tokens) < 0) { - fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); + if (params.n_examples > 0) { + printf("%s: tokenize training data\n", __func__); + if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { + fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); + } + printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); } - printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); struct my_llama_model model; init_model(lmodel, &model, params.n_ctx); @@ -2579,17 +2581,19 @@ int main(int argc, char ** argv) { alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); } - GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector train_samples; - train_samples.push_back(0); - for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) { - if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) { - train_samples.push_back(i); + if (params.n_examples > 0) { + GGML_ASSERT(n_tokens < (int) train_tokens.size()); + train_samples.push_back(0); + for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) { + if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) { + train_samples.push_back(i); + } + } + shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); + for (int i = 0; i < (int) train_samples.size(); ++i) { + GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); } - } - shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); - for (int i = 0; i < (int) train_samples.size(); ++i) { - GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); } printf("%s: begin training\n", __func__); From a6165dafcde743f597803b987518a357dcadcea7 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 19:30:42 +0200 Subject: [PATCH 119/235] remove trailing whitespace --- examples/finetune/convert-finetune-checkpoint-to-gguf.py | 2 +- examples/finetune/finetune.cpp | 6 +++--- ggml.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/finetune/convert-finetune-checkpoint-to-gguf.py b/examples/finetune/convert-finetune-checkpoint-to-gguf.py index 4b9fa7c53b04d..96d6633ed7d5e 100644 --- a/examples/finetune/convert-finetune-checkpoint-to-gguf.py +++ b/examples/finetune/convert-finetune-checkpoint-to-gguf.py @@ -131,7 +131,7 @@ def __init__(self): def load(self, data, offset): self.version = struct.unpack(' keybuf; @@ -1525,7 +1525,7 @@ void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_gg read_tensor_by_name(lora->norm_b, f_ggml_ctx, ggml_get_name(lora->norm_b)); read_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a)); read_tensor_by_name(lora->output_b, f_ggml_ctx, ggml_get_name(lora->output_b)); - + for (uint32_t i = 0; i < lora->layers.size(); ++i) { auto & layer = lora->layers[i]; read_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a)); @@ -1565,7 +1565,7 @@ void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * mo gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx); gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd); - gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff); + gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff); gguf_set_val_u32(fctx, 
kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head); gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer); gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_rot); diff --git a/ggml.c b/ggml.c index 681891c8c98dc..49aff624329f7 100644 --- a/ggml.c +++ b/ggml.c @@ -16497,7 +16497,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { src0->grad = ggml_add_or_set(ctx, src0->grad, - // last ggml_get_rows_back argument src0->grad is only + // last ggml_get_rows_back argument src0->grad is only // necessary to setup correct output shape ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad), zero_table); From e28cf7e9ce3daba1ef0f844a78ef0b067f468409 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 19:38:23 +0200 Subject: [PATCH 120/235] update README.md --- examples/finetune/README.md | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/examples/finetune/README.md b/examples/finetune/README.md index ea17c38d9ee3d..11fe992d0c409 100644 --- a/examples/finetune/README.md +++ b/examples/finetune/README.md @@ -8,19 +8,18 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s # finetune LORA adapter ./bin/finetune \ - --model-base open-llama-3b-v2-q8_0.bin \ - --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \ - --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \ + --model-base open-llama-3b-v2-q8_0.gguf \ + --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \ + --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \ --model-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \ --train-data "shakespeare.txt" \ --save-every 10 \ --threads 6 --adam-iter 30 --batch 4 --ctx 64 \ - --print-details-interval 0 --predict 0 \ --use-checkpointing --use-alloc \ --mem-lora 2 --mem-compute 1 --mem-compute0 20 # predict -./bin/main -m open-llama-3b-v2-q8_0.bin --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin +./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin ``` Finetune output files will be saved every N iterations (config with `--save-every N`). @@ -31,10 +30,6 @@ If you have enough RAM, you can make finetuning a bit faster by disabling checkp To change the amount of memory for finetuning with memory allocator (`--use-alloc`, used by default), you can use `--mem-compute0 N` to specify the number of gigabytes. -After training, text is generated using the trained LORA. -But this text prediction is not optimized as well as it is in `main`. -It may result in out-of-memory crash, to disable the text prediction after training use `--predict 0`. 
- The LORA rank is configured for each model tensor type separately with these command line options: ```bash From 794bb7ea42d366e4195452322db1196082e0b593 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 19:59:14 +0200 Subject: [PATCH 121/235] implement ggml_compute_forward_repeat_f16 --- ggml.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/ggml.c b/ggml.c index 49aff624329f7..c584ff2de9d89 100644 --- a/ggml.c +++ b/ggml.c @@ -9950,11 +9950,61 @@ static void ggml_compute_forward_repeat_f32( } } +static void ggml_compute_forward_repeat_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS; + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_fp16_t * y = (ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0); + ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01); + // ggml_vec_cpy_f16(ne00, y, x) + for (int i = 0; i < ne00; ++i) { + y[i] = x[i]; + } + } + } + } + } + } + } + } +} + static void ggml_compute_forward_repeat( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_repeat_f16(params, src0, dst); + } break; case GGML_TYPE_F32: { ggml_compute_forward_repeat_f32(params, src0, dst); From 5f0a4e971f6e1884861be33f20cc2d5c816dea97 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 19:59:41 +0200 Subject: [PATCH 122/235] avoid stack overflow of large cgraphs in test-grad0 --- tests/test-grad0.cpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 468cde66adc65..4454c8092a252 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -251,18 +251,19 @@ static bool check_gradient( printf("GGML_N_THREADS = %d\n", n_threads); } - struct ggml_cgraph gf = ggml_build_forward (f); - struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx0, f); + struct ggml_cgraph * gb = ggml_new_graph(ctx0); + ggml_build_backward_expand(ctx0, gf, gb, false); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + ggml_graph_compute_with_ctx(ctx0, gf, n_threads); - ggml_graph_reset (&gf); + ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + ggml_graph_compute_with_ctx(ctx0, gb, n_threads); - // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); - // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot"); + // ggml_graph_dump_dot(gf, NULL, 
"test-grad0-forward.dot"); + // ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot"); for (int i = 0; i < nargs; ++i) { const int nelements = ggml_nelements(x[i]); @@ -273,13 +274,13 @@ static bool check_gradient( const float xp = x0 + eps; ggml_set_f32_1d(x[i], k, xp); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + ggml_graph_compute_with_ctx(ctx0, gf, n_threads); const double f0 = ggml_get_f32_1d(f, 0); ggml_set_f32_1d(x[i], k, xm); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); + ggml_graph_compute_with_ctx(ctx0, gf, n_threads); const double f1 = ggml_get_f32_1d(f, 0); const double g0 = (f0 - f1)/(2.0*(double) eps); @@ -287,10 +288,10 @@ static bool check_gradient( ggml_set_f32_1d(x[i], k, x0); // compute gradient using backward graph - ggml_graph_reset (&gf); + ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); + ggml_graph_compute_with_ctx(ctx0, gb, n_threads); const double g1 = ggml_get_f32_1d(x[i]->grad, k); From 82c5247a20f07592ec1fd6ca1cdae6f23f1d6874 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 20:59:31 +0200 Subject: [PATCH 123/235] add ggml API functions ggml_unravel_index, ggml_get_i32_nd and its analogs for set and for f32 ggml_get_i32_1d, ggml_set_i32_1d, ggml_get_f32_1d, ggml_set_f32_1d now support non-contiguous tensors. in case of non-contiguous tensor, the 1d index is unraveled into a multi index using ggml_unravel_index to be passed to '_nd' function equivalent. this fixes a bug in test-grad0 which happens due to ggml_build_backward not building purely contiguous tensors anymore --- ggml.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ggml.h | 9 +++ 2 files changed, 180 insertions(+) diff --git a/ggml.c b/ggml.c index c584ff2de9d89..76f0b7b941ba4 100644 --- a/ggml.c +++ b/ggml.c @@ -4838,7 +4838,37 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { return tensor; } +void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) { + const int64_t ne3 = tensor->ne[3]; + const int64_t ne2 = tensor->ne[2]; + const int64_t ne1 = tensor->ne[1]; + const int64_t ne0 = tensor->ne[0]; + + const int64_t i3_ = (i/(ne2*ne1*ne0)); + const int64_t i2_ = (i - i3_*ne2*ne1*ne0)/(ne1*ne0); + const int64_t i1_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0)/ne0; + const int64_t i0_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0 - i1_*ne0); + + if (i0) { + * i0 = i0_; + } + if (i1) { + * i1 = i1_; + } + if (i2) { + * i2 = i2_; + } + if (i3) { + * i3 = i3_; + } +} + int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]); + } switch (tensor->type) { case GGML_TYPE_I8: { @@ -4875,6 +4905,12 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { } void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } switch (tensor->type) { case GGML_TYPE_I8: { @@ -4908,7 +4944,74 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { } } +int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) { + void * data = (char 
*) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + { + return ((int8_t *) data)[0]; + } break; + case GGML_TYPE_I16: + { + return ((int16_t *) data)[0]; + } break; + case GGML_TYPE_I32: + { + return ((int32_t *) data)[0]; + } break; + case GGML_TYPE_F16: + { + return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + } break; + case GGML_TYPE_F32: + { + return ((float *) data)[0]; + } break; + default: + { + GGML_ASSERT(false); + } break; + } + + return 0.0f; +} + +void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + { + ((int8_t *)(data))[0] = value; + } break; + case GGML_TYPE_I16: + { + ((int16_t *)(data))[0] = value; + } break; + case GGML_TYPE_I32: + { + ((int32_t *)(data))[0] = value; + } break; + case GGML_TYPE_F16: + { + ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_F32: + { + ((float *)(data))[0] = value; + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]); + } switch (tensor->type) { case GGML_TYPE_I8: { @@ -4945,6 +5048,12 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { } void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = { 0, 0, 0, 0 }; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } switch (tensor->type) { case GGML_TYPE_I8: { @@ -4978,6 +5087,68 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { } } +float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + { + return ((int8_t *) data)[0]; + } break; + case GGML_TYPE_I16: + { + return ((int16_t *) data)[0]; + } break; + case GGML_TYPE_I32: + { + return ((int32_t *) data)[0]; + } break; + case GGML_TYPE_F16: + { + return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + } break; + case GGML_TYPE_F32: + { + return ((float *) data)[0]; + } break; + default: + { + GGML_ASSERT(false); + } break; + } + + return 0.0f; +} + +void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) { + void * data = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]; + switch (tensor->type) { + case GGML_TYPE_I8: + { + ((int8_t *)(data))[0] = value; + } break; + case GGML_TYPE_I16: + { + ((int16_t *)(data))[0] = value; + } break; + case GGML_TYPE_I32: + { + ((int32_t *)(data))[0] = value; + } break; + case GGML_TYPE_F16: + { + ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_F32: + { + ((float *)(data))[0] = value; + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + void * ggml_get_data(const struct ggml_tensor * tensor) { return tensor->data; } diff --git a/ggml.h b/ggml.h index 
25f951f26a0cb..5d63dc0c302c5 100644 --- a/ggml.h +++ b/ggml.h @@ -671,12 +671,21 @@ extern "C" { GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); + // Converts a flat index into coordinates + GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); + GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); + GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i1, int i2, int i3); + GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i1, int i2, int i3, int32_t value); + GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); + GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); + GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); From 5fcfa7e49ed31195e972834b58786443239ab090 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 21:00:19 +0200 Subject: [PATCH 124/235] increase test-grad0 context mem size to accommodate for bigger cgraph --- tests/test-grad0.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 4454c8092a252..55a90661b8e05 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -374,7 +374,7 @@ static bool check_mat_mul( int main(int argc, const char ** argv) { struct ggml_init_params params = { - /* .mem_size = */ 128*1024*1024, + /* .mem_size = */ 256*1024*1024, /* .mem_buffer = */ NULL, /* .no_alloc = */ false, }; From b1aa26f718d0796e1eafad069d6afa544b98b1b3 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 21:01:17 +0200 Subject: [PATCH 125/235] add sanity check to ggml_compute_backward, asserting the correct shape of gradients --- ggml.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml.c b/ggml.c index 76f0b7b941ba4..f3b2c65b8b54f 100644 --- a/ggml.c +++ b/ggml.c @@ -17147,6 +17147,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor GGML_ASSERT(false); } break; } + + for (int i = 0; i < GGML_MAX_SRC; ++i) { + if (tensor->src[i] && tensor->src[i]->grad) { + GGML_ASSERT(ggml_are_same_shape(tensor->src[i], tensor->src[i]->grad)); + } + } } static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { From a76e66ac8d5de0c22f68e0ac4b05937d91e6e445 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 21:02:10 +0200 Subject: [PATCH 126/235] fix ggml_acc_or_set to return tensor of correct shape --- ggml.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index f3b2c65b8b54f..9f9c1d2bbfd95 100644 --- a/ggml.c +++ b/ggml.c @@ -16255,7 +16255,8 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) { if (hash_contains(zero_table, a)) { - return b; + struct ggml_tensor * 
a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0)); + return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); } else { return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); } From dd4e4bca090b1a03a564293719984221f6e610cc Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 21:20:44 +0200 Subject: [PATCH 127/235] remove unused 'inplace' argument from ggml_compute_backward function inplace operations to add gradients are no longer created by ggml_compute_backward use allocator to automatically make inplace operations --- ggml.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index 9f9c1d2bbfd95..2f3206332dfcf 100644 --- a/ggml.c +++ b/ggml.c @@ -16278,7 +16278,7 @@ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct gg } } -static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace, void * zero_table[]) { +static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, void * zero_table[]) { struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; @@ -17268,9 +17268,10 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * for (int i = gf->n_nodes - 1; i >= 0; i--) { struct ggml_tensor * node = gf->nodes[i]; - // because we detached the grad nodes from the original graph, we can afford inplace operations + // inplace operations to add gradients are not created by ggml_compute_backward + // use allocator to automatically make inplace operations if (node->grad) { - ggml_compute_backward(ctx, node, keep, zero_table); + ggml_compute_backward(ctx, node, zero_table); } } From 8a96d4c2aa8da5849ec54a90ce8ef0221a79eca2 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 21:24:37 +0200 Subject: [PATCH 128/235] add missing argument 'int i0' to ggml_get_i32_nd & ggml_set_i32_nd header declarations --- ggml.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.h b/ggml.h index 5d63dc0c302c5..c28f1bb675418 100644 --- a/ggml.h +++ b/ggml.h @@ -677,8 +677,8 @@ extern "C" { GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); - GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i1, int i2, int i3); - GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i1, int i2, int i3, int32_t value); + GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); From 5854f51188cf6746972c6dc350534686cec09660 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 22:49:01 +0200 Subject: [PATCH 129/235] fix error message in ggml_allocr_alloc to display actual max_avail --- ggml-alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index a48c23e66a6c9..d18c8d5044e25 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -135,9 +135,9 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) if (best_fit_block == -1) { // the last block is our last resort struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; + max_avail = 
MAX(max_avail, block->size); if (block->size >= size) { best_fit_block = alloc->n_free_blocks - 1; - max_avail = MAX(max_avail, block->size); } else { fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", __func__, size, max_avail); From bf70e27cd60a5401ef3d543608e39fd8172da306 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 29 Aug 2023 23:08:30 +0200 Subject: [PATCH 130/235] fix check_gradient ggml_build_backward_expand was previously replaced by ggml_build_backward, but the assignment of forward graph to backward graph missing --- tests/test-grad0.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 55a90661b8e05..6a81c940e8146 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -253,6 +253,7 @@ static bool check_gradient( struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx0, f); struct ggml_cgraph * gb = ggml_new_graph(ctx0); + *gb = *gf; ggml_build_backward_expand(ctx0, gf, gb, false); ggml_graph_compute_with_ctx(ctx0, gf, n_threads); From 2392b6725ba56ad1f799e1420e4102db1018ab6e Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 30 Aug 2023 14:46:12 +0200 Subject: [PATCH 131/235] use tensor->view_src instead of ggml_is_view and get_view_source --- examples/finetune/finetune.cpp | 43 ++++++++-------------------------- 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 4230b1449ab61..ea443708b1016 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -680,33 +680,6 @@ void free_hash_map(struct hash_map * map) { delete map; } -static bool ggml_is_view(struct ggml_tensor * t) { - return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || - t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; -} - -static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { - switch (t->op) { - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - return t->src[0]; - case GGML_OP_CPY: - return t->src[1]; - default: - return NULL; - } -} - -static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { - struct ggml_tensor * parent = t; - do { - parent = get_view_parent(parent); - } while (ggml_is_view(parent)); - return parent; -} - struct ggml_tensor * ggml_recompute_graph_node( struct ggml_context * ctx, struct ggml_cgraph * graph, @@ -759,10 +732,14 @@ struct ggml_tensor * ggml_recompute_graph_node( for (int k = 0; k < GGML_MAX_SRC; ++k) { clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); } - if (ggml_is_view(clone)) { - struct ggml_tensor * source = get_view_source(clone); - GGML_ASSERT(source != NULL); - clone->data = source->data; + if (node->view_src != NULL) { + // GGML_ASSERT(node->view_src->data != NULL); + clone->data = (node->view_src->data == NULL) + ? 
NULL // view_src not yet allocated + : (char *) node->view_src->data // view_src already allocated + + node->view_offs; + clone->view_src = node->view_src; + clone->view_offs = node->view_offs; } GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); @@ -1002,7 +979,7 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); // input gradient ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); - GGML_ASSERT(t36->grad->data == NULL && !ggml_is_view(t36->grad)); + GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); ggml_allocr_alloc(alloc, t36->grad); // make sure base model tensors data cannot be used in viewable operations @@ -1025,7 +1002,7 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( // allocating checkpoints in one block to reduce memory fragmentation // note: they will be freed in reverse order for (int i = 0; i < checkpoints.size(); ++i) { - if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { + if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { ggml_allocr_alloc(alloc, checkpoints[i]); } } From d487e0531f577fa8b8e671e9a59df77053df62f7 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 30 Aug 2023 15:21:10 +0200 Subject: [PATCH 132/235] move gradient checkpointing code into ggml, new API function: // build gradient checkpointing backward graph gb for gf using provided checkpoints // gb_tmp will contain original backward graph with rewritten backward process nodes, // but without the second forward pass nodes. GGML_API void ggml_build_backward_gradient_checkpointing( struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, struct ggml_cgraph * gb_tmp, struct ggml_tensor * * checkpoints, int n_checkpoints); --- examples/finetune/finetune.cpp | 173 --------------- .../train-text-from-scratch.cpp | 202 +----------------- ggml.c | 156 ++++++++++++-- ggml.h | 10 + 4 files changed, 154 insertions(+), 387 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index ea443708b1016..66ff2f8e02181 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -623,179 +623,6 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 GGML_ASSERT(tensor->ne[3] == ne3); } -static size_t hash(void * p) { - return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; -} - -static size_t hash_find(void * hash_table[], void * p) { - size_t h = hash(p); - - // linear probing - size_t i = h; - while (hash_table[i] != NULL && hash_table[i] != p) { - i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; - if (i == h) { - // visited all hash table entries -> not found - return GGML_GRAPH_HASHTABLE_SIZE; - } - } - return i; -} - -static bool hash_insert(void * hash_table[], void * p) { - size_t i = hash_find(hash_table, p); - - GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - - if (hash_table[i] == p) { - return true; - } - - // insert - GGML_ASSERT(hash_table[i] == NULL); - hash_table[i] = p; - return false; -} - -static bool hash_contains(void * hash_table[], void * p) { - size_t i = hash_find(hash_table, p); - return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); -} - -struct hash_map { - void * keys[GGML_GRAPH_HASHTABLE_SIZE]; - void * vals[GGML_GRAPH_HASHTABLE_SIZE]; -}; - -struct hash_map * new_hash_map() { - struct hash_map * result = new struct hash_map; - for (int i=0; ikeys[i] = NULL; - result->vals[i] = 
NULL; - } - return result; -}; - -void free_hash_map(struct hash_map * map) { - delete map; -} - -struct ggml_tensor * ggml_recompute_graph_node( - struct ggml_context * ctx, - struct ggml_cgraph * graph, - struct hash_map * replacements, - struct ggml_tensor * node) { - - if (node == NULL) { - return NULL; - } - - if (node->is_param) { - return node; - } - - if (!hash_contains(graph->visited_hash_table, node)) { - return node; - } - - int count_children = 0; - for (int k = 0; k < GGML_MAX_SRC; ++k) { - if (node->src[k]) { - ++count_children; - } - } - - if (count_children == 0) { - return node; - } - - size_t i = hash_find(replacements->keys, node); - GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - if (replacements->keys[i] == node) { - return (struct ggml_tensor *) replacements->vals[i]; - } - - struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); - - // insert clone into replacements - GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite - replacements->keys[i] = node; - replacements->vals[i] = clone; - - clone->op = node->op; - clone->grad = node->grad; - clone->is_param = node->is_param; - clone->extra = node->extra; - for (int k = 0; k < GGML_MAX_DIMS; ++k) { - clone->nb[k] = node->nb[k]; - } - for (int k = 0; k < GGML_MAX_SRC; ++k) { - clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); - } - if (node->view_src != NULL) { - // GGML_ASSERT(node->view_src->data != NULL); - clone->data = (node->view_src->data == NULL) - ? NULL // view_src not yet allocated - : (char *) node->view_src->data // view_src already allocated - + node->view_offs; - clone->view_src = node->view_src; - clone->view_offs = node->view_offs; - } - - GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); - GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); - memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); - ggml_format_name(clone, "%s (clone)", ggml_get_name(node)); - - return clone; -}; - -void ggml_build_backward_gradient_checkpointing( - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_cgraph * gb_tmp, - struct ggml_tensor * * checkpoints, - int n_checkpoints) { - *gb_tmp = *gf; - ggml_build_backward_expand(ctx, gf, gb_tmp, true); - - if (n_checkpoints <= 0) { - *gb = *gb_tmp; - return; - } - - struct hash_map * replacements = new_hash_map(); - - // insert checkpoints in replacements - for (int i = 0; i < n_checkpoints; ++i) { - size_t k = hash_find(replacements->keys, checkpoints[i]); - GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite - replacements->keys[k] = checkpoints[i]; - replacements->vals[k] = checkpoints[i]; - } - - *gb = *gf; - // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], - // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), - // by recomputing them from checkpoints - for (int i = gf->n_nodes; in_nodes; ++i) { - struct ggml_tensor * node = gb_tmp->nodes[i]; - for (int k = 0; k < GGML_MAX_SRC; ++k) { - // insert new tensors recomputing src, reusing already made replacements, - // remember replacements: remember new tensors with mapping from corresponding gf nodes - // recurse for input tensors, - // unless (i.e. 
terminating when) input tensors are checkpoints - node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]); - } - // insert rewritten backward node with replacements made into resulting backward graph gb - ggml_build_forward_expand(gb, node); - } - - free_hash_map(replacements); -} - struct ggml_tensor * llama_build_lora_finetune_graphs( struct my_llama_model * model, struct my_llama_lora * lora, diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 020440a62ec1d..f907a7402197b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -451,204 +451,6 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 GGML_ASSERT(tensor->ne[3] == ne3); } -static size_t hash(void * p) { - return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; -} - -static size_t hash_find(void * hash_table[], void * p) { - size_t h = hash(p); - - // linear probing - size_t i = h; - while (hash_table[i] != NULL && hash_table[i] != p) { - i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; - if (i == h) { - // visited all hash table entries -> not found - return GGML_GRAPH_HASHTABLE_SIZE; - } - } - return i; -} - -static bool hash_insert(void * hash_table[], void * p) { - //size_t h = hash(p); - size_t i = hash_find(hash_table, p); - - GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - - if (hash_table[i] == p) { - return true; - } - - // insert - GGML_ASSERT(hash_table[i] == NULL); - hash_table[i] = p; - return false; -} - -static bool hash_contains(void * hash_table[], void * p) { - size_t i = hash_find(hash_table, p); - return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); -} - -struct hash_map { - void * keys[GGML_GRAPH_HASHTABLE_SIZE]; - void * vals[GGML_GRAPH_HASHTABLE_SIZE]; -}; -//static const size_t HASH_MAP_SIZE = sizeof(struct hash_map); - -struct hash_map * new_hash_map() { - struct hash_map * result = new struct hash_map; - for (int i=0; ikeys[i] = NULL; - result->vals[i] = NULL; - } - return result; -}; - -void free_hash_map(struct hash_map * map) { - delete map; -} - -static bool ggml_is_view(struct ggml_tensor * t) { - return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || - t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; -} - -static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { - switch (t->op) { - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - return t->src[0]; - case GGML_OP_CPY: - return t->src[1]; - default: - return NULL; - } -} - -static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { - struct ggml_tensor * parent = t; - do { - parent = get_view_parent(parent); - } while (ggml_is_view(parent)); - return parent; -} - -struct ggml_tensor * ggml_recompute_graph_node( - struct ggml_context * ctx, - struct ggml_cgraph * graph, - struct hash_map * replacements, - struct ggml_tensor * node) { - - if (node == NULL) { - return NULL; - } - - if (node->is_param) { - return node; - } - - if (!hash_contains(graph->visited_hash_table, node)) { - return node; - } - - int count_children = 0; - for (int k = 0; k < GGML_MAX_SRC; ++k) { - if (node->src[k]) { - ++count_children; - } - } - - if (count_children == 0) { - return node; - } - - size_t i = hash_find(replacements->keys, node); - GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - if 
(replacements->keys[i] == node) { - return (struct ggml_tensor *) replacements->vals[i]; - } - - struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); - - // insert clone into replacements - GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite - replacements->keys[i] = node; - replacements->vals[i] = clone; - - clone->op = node->op; - clone->grad = node->grad; - clone->is_param = node->is_param; - clone->extra = node->extra; - for (int k = 0; k < GGML_MAX_DIMS; ++k) { - clone->nb[k] = node->nb[k]; - } - for (int k = 0; k < GGML_MAX_SRC; ++k) { - clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); - } - if (ggml_is_view(clone)) { - struct ggml_tensor * source = get_view_source(clone); - GGML_ASSERT(source != NULL); - clone->data = source->data; - } - - GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); - GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); - memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); - ggml_format_name(clone, "%s (clone)", ggml_get_name(node)); - - return clone; -}; - -void ggml_build_backward_gradient_checkpointing( - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_cgraph * gb_tmp, - struct ggml_tensor * * checkpoints, - int n_checkpoints) { - *gb_tmp = *gf; - ggml_build_backward_expand(ctx, gf, gb_tmp, true); - - if (n_checkpoints <= 0) { - *gb = *gb_tmp; - return; - } - - struct hash_map * replacements = new_hash_map(); - - // insert checkpoints in replacements - for (int i = 0; i < n_checkpoints; ++i) { - size_t k = hash_find(replacements->keys, checkpoints[i]); - GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite - replacements->keys[k] = checkpoints[i]; - replacements->vals[k] = checkpoints[i]; - } - - *gb = *gf; - // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], - // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), - // by recomputing them from checkpoints - for (int i = gf->n_nodes; in_nodes; ++i) { - struct ggml_tensor * node = gb_tmp->nodes[i]; - for (int k = 0; k < GGML_MAX_SRC; ++k) { - // insert new tensors recomputing src, reusing already made replacements, - // remember replacements: remember new tensors with mapping from corresponding gf nodes - // recurse for input tensors, - // unless (i.e. 
terminating when) input tensors are checkpoints - node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]); - } - // insert rewritten backward node with replacements made into resulting backward graph gb - ggml_build_forward_expand(gb, node); - } - - free_hash_map(replacements); -} - struct ggml_tensor * llama_build_train_graphs( struct my_llama_model * model, struct ggml_allocr * alloc, @@ -794,13 +596,13 @@ struct ggml_tensor * llama_build_train_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); // input gradient ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); - GGML_ASSERT(t36->grad->data == NULL && !ggml_is_view(t36->grad)); + GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); ggml_allocr_alloc(alloc, t36->grad); // allocating checkpoints in one block to reduce memory fragmentation // note: they will be freed in reverse order for (int i = 0; i < (int) checkpoints.size(); ++i) { - if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { + if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { ggml_allocr_alloc(alloc, checkpoints[i]); } } diff --git a/ggml.c b/ggml.c index 00b1a37508519..73d0f337b2f1c 100644 --- a/ggml.c +++ b/ggml.c @@ -16174,7 +16174,7 @@ static size_t hash(void * p) { return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; } -static bool hash_insert(void * hash_table[], void * p) { +static size_t hash_find(void * hash_table[], void * p) { size_t h = hash(p); // linear probing @@ -16182,38 +16182,166 @@ static bool hash_insert(void * hash_table[], void * p) { while (hash_table[i] != NULL && hash_table[i] != p) { i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; if (i == h) { - // hash table is full - GGML_ASSERT(false); + // visited all hash table entries -> not found + return GGML_GRAPH_HASHTABLE_SIZE; } } + return i; +} +static bool hash_insert(void * hash_table[], void * p) { + size_t i = hash_find(hash_table, p); + + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + if (hash_table[i] == p) { return true; } // insert + GGML_ASSERT(hash_table[i] == NULL); hash_table[i] = p; return false; } static bool hash_contains(void * hash_table[], void * p) { - size_t h = hash(p); + size_t i = hash_find(hash_table, p); + return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); +} - // linear probing - size_t i = h; - while (hash_table[i] != NULL && hash_table[i] != p) { - i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; - if (i == h) { - // hash table is full - return false; +struct hash_map { + void * keys[GGML_GRAPH_HASHTABLE_SIZE]; + void * vals[GGML_GRAPH_HASHTABLE_SIZE]; +}; + +struct hash_map * new_hash_map() { + struct hash_map * result = malloc(sizeof(struct hash_map)); + for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) { result->keys[i] = NULL; + result->vals[i] = NULL; + } + return result; +}; + +void free_hash_map(struct hash_map * map) { + free(map); +} + +// gradient checkpointing + +static struct ggml_tensor * ggml_recompute_graph_node( + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct hash_map * replacements, + struct ggml_tensor * node) { + + if (node == NULL) { + return NULL; + } + + if (node->is_param) { + return node; + } + + if (!hash_contains(graph->visited_hash_table, node)) { + return node; + } + + int count_children = 0; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + if (node->src[k]) { + ++count_children; } } - if (hash_table[i] == p) { - return true; + if (count_children == 0) { + return node; } - return false; + size_t i = hash_find(replacements->keys, node); + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + if (replacements->keys[i] == node) { + return (struct ggml_tensor *) replacements->vals[i]; + } + + struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); + + // insert clone into replacements + GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite + replacements->keys[i] = node; + replacements->vals[i] = clone; + + clone->op = node->op; + clone->grad = node->grad; + clone->is_param = node->is_param; + clone->extra = node->extra; + for (int k = 0; k < GGML_MAX_DIMS; ++k) { + clone->nb[k] = node->nb[k]; + } + for (int k = 0; k < GGML_MAX_SRC; ++k) { + clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); + } + if (node->view_src != NULL) { + clone->data = (node->view_src->data == NULL) + ? NULL // view_src not yet allocated + : (char *) node->view_src->data // view_src already allocated + + node->view_offs; + clone->view_src = node->view_src; + clone->view_offs = node->view_offs; + } + + GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); + GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); + memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); + ggml_format_name(clone, "%s (clone)", ggml_get_name(node)); + + return clone; +}; + +void ggml_build_backward_gradient_checkpointing( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * checkpoints, + int n_checkpoints) { + *gb_tmp = *gf; + ggml_build_backward_expand(ctx, gf, gb_tmp, true); + + if (n_checkpoints <= 0) { + *gb = *gb_tmp; + return; + } + + struct hash_map * replacements = new_hash_map(); + + // insert checkpoints in replacements + for (int i = 0; i < n_checkpoints; ++i) { + size_t k = hash_find(replacements->keys, checkpoints[i]); + GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite + replacements->keys[k] = checkpoints[i]; + replacements->vals[k] = checkpoints[i]; + } + + *gb = *gf; + // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], + // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), + // by recomputing them from checkpoints + for (int i = gf->n_nodes; i<gb_tmp->n_nodes; ++i) { + struct ggml_tensor * node = gb_tmp->nodes[i]; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + // insert new tensors recomputing src, reusing already made replacements, + // remember replacements: remember new tensors with mapping from corresponding gf nodes + // recurse for input tensors, + // unless (i.e. terminating when) input tensors are replacements (like checkpoints) + node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]); + } + // insert rewritten backward node with replacements made into resulting backward graph gb + ggml_build_forward_expand(gb, node); + } + + free_hash_map(replacements); } // functions to change gradients considering the case that input a might be initial gradient with zero value diff --git a/ggml.h b/ggml.h index db6ccf9e2de5b..b90809bbe8fd1 100644 --- a/ggml.h +++ b/ggml.h @@ -1664,6 +1664,16 @@ extern "C" { // dump the graph into a file using the dot format GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); + // build gradient checkpointing backward graph gb for gf using provided checkpoints + // gb_tmp will contain original backward graph with rewritten backward process nodes, + // but without the second forward pass nodes. + GGML_API void ggml_build_backward_gradient_checkpointing( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * checkpoints, + int n_checkpoints); // // optimization // From e6b71581235d1715fed0bb17ff0bdea1ef9f158f Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 30 Aug 2023 15:19:26 +0200 Subject: [PATCH 133/235] replace custom data getters and setters by ggml functions --- examples/finetune/finetune.cpp | 37 ++---------- .../train-text-from-scratch.cpp | 56 ++----------------- 2 files changed, 12 insertions(+), 81 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 66ff2f8e02181..9da63d8c92876 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -854,31 +854,6 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( return t36; } -void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *ptr = value; -} - -void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { - 
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *ptr = value; -} - -void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *ptr = value; -} - -float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - -int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { - int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - return *ptr; -} - -void print_row(struct ggml_tensor * probs, int i) { - for (int k = 0; k < probs->ne[0]; ++k) { - float p = get_f32_2d(probs, k, i); - printf(" %.2f", p); - } - printf("\n"); -} - -void print_matrix(struct ggml_tensor * probs) { - assert(probs->n_dims == 2); - for (int i = 0; i < probs->ne[1]; ++i) { - for (int k = 0; k < probs->ne[0]; ++k) { - float p = get_f32_2d(probs, k, i); - printf(" %.2f", p); - } - printf("\n"); - } -} - void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { int n_tokens = tokens_input->ne[0]; int n_vocab = target_logits->ne[0]; @@ -683,8 +639,8 @@ void get_example_targets(struct llama_context * lctx, const int * train_samples, ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx)); for (int i=1; i Date: Wed, 30 Aug 2023 15:57:17 +0200 Subject: [PATCH 134/235] train-text-from-scratch can train (full finetune) gguf models just pass the gguf model via `--checkpoint-in FN`. after this, to continue training, pass the generated checkpoint instead of the original gguf model. tested with smaller models, bigger models may exceed available memory. use (LORA) finetune for those. 
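a rough sketch of the intended workflow (the model and checkpoint file names below are placeholders, not files shipped with this repo):

  # first run: start full finetuning directly from a gguf model
  ./bin/train-text-from-scratch \
      --vocab-model ../models/ggml-vocab-llama.gguf \
      --checkpoint-in  base-model.gguf \
      --checkpoint-out chk-base-model.gguf \
      --model-out ggml-base-model-trained-f32.gguf \
      --train-data "shakespeare.txt" \
      -t 6 -b 16 --adam-iter 16

  # later runs: continue by passing the generated checkpoint instead of the original gguf model
  ./bin/train-text-from-scratch \
      --vocab-model ../models/ggml-vocab-llama.gguf \
      --checkpoint-in  chk-base-model.gguf \
      --checkpoint-out chk-base-model.gguf \
      --model-out ggml-base-model-trained-f32.gguf \
      --train-data "shakespeare.txt" \
      -t 6 -b 16 --adam-iter 16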
--- .../train-text-from-scratch.cpp | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index aaae5e5768a0e..76cf501a5bd5d 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1188,19 +1188,23 @@ void save_llama_model_file(const char * filename, const char * fn_vocab_model, s void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct ggml_opt_context * opt) { load_llama_model_gguf(fctx, f_ggml_ctx, model); - uint32_t file_version; - GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); - GGML_ASSERT(file_version == 0); + if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) >= 0) { + uint32_t file_version = 0xFFFFFFFFu; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); + GGML_ASSERT(file_version == 0); - std::string train_type = LLM_KV_TRAINING_TYPE_TRAIN_MODEL; - GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); - GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_TRAIN_MODEL); + std::string train_type = LLM_KV_TRAINING_TYPE_TRAIN_MODEL; + GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); + GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_TRAIN_MODEL); - GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); - GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); - GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); + GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); - load_opt_context_gguf(fctx, f_ggml_ctx, opt); + load_opt_context_gguf(fctx, f_ggml_ctx, opt); + } else { + printf("%s: loaded llama model as checkpoint\n", __func__); + } } void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { From f3590ad8d97f712c3752921dc42d62adea522a41 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 30 Aug 2023 16:01:08 +0200 Subject: [PATCH 135/235] remove trailing whitespace --- ggml.c | 4 ++-- ggml.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index 73d0f337b2f1c..77abf7c8fabb4 100644 --- a/ggml.c +++ b/ggml.c @@ -16191,9 +16191,9 @@ static size_t hash_find(void * hash_table[], void * p) { static bool hash_insert(void * hash_table[], void * p) { size_t i = hash_find(hash_table, p); - + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - + if (hash_table[i] == p) { return true; } diff --git a/ggml.h b/ggml.h index b90809bbe8fd1..0ab8cedeabff8 100644 --- a/ggml.h +++ b/ggml.h @@ -1665,7 +1665,7 @@ extern "C" { GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); // build gradient checkpointing backward graph gb for gf 
using provided checkpoints - // gb_tmp will contain original backward graph with rewritten backward process nodes, + // gb_tmp will contain original backward graph with rewritten backward process nodes, // but without the second forward pass nodes. GGML_API void ggml_build_backward_gradient_checkpointing( struct ggml_context * ctx, From b26bd4c34ce1811d737d7f58c3f023f5e2b5fc73 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 30 Aug 2023 16:26:05 +0200 Subject: [PATCH 136/235] add option to save train-text-from-scratch output every N iterations --- .../train-text-from-scratch.cpp | 89 +++++++++++++++++-- 1 file changed, 83 insertions(+), 6 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 76cf501a5bd5d..2a9cf9c7c39b0 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -793,6 +793,15 @@ void shuffle_ints(int * begin, int * end) { }); } +std::string replace_str(const char * s, const char * needle, const char * replacement) { + std::string str = s; + size_t pos = str.find(needle); + if (pos != std::string::npos) { + str.replace(pos, strlen(needle), replacement); + } + return str; +} + #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ { \ const std::string skey(key); \ @@ -1174,14 +1183,17 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod } } -void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) { +void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, const char * pattern_it, int iteration, const char * latest) { + std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest); + std::string fn = replace_str(filename, pattern_it, sit.c_str()); + printf("%s: saving to %s\n", __func__, fn.c_str()); struct gguf_context * fctx = gguf_init_empty(); save_llama_model_gguf(fctx, fn_vocab_model, model); // write file const bool only_meta = false; - gguf_write_to_file(fctx, filename, only_meta); + gguf_write_to_file(fctx, fn.c_str(), only_meta); gguf_free(fctx); } @@ -1234,14 +1246,17 @@ bool load_checkpoint_file(const char * filename, struct my_llama_model * model, return true; } -void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { +void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt, const char * pattern_it, int iteration, const char * latest) { + std::string sit = (iteration >= 0) ? 
std::to_string(iteration) : std::string(latest); + std::string fn = replace_str(filename, pattern_it, sit.c_str()); + printf("%s: saving to %s\n", __func__, fn.c_str()); struct gguf_context * fctx = gguf_init_empty(); save_checkpoint_gguf(fctx, fn_vocab_model, model, opt); // write file const bool only_meta = false; - gguf_write_to_file(fctx, filename, only_meta); + gguf_write_to_file(fctx, fn.c_str(), only_meta); gguf_free(fctx); } @@ -1270,6 +1285,10 @@ struct train_params { const char * fn_checkpoint_in; const char * fn_checkpoint_out; const char * fn_model_out; + const char * pattern_fn_it; + const char * fn_latest; + + int save_every; uint32_t seed; @@ -1329,6 +1348,10 @@ struct train_params get_default_train_params() { params.fn_checkpoint_in = "checkpoint.bin"; params.fn_checkpoint_out = "checkpoint.bin"; params.fn_model_out = "ggml-checkpoint-f32.bin"; + params.pattern_fn_it = "ITERATION"; + params.fn_latest = "LATEST"; + + params.save_every = 10; params.seed = -1; @@ -1392,6 +1415,9 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); + fprintf(stderr, " --pattern-fn-it STR pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it); + fprintf(stderr, " --fn-latest STR string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest); + fprintf(stderr, " --save-every N save checkpoint and lora every N iterations. Disabled when N <= 0. 
(default '%d')\n", params->save_every); fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); @@ -1481,6 +1507,24 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->fn_model_out = argv[i]; + } else if (arg == "--pattern-fn-it") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->pattern_fn_it = argv[i]; + } else if (arg == "--fn-latest") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_latest = argv[i]; + } else if (arg == "--save-every") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->save_every = std::stoi(argv[i]); } else if (arg == "-s" || arg == "--seed") { if (++i >= argc) { invalid_param = true; @@ -1722,7 +1766,9 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { struct opt_callback_data { struct train_params * params; struct ggml_opt_context * opt; + struct my_llama_model * model; struct llama_context * lctx; + int last_save_iter; llama_token * tokens_data; size_t tokens_size; int * samples_data; @@ -1738,6 +1784,26 @@ void opt_callback(void * vdata, float * sched) { struct train_params * params = data->params; struct ggml_opt_context * opt = data->opt; int n_batch = params->n_batch; + int n_ctx = params->n_ctx; + + const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); + if (save_now) { + int new_iters = opt->iter - data->last_save_iter; + data->model->train_its += new_iters; + data->model->train_samples += new_iters * n_batch; + data->model->train_tokens += new_iters * n_batch * n_ctx; + + if (strlen(params->fn_checkpoint_out) > 0) { + save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, opt, params->pattern_fn_it, opt->iter, params->fn_latest); + save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, opt, params->pattern_fn_it, -1, params->fn_latest); + + } + if (strlen(params->fn_model_out) > 0) { + save_llama_model_file(params->fn_model_out, params->fn_vocab_model, data->model, params->pattern_fn_it, opt->iter, params->fn_latest); + save_llama_model_file(params->fn_model_out, params->fn_vocab_model, data->model, params->pattern_fn_it, -1, params->fn_latest); + } + data->last_save_iter = opt->iter; + } *sched = (opt->iter < params->warmup) ? 
(float) opt->iter / (float) params->warmup @@ -1929,7 +1995,9 @@ int main(int argc, char ** argv) { struct opt_callback_data opt_cb_data; opt_cb_data.params = &params; opt_cb_data.opt = opt; + opt_cb_data.model = &model; opt_cb_data.lctx = lctx; + opt_cb_data.last_save_iter = opt->iter; opt_cb_data.tokens_data = train_tokens.data(); opt_cb_data.tokens_size = train_tokens.size(); opt_cb_data.samples_data = train_samples.data(); @@ -2038,14 +2106,23 @@ int main(int argc, char ** argv) { double dd = (double) d * 1e-3; printf("%s: total training time=%f seconds\n", __func__, dd); + int new_iters = opt->iter - opt_cb_data.last_save_iter; + model.train_its += new_iters; + model.train_samples += new_iters * n_batch; + model.train_tokens += new_iters * n_batch * n_tokens; + if (params.n_examples > 0) { - save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt); + save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt, params.pattern_fn_it, opt->iter, params.fn_latest); + save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt, params.pattern_fn_it, -1, params.fn_latest); } if (strlen(params.fn_model_out) > 0) { - save_llama_model_file(params.fn_model_out, params.fn_vocab_model, &model); + save_llama_model_file(params.fn_model_out, params.fn_vocab_model, &model, params.pattern_fn_it, opt->iter, params.fn_latest); + save_llama_model_file(params.fn_model_out, params.fn_vocab_model, &model, params.pattern_fn_it, -1, params.fn_latest); } + opt_cb_data.last_save_iter = opt->iter; + if (alloc) { ggml_allocr_free(alloc); } From 4e986ac4bc98a25be15e28f5932e54af2082b972 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 30 Aug 2023 16:29:09 +0200 Subject: [PATCH 137/235] update README.md --- examples/train-text-from-scratch/README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md index f4ffcd9876c0c..1b3454069e9a3 100644 --- a/examples/train-text-from-scratch/README.md +++ b/examples/train-text-from-scratch/README.md @@ -10,9 +10,9 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s ./bin/train-text-from-scratch \ --vocab-model ../models/ggml-vocab-llama.gguf \ --ctx 64 --embd 256 --head 8 --layer 16 \ - --checkpoint-in chk-shakespeare-256x16.gguf \ - --checkpoint-out chk-shakespeare-256x16.gguf \ - --model-out ggml-shakespeare-256x16-f32.gguf \ + --checkpoint-in chk-shakespeare-256x16-LATEST.gguf \ + --checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \ + --model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \ --train-data "shakespeare.txt" \ -t 6 -b 16 --seed 1 --adam-iter 256 \ --no-checkpointing @@ -20,3 +20,8 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s # predict ./bin/main -m ggml-shakespeare-256x16-f32-LATEST.gguf ``` + +Output files will be saved every N iterations (configurable with `--save-every N`). +The pattern "ITERATION" in the output filenames will be replaced with the iteration number, and with "LATEST" for the latest output. + +To continue training from a GGUF checkpoint, just pass it to `--checkpoint-in FNAME`.
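For illustration, the ITERATION/LATEST substitution performed by `save_checkpoint_file` and `save_llama_model_file` above can be sketched as follows. This is a minimal standalone sketch: `make_output_filename` is a hypothetical stand-in for the `replace_str` helper used in train-text-from-scratch.cpp, and it replaces only the first occurrence of the pattern.

```cpp
#include <cstdio>
#include <string>

// Hypothetical helper mirroring the save logic above: iteration >= 0 yields
// the iteration number; iteration < 0 (the -1 calls) yields the "LATEST" name.
static std::string make_output_filename(const std::string & filename,
                                        const std::string & pattern_it,
                                        int iteration,
                                        const std::string & latest) {
    const std::string sit = (iteration >= 0) ? std::to_string(iteration) : latest;
    std::string fn = filename;
    const size_t pos = fn.find(pattern_it);
    if (pos != std::string::npos) {
        fn.replace(pos, pattern_it.size(), sit);
    }
    return fn;
}

int main() {
    // with --checkpoint-out chk-shakespeare-256x16-ITERATION.gguf and --save-every 10:
    printf("%s\n", make_output_filename("chk-shakespeare-256x16-ITERATION.gguf", "ITERATION", 10, "LATEST").c_str());
    // -> chk-shakespeare-256x16-10.gguf
    printf("%s\n", make_output_filename("chk-shakespeare-256x16-ITERATION.gguf", "ITERATION", -1, "LATEST").c_str());
    // -> chk-shakespeare-256x16-LATEST.gguf
    return 0;
}
```

Each save writes the file twice, once under the iteration number and once under the "LATEST" name, which is why the patch above calls each save function with `opt->iter` and then again with `-1`.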
From 0c57f9f0b312146b153858be78918c37b3e16f42 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 30 Aug 2023 16:55:49 +0200 Subject: [PATCH 138/235] fix warnings --- common/common.cpp | 2 +- examples/finetune/finetune.cpp | 16 ++++------------ ggml.c | 10 ++++------ 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 8503da88abe3a..e2faeee0dae87 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -724,7 +724,7 @@ std::tuple llama_init_from_gpt_par return std::make_tuple(nullptr, nullptr); } - for (int i = 0; i < params.lora_adapter.size(); ++i) { + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); int err = llama_model_apply_lora_from_file(model, diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 9da63d8c92876..b092bd6123c5f 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -406,18 +406,13 @@ void init_model(struct llama_model * input, struct my_llama_model * model, uint3 hparams.n_layer = llama_model_n_layer(input); hparams.n_rot = llama_model_n_rot(input); - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_vocab = hparams.n_vocab; - const uint32_t n_ff = hparams.n_ff; - model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD)); model->norm = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM)); model->output = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT)); - model->layers.resize(n_layer); + model->layers.resize(hparams.n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer; ++i) { auto & layer = model->layers[i]; layer.attention_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_NORM, i)); @@ -654,7 +649,7 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( const float rope_freq_base = lora->hparams.rope_freq_base; const float rope_freq_scale = lora->hparams.rope_freq_scale; - GGML_ASSERT(n_layer == lora->layers.size()); + GGML_ASSERT((size_t) n_layer == lora->layers.size()); auto set_name = [](struct ggml_tensor * t, const char * n) { ggml_set_name(t, n); @@ -828,15 +823,12 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( // allocating checkpoints in one block to reduce memory fragmentation // note: they will be freed in reverse order - for (int i = 0; i < checkpoints.size(); ++i) { + for (unsigned int i = 0; i < checkpoints.size(); ++i) { if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { ggml_allocr_alloc(alloc, checkpoints[i]); } } - int n_leafs_after = gb->n_leafs; - int n_nodes_after = gb->n_nodes; - ggml_allocr_alloc_graph(alloc, gb); // remove the additional nodes and leafs diff --git a/ggml.c b/ggml.c index 77abf7c8fabb4..8ade339579412 100644 --- a/ggml.c +++ b/ggml.c @@ -4851,7 +4851,6 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { } void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) { - const int64_t ne3 = tensor->ne[3]; const int64_t ne2 = tensor->ne[2]; const int64_t ne1 = tensor->ne[1]; const int64_t ne0 = tensor->ne[0]; @@ -16214,16 +16213,16 @@ struct hash_map { void * vals[GGML_GRAPH_HASHTABLE_SIZE]; }; -struct hash_map * new_hash_map() { +static struct hash_map * new_hash_map(void) { struct hash_map * result = malloc(sizeof(struct 
hash_map)); for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) { result->keys[i] = NULL; result->vals[i] = NULL; } return result; -}; +} -void free_hash_map(struct hash_map * map) { +static void free_hash_map(struct hash_map * map) { free(map); } @@ -19176,7 +19175,6 @@ static enum ggml_opt_result linesearch_backtracking( float * step, const float * xp, struct ggml_tensor * f, - struct ggml_cgraph * gf, struct ggml_cgraph * gb, struct ggml_cplan * cplan, const int np, @@ -19421,7 +19419,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data); + ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, callback, callback_data); if (ls < 0) { // linesearch failed - go back to the previous point and return From 4fd51c461660029950fc9df0835cf04c843d2f65 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 30 Aug 2023 17:12:23 +0200 Subject: [PATCH 139/235] fix warnings --- examples/finetune/finetune.cpp | 2 +- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index b092bd6123c5f..df48da8779c63 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -699,7 +699,7 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( checkpoints.push_back(t00); checkpoints.push_back(t01); - struct ggml_tensor * kv_scale; + struct ggml_tensor * kv_scale = NULL; if (!enable_flash_attn) { kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); } diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 2a9cf9c7c39b0..f31427a9984ca 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -515,7 +515,7 @@ struct ggml_tensor * llama_build_train_graphs( checkpoints.push_back(t00); checkpoints.push_back(t01); - struct ggml_tensor * kv_scale; + struct ggml_tensor * kv_scale = NULL; if (!enable_flash_attn) { kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); } From e0da1684db3ea8b51e7be808d1bd7756abde9f94 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 31 Aug 2023 16:45:47 +0200 Subject: [PATCH 140/235] remove finetune option to disable allocator the allocator should always be used.
by making sure that it is always used it gets easier to implement automatic memory requirements computation --- examples/finetune/finetune.cpp | 121 +++++++++++++++------------------ 1 file changed, 53 insertions(+), 68 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index df48da8779c63..d2451bdcac9d0 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -791,56 +791,56 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( ggml_build_backward_expand(ctx, gf, gb, true); } - if (alloc) { - // make sure some tensors are not reallocated by inserting new temporary nodes depending on them - int n_leafs_before = gb->n_leafs; - int n_nodes_before = gb->n_nodes; - struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); - // output tensors - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); - // input gradient - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); - GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); - ggml_allocr_alloc(alloc, t36->grad); - - // make sure base model tensors data cannot be used in viewable operations - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one)); - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one)); - } + GGML_ASSERT(alloc != NULL); + + // make sure some tensors are not reallocated by inserting new temporary nodes depending on them + int n_leafs_before = gb->n_leafs; + int n_nodes_before = gb->n_nodes; + struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); + // output tensors + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + // input gradient + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); + GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); + ggml_allocr_alloc(alloc, t36->grad); + + // make sure base model tensors data cannot be used in viewable operations + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one)); + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one)); + ggml_build_forward_expand(gb, 
ggml_scale_inplace(ctx, layer.wk, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one)); + } - // allocating checkpoints in one block to reduce memory fragmentation - // note: they will be freed in reverse order - for (unsigned int i = 0; i < checkpoints.size(); ++i) { - if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { - ggml_allocr_alloc(alloc, checkpoints[i]); - } + // allocating checkpoints in one block to reduce memory fragmentation + // note: they will be freed in reverse order + for (unsigned int i = 0; i < checkpoints.size(); ++i) { + if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { + ggml_allocr_alloc(alloc, checkpoints[i]); } + } - ggml_allocr_alloc_graph(alloc, gb); + ggml_allocr_alloc_graph(alloc, gb); - // remove the additional nodes and leafs - for (int i = n_leafs_before; i < gb->n_leafs; ++i) { - gb->leafs[i] = NULL; - } - for (int i = n_nodes_before; i < gb->n_nodes; ++i) { - gb->nodes[i] = NULL; - } - gb->n_leafs = n_leafs_before; - gb->n_nodes = n_nodes_before; + // remove the additional nodes and leafs + for (int i = n_leafs_before; i < gb->n_leafs; ++i) { + gb->leafs[i] = NULL; + } + for (int i = n_nodes_before; i < gb->n_nodes; ++i) { + gb->nodes[i] = NULL; } + gb->n_leafs = n_leafs_before; + gb->n_nodes = n_nodes_before; *logits = t35; return t36; @@ -1596,7 +1596,6 @@ struct train_params { bool use_adam; bool use_flash; bool use_checkpointing; - bool use_alloc; // only adam int warmup; @@ -1670,7 +1669,6 @@ struct train_params get_default_train_params() { params.use_adam = true; params.use_flash = true; params.use_checkpointing = true; - params.use_alloc = true; params.opt_past = 0; params.opt_delta = 1e-5f; @@ -1982,10 +1980,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_checkpointing = false; } else if (arg == "--use-checkpointing") { params->use_checkpointing = true; - } else if (arg == "--no-alloc") { - params->use_alloc = false; - } else if (arg == "--use-alloc") { - params->use_alloc = true; } else if (arg == "--warmup") { if (++i >= argc) { invalid_param = true; @@ -2346,11 +2340,8 @@ int main(int argc, char ** argv) { size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; - ggml_allocr * alloc = NULL; - if (params.use_alloc) { - static const size_t tensor_alignment = 32; - alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); - } + static const size_t tensor_alignment = 32; + ggml_allocr * alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); std::vector train_samples; if (params.n_examples > 0) { @@ -2405,15 +2396,13 @@ int main(int argc, char ** argv) { ggml_set_no_alloc(ctx0, false); // don't use alloc for input tensors, so we can safely fill them with data - struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); - struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, 
n_tokens, n_batch); + struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - ggml_set_no_alloc(ctx0, (alloc != NULL)); + ggml_set_no_alloc(ctx0, true); - if (alloc) { - ggml_allocr_reset(alloc); - } + ggml_allocr_reset(alloc); opt_cb_data.tokens_input = tokens_input; opt_cb_data.target_logits = target_logits; @@ -2461,7 +2450,6 @@ int main(int argc, char ** argv) { size_t used_mem_after_opt = ggml_used_mem(ctx0); - if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); printf("error_before_opt: %.6f\n", opt->loss_before); @@ -2495,10 +2483,7 @@ int main(int argc, char ** argv) { opt_cb_data.last_save_iter = opt->iter; - if (alloc) { - ggml_allocr_free(alloc); - } - + ggml_allocr_free(alloc); delete[] compute_addr; delete[] compute_buf_0; ggml_free(lora.ctx); From 4914f855c7dfcb23fc9fe98c9cd3329f7215c630 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 31 Aug 2023 16:46:21 +0200 Subject: [PATCH 141/235] add tensor checkpoints only when gradient checkpointing is enabled --- examples/finetune/finetune.cpp | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index d2451bdcac9d0..65501c3554aa2 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -694,10 +694,12 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * cur = t01; std::vector checkpoints; - checkpoints.push_back(tokens_input); - checkpoints.push_back(targets); - checkpoints.push_back(t00); - checkpoints.push_back(t01); + if (enable_checkpointing) { + checkpoints.push_back(tokens_input); + checkpoints.push_back(targets); + checkpoints.push_back(t00); + checkpoints.push_back(t01); + } struct ggml_tensor * kv_scale = NULL; if (!enable_flash_attn) { @@ -766,7 +768,9 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); cur = t30; - checkpoints.push_back(cur); + if (enable_checkpointing) { + checkpoints.push_back(cur); + } } struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); struct ggml_tensor * t32 = ggml_repeat (ctx, norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); @@ -775,12 +779,14 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); - checkpoints.push_back(t31); - checkpoints.push_back(t32); - checkpoints.push_back(t33); - checkpoints.push_back(t34); - checkpoints.push_back(t35); - checkpoints.push_back(t36); + if (enable_checkpointing) { + checkpoints.push_back(t31); + checkpoints.push_back(t32); + checkpoints.push_back(t33); + checkpoints.push_back(t34); + checkpoints.push_back(t35); + checkpoints.push_back(t36); + } ggml_build_forward_expand(gf, t36); From d554a70f1119824dcccdced7589f9868e2049fdc Mon Sep 17 00:00:00 2001 From: xaedes Date: 
Fri, 1 Sep 2023 15:41:57 +0200 Subject: [PATCH 142/235] initialize opt ggml context if none was provided --- ggml.c | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/ggml.c b/ggml.c index 8ade339579412..d24af8e035a0a 100644 --- a/ggml.c +++ b/ggml.c @@ -19606,13 +19606,31 @@ GGML_API void ggml_opt_init( opt->iter = 0; opt->nx = nx; opt->just_initialized = true; + if (opt->ctx == NULL) { + struct ggml_init_params ctx_opt_params; + if (opt->params.type == GGML_OPT_ADAM) { + ctx_opt_params.mem_size = GGML_MEM_ALIGN*2 + ggml_tensor_overhead()*2 + ggml_type_size(GGML_TYPE_F32)*nx*2; + if (opt->params.past > 0) { + ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; + } + } else if (opt->params.type == GGML_OPT_LBFGS) { + ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2); + if (opt->params.past > 0) { + ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; + } + } + ctx_opt_params.mem_buffer = NULL; + ctx_opt_params.no_alloc = false; + + opt->ctx = ggml_init(ctx_opt_params); + } switch (opt->params.type) { case GGML_OPT_ADAM: { - opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->adam.v = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->adam.pf = params.past > 0 - ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) + ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past) : NULL; ggml_set_zero(opt->adam.m); ggml_set_zero(opt->adam.v); @@ -19622,18 +19640,18 @@ GGML_API void ggml_opt_init( } break; case GGML_OPT_LBFGS: { - opt->lbfgs.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->lbfgs.xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->lbfgs.g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->lbfgs.gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->lbfgs.d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.gp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); + opt->lbfgs.d = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->lbfgs.pf = params.past > 0 - ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) + ? 
ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past) : NULL; - opt->lbfgs.lmal = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); - opt->lbfgs.lmys = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); - opt->lbfgs.lms = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); - opt->lbfgs.lmy = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmal = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lmys = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lms = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmy = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m); ggml_set_zero(opt->lbfgs.x); ggml_set_zero(opt->lbfgs.xp); ggml_set_zero(opt->lbfgs.g); From 7e01d11a28c80c1f4fa7701b107076a451bbcbd7 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 1 Sep 2023 15:42:40 +0200 Subject: [PATCH 143/235] add ggml-alloc API function 'ggml_allocr_max_size' to get max size of alloc GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc); --- ggml-alloc.c | 4 ++++ ggml-alloc.h | 1 + 2 files changed, 5 insertions(+) diff --git a/ggml-alloc.c b/ggml-alloc.c index b99f75e449213..ab34e70772dc5 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -554,3 +554,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n( size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) { return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); } + +size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { + return alloc->max_size; +} diff --git a/ggml-alloc.h b/ggml-alloc.h index 9559da75871a6..0c224f174f396 100644 --- a/ggml-alloc.h +++ b/ggml-alloc.h @@ -19,6 +19,7 @@ GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor); GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); +GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc); #ifdef __cplusplus From 5bba329e58be7bf60bfd3c9929ca4d9cdad5048b Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 1 Sep 2023 15:58:24 +0200 Subject: [PATCH 144/235] finetune: automatically allocate all memory and changes to command line options remove the '-n N, --examples N' parameter, as it no longer makes sense to call the optimization process multiple times in a loop. add '--only-write-lora' command line option: will skip tokenization and training, to only write a llama.cpp compatible LORA adapter. remove memory buffer related command line options. improve iteration console output.
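The automatic allocation in this patch follows ggml-alloc's measure-then-allocate pattern, building on the `ggml_allocr_max_size` API added in the previous commit: a measure allocator first records worst-case offsets, then a real allocator replays the same allocations into a buffer of exactly that size. A minimal sketch of the two-pass shape, assuming only the ggml-alloc calls visible in these patches (the `measure_and_allocate` helper and its two-tensor signature are illustrative, not part of the patch):

```cpp
#include <cstdint>
#include <vector>
#include "ggml.h"
#include "ggml-alloc.h"

static const size_t tensor_alignment = 32;

// Two-pass allocation: measure first, then allocate exactly the required size.
static size_t measure_and_allocate(std::vector<uint8_t> & buf,
                                   struct ggml_tensor * a,
                                   struct ggml_tensor * b) {
    // pass 1: the measure allocator only tracks offsets, no data is written
    ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
    ggml_allocr_alloc(alloc, a);
    ggml_allocr_alloc(alloc, b);
    const size_t max_size = ggml_allocr_max_size(alloc); // API added above
    ggml_allocr_free(alloc);

    // pass 2: back a real allocator with a buffer of the measured size
    // and replay the same allocations into it
    buf.resize(max_size);
    alloc = ggml_allocr_new(buf.data(), buf.size(), tensor_alignment);
    ggml_allocr_alloc(alloc, a);
    ggml_allocr_alloc(alloc, b);
    ggml_allocr_free(alloc);
    return max_size;
}
```

The same two-pass shape appears several times in the diff below: once for the lora parameter and gradient tensors, once for the input tensors, and once for the compute graphs, which is what makes the `--mem-*` command line options unnecessary.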
--- examples/finetune/finetune.cpp | 768 ++++++++++++++++++++------------- 1 file changed, 469 insertions(+), 299 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 65501c3554aa2..848e390f6f604 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -17,6 +17,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static const size_t tensor_alignment = 32; + struct random_normal_distribution { std::mt19937 gen; std::normal_distribution rd; @@ -255,6 +257,7 @@ struct my_llama_lora_layer { struct my_llama_lora { struct ggml_context * ctx = NULL; + std::vector data; my_llama_lora_hparams hparams; @@ -427,6 +430,42 @@ void init_model(struct llama_model * input, struct my_llama_model * model, uint3 } } +void set_param_lora(struct my_llama_lora * lora) { + const uint32_t n_layer = lora->layers.size(); + + struct ggml_context* ctx = lora->ctx; + + ggml_set_param(ctx, lora->tok_embeddings_a); + ggml_set_param(ctx, lora->tok_embeddings_b); + ggml_set_param(ctx, lora->norm_a); + ggml_set_param(ctx, lora->norm_b); + ggml_set_param(ctx, lora->output_a); + ggml_set_param(ctx, lora->output_b); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = lora->layers[i]; + + ggml_set_param(ctx, layer.attention_norm_a); + ggml_set_param(ctx, layer.attention_norm_b); + ggml_set_param(ctx, layer.wq_a); + ggml_set_param(ctx, layer.wq_b); + ggml_set_param(ctx, layer.wk_a); + ggml_set_param(ctx, layer.wk_b); + ggml_set_param(ctx, layer.wv_a); + ggml_set_param(ctx, layer.wv_b); + ggml_set_param(ctx, layer.wo_a); + ggml_set_param(ctx, layer.wo_b); + ggml_set_param(ctx, layer.ffn_norm_a); + ggml_set_param(ctx, layer.ffn_norm_b); + ggml_set_param(ctx, layer.w1_a); + ggml_set_param(ctx, layer.w1_b); + ggml_set_param(ctx, layer.w2_a); + ggml_set_param(ctx, layer.w2_b); + ggml_set_param(ctx, layer.w3_a); + ggml_set_param(ctx, layer.w3_b); + } +} + void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) { const auto & lparams = lora->hparams; @@ -435,8 +474,6 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) const uint32_t n_vocab = model->hparams.n_vocab; const uint32_t n_ff = model->hparams.n_ff; - struct ggml_context * ctx = lora->ctx; - lora->train_its = 0; lora->train_samples = 0; lora->train_tokens = 0; @@ -454,6 +491,15 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) return tn_buf.data(); }; + // context for lora tensors without their data + struct ggml_init_params ctx_lora_params; + ctx_lora_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18); + ctx_lora_params.mem_buffer = NULL; + ctx_lora_params.no_alloc = true; + + struct ggml_context * ctx = ggml_init(ctx_lora_params); + lora->ctx = ctx; + lora->tok_embeddings_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_embd); lora->tok_embeddings_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_vocab); lora->norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, n_embd); @@ -472,8 +518,6 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = lora->layers[i]; - std::string layers_i = "layers." 
+ std::to_string(i); - layer.attention_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, n_embd); layer.attention_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, 1); @@ -515,44 +559,130 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); } -} - -void set_param_lora(struct my_llama_lora * lora) { - const uint32_t n_layer = lora->layers.size(); - - struct ggml_context* ctx = lora->ctx; - - ggml_set_param(ctx, lora->tok_embeddings_a); - ggml_set_param(ctx, lora->tok_embeddings_b); - ggml_set_param(ctx, lora->norm_a); - ggml_set_param(ctx, lora->norm_b); - ggml_set_param(ctx, lora->output_a); - ggml_set_param(ctx, lora->output_b); + set_param_lora(lora); + + // measure data size + ggml_allocr * alloc = NULL; + alloc = ggml_allocr_new_measure(tensor_alignment); + ggml_allocr_alloc(alloc, lora->tok_embeddings_a); + ggml_allocr_alloc(alloc, lora->tok_embeddings_b); + ggml_allocr_alloc(alloc, lora->norm_a); + ggml_allocr_alloc(alloc, lora->norm_b); + ggml_allocr_alloc(alloc, lora->output_a); + ggml_allocr_alloc(alloc, lora->output_b); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = lora->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm_a); + ggml_allocr_alloc(alloc, layer.attention_norm_b); + ggml_allocr_alloc(alloc, layer.wq_a); + ggml_allocr_alloc(alloc, layer.wq_b); + ggml_allocr_alloc(alloc, layer.wk_a); + ggml_allocr_alloc(alloc, layer.wk_b); + ggml_allocr_alloc(alloc, layer.wv_a); + ggml_allocr_alloc(alloc, layer.wv_b); + ggml_allocr_alloc(alloc, layer.wo_a); + ggml_allocr_alloc(alloc, layer.wo_b); + ggml_allocr_alloc(alloc, layer.ffn_norm_a); + ggml_allocr_alloc(alloc, layer.ffn_norm_b); + ggml_allocr_alloc(alloc, layer.w1_a); + ggml_allocr_alloc(alloc, layer.w1_b); + ggml_allocr_alloc(alloc, layer.w2_a); + ggml_allocr_alloc(alloc, layer.w2_b); + ggml_allocr_alloc(alloc, layer.w3_a); + ggml_allocr_alloc(alloc, layer.w3_b); + } + ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad); + ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad); + ggml_allocr_alloc(alloc, lora->norm_a->grad); + ggml_allocr_alloc(alloc, lora->norm_b->grad); + ggml_allocr_alloc(alloc, lora->output_a->grad); + ggml_allocr_alloc(alloc, lora->output_b->grad); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = lora->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm_a->grad); + ggml_allocr_alloc(alloc, layer.attention_norm_b->grad); + ggml_allocr_alloc(alloc, layer.wq_a->grad); + ggml_allocr_alloc(alloc, layer.wq_b->grad); + ggml_allocr_alloc(alloc, layer.wk_a->grad); + ggml_allocr_alloc(alloc, layer.wk_b->grad); + ggml_allocr_alloc(alloc, layer.wv_a->grad); + ggml_allocr_alloc(alloc, layer.wv_b->grad); + ggml_allocr_alloc(alloc, layer.wo_a->grad); + ggml_allocr_alloc(alloc, layer.wo_b->grad); + ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad); + ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad); + ggml_allocr_alloc(alloc, layer.w1_a->grad); + ggml_allocr_alloc(alloc, layer.w1_b->grad); + ggml_allocr_alloc(alloc, layer.w2_a->grad); + ggml_allocr_alloc(alloc, layer.w2_b->grad); + ggml_allocr_alloc(alloc, layer.w3_a->grad); + ggml_allocr_alloc(alloc, layer.w3_b->grad); + } - ggml_set_param(ctx, layer.attention_norm_a); - ggml_set_param(ctx, layer.attention_norm_b); - ggml_set_param(ctx, layer.wq_a); - ggml_set_param(ctx, layer.wq_b); - ggml_set_param(ctx, 
layer.wk_a); - ggml_set_param(ctx, layer.wk_b); - ggml_set_param(ctx, layer.wv_a); - ggml_set_param(ctx, layer.wv_b); - ggml_set_param(ctx, layer.wo_a); - ggml_set_param(ctx, layer.wo_b); - ggml_set_param(ctx, layer.ffn_norm_a); - ggml_set_param(ctx, layer.ffn_norm_b); - ggml_set_param(ctx, layer.w1_a); - ggml_set_param(ctx, layer.w1_b); - ggml_set_param(ctx, layer.w2_a); - ggml_set_param(ctx, layer.w2_b); - ggml_set_param(ctx, layer.w3_a); - ggml_set_param(ctx, layer.w3_b); + // allocate data + lora->data.resize(ggml_allocr_max_size(alloc)); + ggml_allocr_free(alloc); + alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment); + ggml_allocr_alloc(alloc, lora->tok_embeddings_a); + ggml_allocr_alloc(alloc, lora->tok_embeddings_b); + ggml_allocr_alloc(alloc, lora->norm_a); + ggml_allocr_alloc(alloc, lora->norm_b); + ggml_allocr_alloc(alloc, lora->output_a); + ggml_allocr_alloc(alloc, lora->output_b); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = lora->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm_a); + ggml_allocr_alloc(alloc, layer.attention_norm_b); + ggml_allocr_alloc(alloc, layer.wq_a); + ggml_allocr_alloc(alloc, layer.wq_b); + ggml_allocr_alloc(alloc, layer.wk_a); + ggml_allocr_alloc(alloc, layer.wk_b); + ggml_allocr_alloc(alloc, layer.wv_a); + ggml_allocr_alloc(alloc, layer.wv_b); + ggml_allocr_alloc(alloc, layer.wo_a); + ggml_allocr_alloc(alloc, layer.wo_b); + ggml_allocr_alloc(alloc, layer.ffn_norm_a); + ggml_allocr_alloc(alloc, layer.ffn_norm_b); + ggml_allocr_alloc(alloc, layer.w1_a); + ggml_allocr_alloc(alloc, layer.w1_b); + ggml_allocr_alloc(alloc, layer.w2_a); + ggml_allocr_alloc(alloc, layer.w2_b); + ggml_allocr_alloc(alloc, layer.w3_a); + ggml_allocr_alloc(alloc, layer.w3_b); + } + ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad); + ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad); + ggml_allocr_alloc(alloc, lora->norm_a->grad); + ggml_allocr_alloc(alloc, lora->norm_b->grad); + ggml_allocr_alloc(alloc, lora->output_a->grad); + ggml_allocr_alloc(alloc, lora->output_b->grad); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = lora->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm_a->grad); + ggml_allocr_alloc(alloc, layer.attention_norm_b->grad); + ggml_allocr_alloc(alloc, layer.wq_a->grad); + ggml_allocr_alloc(alloc, layer.wq_b->grad); + ggml_allocr_alloc(alloc, layer.wk_a->grad); + ggml_allocr_alloc(alloc, layer.wk_b->grad); + ggml_allocr_alloc(alloc, layer.wv_a->grad); + ggml_allocr_alloc(alloc, layer.wv_b->grad); + ggml_allocr_alloc(alloc, layer.wo_a->grad); + ggml_allocr_alloc(alloc, layer.wo_b->grad); + ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad); + ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad); + ggml_allocr_alloc(alloc, layer.w1_a->grad); + ggml_allocr_alloc(alloc, layer.w1_b->grad); + ggml_allocr_alloc(alloc, layer.w2_a->grad); + ggml_allocr_alloc(alloc, layer.w2_b->grad); + ggml_allocr_alloc(alloc, layer.w3_a->grad); + ggml_allocr_alloc(alloc, layer.w3_b->grad); } + ggml_allocr_free(alloc); } + + void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { const uint32_t n_layer = lora->layers.size(); @@ -852,19 +982,17 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( return t36; } -void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, 
struct ggml_tensor * target_probs) { +void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_probs) { int n_tokens = tokens_input->ne[0]; - int n_vocab = target_logits->ne[0]; + int n_vocab = target_probs->ne[0]; size_t sample = train_samples[example_id % n_train_samples]; GGML_ASSERT(sample+n_tokens-1 < n_train_data); - ggml_set_f32(target_logits, -1.0f/n_vocab); ggml_set_f32(target_probs, 0.0f); ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx)); for (int i=1; i<n_tokens+1; ++i) { int token = clamp(train_data[sample+i-1], 0, n_vocab-1); - set_f32_2d(target_logits, token, i-1, +1.0f); set_f32_2d(target_probs, token, i-1, +1.0f); ggml_set_i32_1d(tokens_input, i, (i<n_tokens) ? token : llama_token_eos(lctx)); } } -void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { +void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_probs) { GGML_ASSERT(tokens_input->n_dims == 2); - GGML_ASSERT(target_logits->n_dims == 3); GGML_ASSERT(target_probs->n_dims == 3); - int n_vocab = target_logits->ne[0]; + int n_vocab = target_probs->ne[0]; int n_tokens = tokens_input->ne[0]; int n_batch = tokens_input->ne[1]; - GGML_ASSERT(n_tokens == target_logits->ne[1]); - GGML_ASSERT(n_batch == target_logits->ne[2]); GGML_ASSERT(n_vocab == target_probs->ne[0]); GGML_ASSERT(n_tokens == target_probs->ne[1]); GGML_ASSERT(n_batch == target_probs->ne[2]); - ggml_set_f32(target_logits, -1.0f/n_vocab); ggml_set_f32(target_probs, 0.0f); // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); for (int k=0; k<n_batch; ++k) { [...] GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); - GGML_ASSERT(opt->ctx != NULL); ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); @@ -1158,7 +1280,6 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); - GGML_ASSERT(opt->ctx != NULL); ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); read_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); @@ -1574,7 +1695,8 @@ struct train_params { int n_ctx; int n_threads; int n_batch; - int n_examples; + + bool only_write_lora; float f_norm_rms_eps; float rope_freq_base; @@ -1596,8 +1718,6 @@ struct train_params { int n_rank_norm; int n_rank_output; - int print_info_interval; - bool samples_start_after_nl; bool use_adam; bool use_flash; @@ -1624,10 +1744,6 @@ struct train_params { float adam_beta2; float adam_gclip; float adam_eps_f; - - int mem_lora_gb; - int mem_compute_gb; - int mem_compute0_gb; }; struct train_params get_default_train_params() { @@ -1647,7 +1763,8 @@ struct train_params get_default_train_params() { params.n_ctx = 128; params.n_threads = 6; params.n_batch = 8; - params.n_examples = 1; + + params.only_write_lora = false; params.f_norm_rms_eps = 1e-5f; params.rope_freq_base = 10000.0f; @@ -1669,8 +1786,6 @@ struct train_params get_default_train_params() { params.n_rank_norm = 1; params.n_rank_output = 4; - params.print_info_interval = 1; - params.samples_start_after_nl = false; params.use_adam = true; params.use_flash = true; @@ -1697,10 +1812,6 @@ struct train_params get_default_train_params() { params.adam_beta2 = 0.999f; params.adam_gclip = 1.0f; params.adam_eps_f = 0.0f; - - params.mem_lora_gb = 2; - params.mem_compute_gb = 24; -
params.mem_compute0_gb = 8; return params; } @@ -1717,11 +1828,11 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --pattern-fn-it STR pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it); fprintf(stderr, " --fn-latest STR string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest); fprintf(stderr, " --save-every N save checkpoint and lora every N iterations. Disabled when N <= 0. (default '%d')\n", params->save_every); + fprintf(stderr, " --only-write-lora only save llama lora, don't do any training\n"); fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); - fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); @@ -1739,7 +1850,6 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor (default %d)\n", params->n_rank_w1); fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor (default %d)\n", params->n_rank_w2); fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor (default %d)\n", params->n_rank_w3); - fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off"); fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); @@ -1768,9 +1878,6 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); - fprintf(stderr, " --mem-lora N Memory to allocate for LORA in gigabytes. (default %d)\n", params->mem_lora_gb); - fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); - fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. 
(default %d)\n", params->mem_compute0_gb); fprintf(stderr, "\n"); } @@ -1834,6 +1941,8 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->save_every = std::stoi(argv[i]); + } else if (arg == "--only-write-lora") { + params->only_write_lora = true; } else if (arg == "-s" || arg == "--seed") { if (++i >= argc) { invalid_param = true; break; } @@ -1858,12 +1967,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_batch = std::stoi(argv[i]); - } else if (arg == "-n" || arg == "--examples") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_examples = std::stoi(argv[i]); } else if (arg == "--norm-rms-eps") { if (++i >= argc) { invalid_param = true; break; } @@ -1966,12 +2069,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_rank_w3 = std::stoi(argv[i]); - } else if (arg == "--print-info-interval") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->print_info_interval = std::stoi(argv[i]); } else if (arg == "--samples-after-nl") { params->samples_start_after_nl = true; } else if (arg == "--use-lbfgs") { @@ -2092,24 +2189,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->lbfgs_n_iter = std::stoi(argv[i]); - } else if (arg == "--mem-lora") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_lora_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute0") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute0_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -2141,7 +2220,6 @@ struct opt_callback_data { size_t samples_size; int shuffle_countdown; struct ggml_tensor * tokens_input; - struct ggml_tensor * target_logits; struct ggml_tensor * target_probs; }; @@ -2183,7 +2261,18 @@ void opt_callback(void * vdata, float * sched) { int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); if (impr_plot > 0) impr_plot = 0; - printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); + if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0; + printf("%s: iter=%*d, sched=%f loss=%f ", __func__, 6, opt->iter, *sched, opt->loss_after); + float improvement = opt->loss_before - opt->loss_after; + const float plot_scale = 10.0f; + int bar_len = (int)(1 + improvement*plot_scale + 0.5); + printf("|"); + for (int i=0; i<bar_len; ++i) { + printf("-"); + } + printf(">"); + // printf("improvement: %*d>", impr_plot, (int)0); + printf("\n"); if (data->shuffle_countdown < n_batch) { printf("%s: reshuffle samples\n", __func__); @@ -2202,12 +2291,44 @@ void opt_callback(void * vdata, float * sched) { data->tokens_size, opt->iter, data->tokens_input, - data->target_logits, data->target_probs); data->shuffle_countdown -= n_batch; } +int64_t get_parameter_count(struct my_llama_lora* lora) { + int64_t nx = 0; + nx += ggml_nelements(lora->tok_embeddings_a); + nx += ggml_nelements(lora->tok_embeddings_b); + nx += ggml_nelements(lora->norm_a); + nx += ggml_nelements(lora->norm_b); + nx += ggml_nelements(lora->output_a); + nx += ggml_nelements(lora->output_b); + + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer =
lora->layers[i]; + nx += ggml_nelements(layer.attention_norm_a); + nx += ggml_nelements(layer.attention_norm_b); + nx += ggml_nelements(layer.wq_a); + nx += ggml_nelements(layer.wq_b); + nx += ggml_nelements(layer.wk_a); + nx += ggml_nelements(layer.wk_b); + nx += ggml_nelements(layer.wv_a); + nx += ggml_nelements(layer.wv_b); + nx += ggml_nelements(layer.wo_a); + nx += ggml_nelements(layer.wo_b); + nx += ggml_nelements(layer.ffn_norm_a); + nx += ggml_nelements(layer.ffn_norm_b); + nx += ggml_nelements(layer.w1_a); + nx += ggml_nelements(layer.w1_b); + nx += ggml_nelements(layer.w2_a); + nx += ggml_nelements(layer.w2_b); + nx += ggml_nelements(layer.w3_a); + nx += ggml_nelements(layer.w3_b); + } + return nx; +} + int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); @@ -2228,19 +2349,16 @@ int main(int argc, char ** argv) { struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_params); struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); - std::vector train_tokens; - if (params.n_examples > 0) { - printf("%s: tokenize training data\n", __func__); - if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { - fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); - } - printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); - } - struct my_llama_model model; init_model(lmodel, &model, params.n_ctx); struct my_llama_lora lora; + struct ggml_opt_context* opt = (struct ggml_opt_context*)alloca(sizeof(struct ggml_opt_context)); + memset(opt, 0, sizeof(struct ggml_opt_context)); + + opt->ctx = NULL; + + // set lora params from command line lora.hparams.f_norm_rms_eps = params.f_norm_rms_eps; lora.hparams.rope_freq_base = params.rope_freq_base; lora.hparams.rope_freq_scale = params.rope_freq_scale; @@ -2259,213 +2377,267 @@ int main(int argc, char ** argv) { lora.hparams.n_rank_norm = params.n_rank_norm; lora.hparams.n_rank_output = params.n_rank_output; - std::vector token_noccurs; - std::vector token_notavail; - token_noccurs.resize(model.hparams.n_vocab, 0); - token_notavail.resize(model.hparams.n_vocab, true); - for (int i = 0; i < (int) train_tokens.size(); ++i) { - ++token_noccurs[train_tokens[i]]; - token_notavail[train_tokens[i]] = false; - } - - std::vector token_freq; - token_freq.resize(model.hparams.n_vocab, 0); - int n_unique_tokens = 0; - for (int i = 0; i < (int) token_noccurs.size(); ++i) { - token_freq[i] = (float) token_noccurs[i] / (float) train_tokens.size(); - n_unique_tokens += (token_noccurs[i] > 0) ? 
1 : 0; + // set opt params from command line + if (params.use_adam) { + opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params.print_forward_graph = false; + opt->params.print_backward_graph = false; + opt->params.n_threads = params.n_threads; + opt->params.past = params.opt_past; + opt->params.delta = params.opt_delta; + opt->params.max_no_improvement = params.opt_max_no_improvement; + opt->params.adam.n_iter = params.adam_n_iter; + opt->params.adam.sched = 1.0f; + opt->params.adam.alpha = params.adam_alpha; + opt->params.adam.decay = params.adam_decay; + opt->params.adam.decay_min_ndim = params.adam_decay_min_ndim; + opt->params.adam.beta1 = params.adam_beta1; + opt->params.adam.beta2 = params.adam_beta2; + opt->params.adam.gclip = params.adam_gclip; + opt->params.adam.eps_f = params.adam_eps_f; + } else { + opt->params = ggml_opt_default_params(GGML_OPT_LBFGS); + opt->params.print_forward_graph = false; + opt->params.print_backward_graph = false; + opt->params.n_threads = params.n_threads; + opt->params.past = params.opt_past; + opt->params.delta = params.opt_delta; + opt->params.max_no_improvement = params.opt_max_no_improvement; + opt->params.lbfgs.n_iter = params.lbfgs_n_iter; } - printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); - struct ggml_init_params lcparams; - lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_lora_gb); - lcparams.mem_buffer = NULL; - lcparams.no_alloc = false; + ggml_allocr * alloc = NULL; - lora.ctx = ggml_init(lcparams); + printf("%s: init model\n", __func__); + bool existed = load_checkpoint_lora_file(params.fn_checkpoint_in, &model, &lora, opt); - int n_tokens = model.hparams.n_ctx; - int n_vocab = model.hparams.n_vocab; - int n_batch = params.n_batch; + if (existed) { + model.hparams.n_ctx = params.n_ctx; + + const bool opt_param_count_changed = ( + (lora.hparams.n_rank_attention_norm != params.n_rank_attention_norm) + || (lora.hparams.n_rank_wq != params.n_rank_wq) + || (lora.hparams.n_rank_wk != params.n_rank_wk) + || (lora.hparams.n_rank_wv != params.n_rank_wv) + || (lora.hparams.n_rank_wo != params.n_rank_wo) + || (lora.hparams.n_rank_ffn_norm != params.n_rank_ffn_norm) + || (lora.hparams.n_rank_w1 != params.n_rank_w1) + || (lora.hparams.n_rank_w2 != params.n_rank_w2) + || (lora.hparams.n_rank_w3 != params.n_rank_w3) + || (lora.hparams.n_rank_tok_embeddings != params.n_rank_tok_embeddings) + || (lora.hparams.n_rank_norm != params.n_rank_norm) + || (lora.hparams.n_rank_output != params.n_rank_output) + ); - struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); - memset(opt, 0, sizeof(struct ggml_opt_context)); + const bool opt_past_changed = opt->params.past != params.opt_past; - struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); - struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); - opt_params_adam.print_forward_graph = false; - opt_params_adam.print_backward_graph = false; - opt_params_adam.n_threads = params.n_threads; - opt_params_adam.past = params.opt_past; - opt_params_adam.delta = params.opt_delta; - opt_params_adam.max_no_improvement = params.opt_max_no_improvement; - opt_params_adam.adam.n_iter = params.adam_n_iter; - opt_params_adam.adam.sched = 1.0f; - opt_params_adam.adam.alpha = params.adam_alpha; - opt_params_adam.adam.decay = params.adam_decay; - opt_params_adam.adam.decay_min_ndim = params.adam_decay_min_ndim; - opt_params_adam.adam.beta1 = params.adam_beta1; - 
opt_params_adam.adam.beta2 = params.adam_beta2; - opt_params_adam.adam.gclip = params.adam_gclip; - opt_params_adam.adam.eps_f = params.adam_eps_f; - - opt_params_lbfgs.print_forward_graph = false; - opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = params.n_threads; - opt_params_adam.past = params.opt_past; - opt_params_adam.delta = params.opt_delta; - opt_params_adam.max_no_improvement = params.opt_max_no_improvement; - opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; - - opt->ctx = lora.ctx; - opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; + GGML_ASSERT(opt_param_count_changed == false); + GGML_ASSERT(opt_past_changed == false); - printf("%s: init model\n", __func__); - // bool existed = load_checkpoint(&model, &lora, opt, params.fn_checkpoint_in, true); - bool existed = load_checkpoint_lora_file(params.fn_checkpoint_in, &model, &lora, opt); - if (!existed) { + if (opt_param_count_changed) { + // need to discard previous optimizer gradient statistics and opt_init with new shapes + // TODO + } + if (opt_past_changed) { + // need to discard previous optimizer past function value statistics and opt_init with new shapes + // TODO + } + } else { // existed == false init_lora(&model, &lora); randomize_lora(&lora, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); + if (!params.only_write_lora) { + ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&lora)); + } } - set_param_lora(&lora); + print_params(&model.hparams); print_lora_params(&lora.hparams); + printf("%s: max_lora_size = %zu bytes (%.1f MB)\n", __func__, lora.data.size(), (float) lora.data.size() / (1024.0f*1024.0f)); + printf("%s: max_opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); + opt->iter = lora.train_its; - opt->params = params.use_adam ? 
opt_params_adam : opt_params_lbfgs; + if (params.only_write_lora) { + if (strlen(params.fn_lora_out) > 0) { + save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, opt->iter, params.fn_latest); + save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, -1, params.fn_latest); + } + ggml_free(lora.ctx); + llama_free(lctx); + llama_free_model(lmodel); + return 0; + } + + int n_tokens = model.hparams.n_ctx; + int n_vocab = model.hparams.n_vocab; + int n_batch = params.n_batch; - opt->iter = lora.train_its; printf("%s: opt iter %d\n", __func__, opt->iter); printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx)); - // ggml_print_tensor_objects(lora.ctx); - // TODO: use std::vector intead of "new" - size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); - uint8_t * compute_addr = new uint8_t[compute_size]; + std::vector mem_input_data; + std::vector mem_compute_data; + + // context for input tensors without their data + struct ggml_init_params ctx_input_params = { + ggml_tensor_overhead() * 2, // mem_size + NULL, // mem_buffer + true, // no_alloc + }; + struct ggml_context * ctx_input = ggml_init(ctx_input_params); + + // the input tensors + struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); + struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + + // measure required memory for input tensors + alloc = ggml_allocr_new_measure(tensor_alignment); + ggml_allocr_alloc(alloc, tokens_input); + ggml_allocr_alloc(alloc, target_probs); + size_t max_input_size = ggml_allocr_max_size(alloc); + ggml_allocr_free(alloc); + printf("%s: max_input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); - size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); - uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; + // allocate input tensors + mem_input_data.resize(max_input_size); + alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); + ggml_allocr_alloc(alloc, tokens_input); + ggml_allocr_alloc(alloc, target_probs); + ggml_allocr_free(alloc); - static const size_t tensor_alignment = 32; - ggml_allocr * alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); + // context for compute tensors without their data + size_t estimated_compute_size_wo_data = ( + ggml_tensor_overhead()*GGML_MAX_NODES*2 + + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*( + params.use_checkpointing ? 3 : 2 + ) + ); + struct ggml_init_params ctx_compute_params = { + estimated_compute_size_wo_data, // mem_size + NULL, // mem_buffer + true, // no_alloc + }; + struct ggml_context * ctx_compute = ggml_init(ctx_compute_params); + + struct ggml_tensor * loss = NULL; + struct ggml_tensor * logits = NULL; + + struct ggml_cgraph * gf = NULL; + struct ggml_cgraph * gb = NULL; + struct ggml_cgraph * gb_tmp = NULL; + + // measure required memory for compute tensors + alloc = ggml_allocr_new_measure(tensor_alignment); + gf = ggml_new_graph(ctx_compute); + gb = ggml_new_graph(ctx_compute); + gb_tmp = params.use_checkpointing + ? 
ggml_new_graph(ctx_compute) + : NULL; + loss = llama_build_lora_finetune_graphs( + &model, &lora, alloc, ctx_compute, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.use_flash, + params.use_checkpointing + ); + size_t max_compute_size = ggml_allocr_max_size(alloc); + ggml_allocr_free(alloc); + printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f)); + + // reset compute context + ggml_free(ctx_compute); + ctx_compute = ggml_init(ctx_compute_params); + + // allocate compute tensors + mem_compute_data.resize(max_compute_size); + alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); + gf = ggml_new_graph(ctx_compute); + gb = ggml_new_graph(ctx_compute); + gb_tmp = params.use_checkpointing + ? ggml_new_graph(ctx_compute) + : NULL; + loss = llama_build_lora_finetune_graphs( + &model, &lora, alloc, ctx_compute, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.use_flash, + params.use_checkpointing + ); + ggml_allocr_free(alloc); + + // tokenize data + std::vector train_tokens; + printf("%s: tokenize training data\n", __func__); + if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { + fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); + } + printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); + + std::vector token_noccurs; + token_noccurs.resize(model.hparams.n_vocab, 0); + for (unsigned int i = 0; i < train_tokens.size(); ++i) { + ++token_noccurs[train_tokens[i]]; + } + int n_unique_tokens = 0; + for (unsigned int i = 0; i < token_noccurs.size(); ++i) { + if (token_noccurs[i] == 0) continue; + ++n_unique_tokens; + } + printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); + // generate token positions of training samples std::vector train_samples; - if (params.n_examples > 0) { - GGML_ASSERT(n_tokens < (int) train_tokens.size()); - train_samples.push_back(0); - for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) { - if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) { - train_samples.push_back(i); - } - } - shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); - for (int i = 0; i < (int) train_samples.size(); ++i) { - GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); + GGML_ASSERT(n_tokens < (int) train_tokens.size()); + train_samples.push_back(0); + for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) { + const bool is_valid_sample_start = !params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx)); + if (is_valid_sample_start) { + train_samples.push_back(i); } } + shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); + for (int i = 0; i < (int) train_samples.size(); ++i) { + GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); + } printf("%s: begin training\n", __func__); struct opt_callback_data opt_cb_data; opt_cb_data.params = ¶ms; - opt_cb_data.opt = opt; - opt_cb_data.model = &model; - opt_cb_data.lora = &lora; - opt_cb_data.lctx = lctx; - opt_cb_data.last_save_iter = opt->iter; - opt_cb_data.tokens_data = train_tokens.data(); - opt_cb_data.tokens_size = train_tokens.size(); - opt_cb_data.samples_data = train_samples.data(); - opt_cb_data.samples_size = train_samples.size(); + opt_cb_data.opt = opt; + opt_cb_data.model = &model; + opt_cb_data.lora = 
&lora; + opt_cb_data.lctx = lctx; + opt_cb_data.last_save_iter = opt->iter; + opt_cb_data.tokens_data = train_tokens.data(); + opt_cb_data.tokens_size = train_tokens.size(); + opt_cb_data.samples_data = train_samples.data(); + opt_cb_data.samples_size = train_samples.size(); opt_cb_data.shuffle_countdown = train_samples.size(); - opt_cb_data.tokens_input = NULL; - opt_cb_data.target_logits = NULL; - opt_cb_data.target_probs = NULL; + opt_cb_data.tokens_input = tokens_input; + opt_cb_data.target_probs = target_probs; + + // measure required memory for work buffer + size_t max_work_size = ggml_graph_plan(gb, params.n_threads).work_size + GGML_OBJECT_SIZE; + printf("%s: max_work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); + + // context for work buffer + struct ggml_init_params ctx_work_params = { + max_work_size, // mem_size + NULL, // mem_buffer + false, // no_alloc + }; + struct ggml_context * ctx_work = ggml_init(ctx_work_params); int64_t t0 = ggml_time_ms(); - for (int ex = 0; ex < params.n_examples; ++ex) { - if (ex*n_batch >= (int) train_samples.size()) { - shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); - for (int i = 0; i < (int) train_samples.size(); ++i) { - GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); - } - } - - struct ggml_init_params cparams = { - compute_size, // mem_size - compute_addr, // mem_buffer - false, // no_alloc - }; - struct ggml_context * ctx0 = ggml_init(cparams); - - ggml_set_no_alloc(ctx0, false); + ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data); - // don't use alloc for input tensors, so we can safely fill them with data - struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); - struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - - ggml_set_no_alloc(ctx0, true); - - ggml_allocr_reset(alloc); - - opt_cb_data.tokens_input = tokens_input; - opt_cb_data.target_logits = target_logits; - opt_cb_data.target_probs = target_probs; - - int n_past = 0; - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - struct ggml_cgraph * gb = ggml_new_graph(ctx0); - struct ggml_cgraph * gb_tmp = params.use_checkpointing - ? ggml_new_graph(ctx0) - : NULL; - - GGML_ASSERT(n_past == 0); - - struct ggml_tensor * loss = NULL; - struct ggml_tensor * logits = NULL; - - loss = llama_build_lora_finetune_graphs( - &model, &lora, alloc, ctx0, - gf, gb, gb_tmp, - &logits, tokens_input, target_probs, - n_tokens, n_batch, - params.use_flash, - params.use_checkpointing - ); - - size_t used_mem_before_opt = ggml_used_mem(ctx0); - - opt->params.adam.sched = (opt->iter < params.warmup) - ? 
(float) opt->iter / (float) params.warmup - : cosine_decay_restart( - params.cos_decay_steps, - params.cos_decay_min, - opt->iter - params.warmup, - params.cos_decay_restart, - params.enable_restart); - - float min_sched = params.adam_min_alpha / params.adam_alpha; - opt->params.adam.sched = min_sched + opt->params.adam.sched * (1.0f - min_sched); - - printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); - - ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data); - - size_t used_mem_after_opt = ggml_used_mem(ctx0); - - if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { - printf("Example %d, opt iter %d\n", ex, opt->iter); - printf("error_before_opt: %.6f\n", opt->loss_before); - printf("error_after_opt: %.6f\n", opt->loss_after); - printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); - printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); - } - - ggml_free(ctx0); - } + ggml_free(ctx_work); + ggml_free(ctx_compute); + ggml_free(ctx_input); int64_t t1 = ggml_time_ms(); int64_t d = t1-t0; @@ -2473,25 +2645,23 @@ int main(int argc, char ** argv) { printf("%s: total training time=%f seconds\n", __func__, dd); int new_iters = opt->iter - opt_cb_data.last_save_iter; - lora.train_its += new_iters; - lora.train_samples += new_iters * n_batch; - lora.train_tokens += new_iters * n_batch * n_tokens; - - if (params.n_examples > 0) { - save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, opt, params.pattern_fn_it, opt->iter, params.fn_latest); - save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, opt, params.pattern_fn_it, -1, params.fn_latest); - } - - if (strlen(params.fn_lora_out) > 0) { - save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, opt->iter, params.fn_latest); - save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, -1, params.fn_latest); + if (new_iters > 0) { + lora.train_its += new_iters; + lora.train_samples += new_iters * n_batch; + lora.train_tokens += new_iters * n_batch * n_tokens; + + if (strlen(params.fn_checkpoint_out) > 0) { + save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, opt, params.pattern_fn_it, opt->iter, params.fn_latest); + save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, opt, params.pattern_fn_it, -1, params.fn_latest); + } + if (strlen(params.fn_lora_out) > 0) { + save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, opt->iter, params.fn_latest); + save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, -1, params.fn_latest); + } + opt_cb_data.last_save_iter = opt->iter; } - opt_cb_data.last_save_iter = opt->iter; - - ggml_allocr_free(alloc); - delete[] compute_addr; - delete[] compute_buf_0; + ggml_free(opt->ctx); ggml_free(lora.ctx); llama_free(lctx); llama_free_model(lmodel); From 6cbf55a64bd9e761a01163f25f2f1788c4aba784 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 1 Sep 2023 16:02:45 +0200 Subject: [PATCH 145/235] add finetune to Makefile --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 02ba3e36d8466..db0e1174f60c9 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam_search +BUILD_TARGETS = main quantize quantize-stats 
perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam_search finetune # Binaries only useful for tests TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1 @@ -427,6 +427,9 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS) beam_search: examples/beam_search/beam_search.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))' BUILD_TARGETS += metal endif From 7acb1241c6f9ea1de1b9fe53555dfb60ff1ae6af Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 1 Sep 2023 16:04:08 +0200 Subject: [PATCH 146/235] update README.md --- examples/finetune/README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/finetune/README.md b/examples/finetune/README.md index 11fe992d0c409..e034bb2f082da 100644 --- a/examples/finetune/README.md +++ b/examples/finetune/README.md @@ -15,8 +15,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s --train-data "shakespeare.txt" \ --save-every 10 \ --threads 6 --adam-iter 30 --batch 4 --ctx 64 \ - --use-checkpointing --use-alloc \ - --mem-lora 2 --mem-compute 1 --mem-compute0 20 + --use-checkpointing # predict ./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin @@ -28,8 +27,6 @@ The pattern "ITERATION" in the output filenames will be replaced with the iterat Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime. If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`. -To change the amount of memory for finetuning with memory allocator (`--use-alloc`, used by default), you can use `--mem-compute0 N` to specify the number of gigabytes. 
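To illustrate what `--use-checkpointing` buys: instead of keeping every intermediate activation alive for the backward pass, only every k-th activation is stored and the missing ones are recomputed from the nearest checkpoint. A minimal, self-contained C++ sketch of this trade-off (an editor's illustration with toy tanh layers, not the actual ggml implementation):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// toy "layer": y = tanh(x); its derivative is 1 - tanh(x)^2
static float forward_layer(float x) { return std::tanh(x); }
static float backward_layer(float x, float dy) {
    const float t = std::tanh(x);
    return dy * (1.0f - t * t);
}

int main() {
    const int n_layers = 8;
    const int k = 4;   // checkpoint interval: only ~n_layers/k inputs are kept
    float x = 0.5f;

    std::vector<float> checkpoints;
    for (int i = 0; i < n_layers; ++i) {
        if (i % k == 0) checkpoints.push_back(x); // store every k-th input
        x = forward_layer(x);
    }

    float grad = 1.0f; // d(output)/d(output)
    for (int seg = (n_layers - 1) / k; seg >= 0; --seg) {
        const int begin = seg * k;
        const int end   = std::min(begin + k, n_layers);
        // recompute the inputs of layers begin..end-1 from the checkpoint
        std::vector<float> xs(end - begin);
        float xi = checkpoints[seg];
        for (int i = begin; i < end; ++i) { xs[i - begin] = xi; xi = forward_layer(xi); }
        // backpropagate through the recomputed segment
        for (int i = end - 1; i >= begin; --i) grad = backward_layer(xs[i - begin], grad);
    }
    std::printf("d(out)/d(in) = %f\n", grad);
    return 0;
}
```

The real implementation applies the same recompute-instead-of-store idea to ggml compute graphs, but the memory/runtime trade-off is identical.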
- The LORA rank is configured for each model tensor type separately with these command line options: ```bash From c32ad44f84c4537d10ecb31da296353654bb7835 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 1 Sep 2023 17:03:36 +0200 Subject: [PATCH 147/235] print time per iteration and estimate remaining time --- examples/finetune/finetune.cpp | 60 ++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 848e390f6f604..a86edd7e74be6 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -2221,8 +2221,34 @@ struct opt_callback_data { int shuffle_countdown; struct ggml_tensor * tokens_input; struct ggml_tensor * target_probs; + int first_iter; + int64_t last_time; + float time_per_iter; }; +void print_duration(float fmillis) { + if (fmillis < 1000.0f) { + printf("%.1fms", fmillis); + return; + } + const int64_t one_sec = 1000; + const int64_t one_min = one_sec * 60; + const int64_t one_hour = one_min * 60; + const int64_t one_day = one_hour * 24; + + int64_t millis = fmillis; + int64_t days = millis/one_day; + int64_t hours = (millis - days*one_day)/one_hour; + int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min; + int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec; + + if (days > 0) { + printf("%ldd %02ld:%02ld:%02ld", days, hours, minutes, seconds); + } else { + printf("%02ld:%02ld:%02ld", hours, minutes, seconds); + } +} + void opt_callback(void * vdata, float * sched) { struct opt_callback_data * data = (struct opt_callback_data *) vdata; struct train_params * params = data->params; @@ -2230,6 +2256,25 @@ void opt_callback(void * vdata, float * sched) { int n_batch = params->n_batch; int n_ctx = params->n_ctx; + int64_t now = ggml_time_ms(); + if (now > data->last_time) { + float dt = now - data->last_time; + if (data->time_per_iter == 0) { + data->time_per_iter = dt; + } else { + const float gain = 0.7f; + data->time_per_iter = data->time_per_iter*(1.0f-gain) + dt*gain; + } + } + data->last_time = now; + float remaining_time = 0; + if (data->time_per_iter > 0) { + const int n_iter = params->use_adam ? 
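// [editor's note, inserted as a comment] time_per_iter above is an
// exponential moving average of the wall-clock time per iteration: with
// gain = 0.7 it is updated as time_per_iter = 0.3*time_per_iter + 0.7*dt,
// so recent iterations dominate. The estimate computed below is then simply
//     remaining_time = (n_iter - done_iter) * time_per_iter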
params->adam_n_iter : params->lbfgs_n_iter; + const int done_iter = opt->iter - data->first_iter; + const int remaining_iter = n_iter - done_iter; + remaining_time = remaining_iter * data->time_per_iter; + } + const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); if (save_now) { int new_iters = opt->iter - data->last_save_iter; @@ -2262,11 +2307,19 @@ void opt_callback(void * vdata, float * sched) { int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); if (impr_plot > 0) impr_plot = 0; if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0; - printf("%s: iter=%*d, sched=%f loss=%f ", __func__, 6, opt->iter, *sched, opt->loss_after); + printf("%s: iter=%*d sched=%f loss=%f", + __func__, 6, opt->iter, *sched, opt->loss_after); + if (data->time_per_iter > 0) { + printf(" dt="); + print_duration(data->time_per_iter); + printf(" eta="); + print_duration(remaining_time); + } + float improvement = opt->loss_before - opt->loss_after; const float plot_scale = 10.0f; int bar_len = (int)(1 + improvement*plot_scale + 0.5); - printf("|"); + printf(" |"); for (int i=0; iiter; + opt_cb_data.last_time = ggml_time_ms(); + opt_cb_data.time_per_iter = 0; // measure required memory for work buffer size_t max_work_size = ggml_graph_plan(gb, params.n_threads).work_size + GGML_OBJECT_SIZE; From 6ee12b158b230d6fb6a7f5e66001ca13133f4943 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 2 Sep 2023 15:59:14 +0200 Subject: [PATCH 148/235] increase measured alloc size by tensor_alignment ggml_allocr_reset will reduce the given size by up to tensor_alignment-1 --- examples/finetune/finetune.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index a86edd7e74be6..d2c6580c9dc21 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -621,7 +621,7 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) } // allocate data - lora->data.resize(ggml_allocr_max_size(alloc)); + lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment); ggml_allocr_free(alloc); alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment); ggml_allocr_alloc(alloc, lora->tok_embeddings_a); @@ -2547,7 +2547,7 @@ int main(int argc, char ** argv) { alloc = ggml_allocr_new_measure(tensor_alignment); ggml_allocr_alloc(alloc, tokens_input); ggml_allocr_alloc(alloc, target_probs); - size_t max_input_size = ggml_allocr_max_size(alloc); + size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment; ggml_allocr_free(alloc); printf("%s: max_input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); @@ -2594,7 +2594,7 @@ int main(int argc, char ** argv) { params.use_flash, params.use_checkpointing ); - size_t max_compute_size = ggml_allocr_max_size(alloc); + size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; ggml_allocr_free(alloc); printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f)); From cfe217f1cab4301818e9da5a66688b4fc407291f Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 2 Sep 2023 16:11:31 +0200 Subject: [PATCH 149/235] fix README.md --- examples/finetune/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/finetune/README.md b/examples/finetune/README.md index e034bb2f082da..0a5a3cfd79a55 
100644 --- a/examples/finetune/README.md +++ b/examples/finetune/README.md @@ -11,7 +11,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s --model-base open-llama-3b-v2-q8_0.gguf \ --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \ --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \ - --model-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \ + --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \ --train-data "shakespeare.txt" \ --save-every 10 \ --threads 6 --adam-iter 30 --batch 4 --ctx 64 \ From ded6382961e1b34b1da8dc108f068a2cf2856cb7 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 2 Sep 2023 20:52:25 +0200 Subject: [PATCH 150/235] add some more allocator debug prints --- ggml-alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index ab34e70772dc5..045392a3b231c 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -158,6 +158,7 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) } tensor->data = addr; + AT_PRINTF("%s: allocated data at 0x%p\n", __func__, tensor->data); #ifdef GGML_ALLOCATOR_DEBUG add_allocated_tensor(alloc, tensor); @@ -189,7 +190,8 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size = aligned_offset(NULL, size, alloc->alignment); - AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks); + AT_PRINTF("%s: freeing %s at 0x%p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); + AT_PRINTF("%s: alloc->data = 0x%p alloc->data+alloc->size = 0x%p alloc->data+alloc->max_size = 0x%p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size); #ifdef GGML_ALLOCATOR_DEBUG remove_allocated_tensor(alloc, tensor); @@ -382,7 +384,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) // if the node's data is external, then we cannot re-use it if ((char *) parent->data < (char *) alloc->data || (char *) parent->data >= ((char *) alloc->data + alloc->size)) { - AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); + AT_PRINTF("not reusing parent %s for %s as 0x%p is external\n", parent->name, node->name, parent->data); continue; } From 8d982c8fd9ea3661ef9712c9f861e17eab2f5444 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 2 Sep 2023 20:53:14 +0200 Subject: [PATCH 151/235] bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue --- ggml-alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 045392a3b231c..12b74c6915d55 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -181,7 +181,7 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { void * ptr = tensor->data; - if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) { + if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->size) { // the tensor was not allocated in this buffer // this can happen because the graph allocator will try to free weights and other tensors from different buffers // the easiest way to deal with this is just to ignore it From 1ce7023eed2a295062d37df4a5f26aa015cbdfe9 Mon Sep 
17 00:00:00 2001 From: xaedes Date: Sat, 2 Sep 2023 21:27:12 +0200 Subject: [PATCH 152/235] revert last commit "bug fix, probably solves the 'ggml_allocr_alloc: not enough space in the buffer' issue" "alloc was freeing an externally allocated tensor, because it calculated the end of allocator memory as alloc->data + alloc->max_size instead of alloc->data + alloc->size." This is intentional to reduce the risk of freeing external tensors when measuring. Unless max_size is not properly calculated, I don't see why this is an issue. --- ggml-alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 12b74c6915d55..045392a3b231c 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -181,7 +181,7 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { void * ptr = tensor->data; - if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->size) { + if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) { // the tensor was not allocated in this buffer // this can happen because the graph allocator will try to free weights and other tensors from different buffers // the easiest way to deal with this is just to ignore it From 2d2bdc0df789299cd825074393c83ab56ccf070d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 2 Sep 2023 21:28:08 +0200 Subject: [PATCH 153/235] remove unnecessary "0x" before "%p" output --- ggml-alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 045392a3b231c..83dfb5e45d0f9 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -158,7 +158,7 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) } tensor->data = addr; - AT_PRINTF("%s: allocated data at 0x%p\n", __func__, tensor->data); + AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data); #ifdef GGML_ALLOCATOR_DEBUG add_allocated_tensor(alloc, tensor); @@ -190,8 +190,8 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size = aligned_offset(NULL, size, alloc->alignment); - AT_PRINTF("%s: freeing %s at 0x%p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); - AT_PRINTF("%s: alloc->data = 0x%p alloc->data+alloc->size = 0x%p alloc->data+alloc->max_size = 0x%p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size); + AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); + AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size); #ifdef GGML_ALLOCATOR_DEBUG remove_allocated_tensor(alloc, tensor); @@ -384,7 +384,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) // if the node's data is external, then we cannot re-use it if ((char *) parent->data < (char *) alloc->data || (char *) parent->data >= ((char *) alloc->data + alloc->size)) { - AT_PRINTF("not reusing parent %s for %s as 0x%p is external\n", parent->name, node->name, parent->data); + AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); continue; } From 80ac697df93be2a833e87ece68c8ca95fb20b454 Mon Sep 17 00:00:00 2001 
From: xaedes Date: Sat, 2 Sep 2023 21:44:20 +0200 Subject: [PATCH 154/235] move measurement memory segment to upper region of the address space --- ggml-alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 83dfb5e45d0f9..2c481030e58fd 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -285,7 +285,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) // address and size of the buffer when measuring // it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers -static void * const MEASURE_BASE_ADDR = (void *) 0x1000; +static void * const MEASURE_BASE_ADDR = (void *) (-(1ULL<<40) - 1); static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { From 406e0750ccf446bed86c06d1f698b6540aff9adf Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 3 Sep 2023 19:25:18 +0200 Subject: [PATCH 155/235] update README.md --- examples/finetune/README.md | 43 ++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/examples/finetune/README.md b/examples/finetune/README.md index 0a5a3cfd79a55..beb8f8a617dc2 100644 --- a/examples/finetune/README.md +++ b/examples/finetune/README.md @@ -22,7 +22,46 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s ``` Finetune output files will be saved every N iterations (config with `--save-every N`). -The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output. +The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output. +So in the above example, after 10 iterations these files will be written: +- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf +- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf +- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin +- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin + +After 10 more iterations: +- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf +- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf +- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin +- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin + +Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the state of the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter. + +llama.cpp compatible LORA adapters will be saved with the filename specified by `--lora-out FN`. +These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above. + +In `main` you can also load multiple LORA adapters, which will then be mixed together. + +For example, if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this: + +```bash +./bin/main -m open-llama-3b-v2-q8_0.gguf \ + --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \ + --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin +``` + +You can change how strongly each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
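Conceptually, the scale just multiplies the low-rank delta that an adapter adds to each base weight matrix. A hedged sketch of that merge (assuming row-major layouts and the usual `alpha/rank` LoRA scaling; an editor's illustration, not the actual llama.cpp loading code):

```cpp
#include <vector>

// W (n x m) += scale * (alpha / r) * B (n x r) * A (r x m)
void apply_lora_scaled(std::vector<float> & W,
                       const std::vector<float> & A,
                       const std::vector<float> & B,
                       int n, int m, int r, float alpha, float scale) {
    const float s = scale * alpha / (float) r;
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j) {
            float d = 0.0f; // (B*A)[i][j]
            for (int k = 0; k < r; ++k) {
                d += B[i*r + k] * A[k*m + j];
            }
            W[i*m + j] += s * d;
        }
    }
}
```

Because the scaled deltas of several adapters simply add up, the scales do not have to sum to one.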
+ +For example, to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one: + +```bash +./bin/main -m open-llama-3b-v2-q8_0.gguf \ + --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \ + --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \ + --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin +``` + +The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values. Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime. If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`. @@ -44,4 +83,6 @@ The LORA rank is configured for each model tensor type separately with these com --rank-w3 N LORA rank for w3 tensor (default 4) ``` +The LORA rank of 'norm' tensors should always be 1. + To see all available options use `finetune --help`. From e07f5c57bb4be51c0b0e5c77026d613cc3583b3d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 3 Sep 2023 20:03:39 +0200 Subject: [PATCH 156/235] fix printf format warnings --- examples/finetune/finetune.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index d2c6580c9dc21..a29e3bdf07ad9 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -2243,10 +2243,9 @@ void print_duration(float fmillis) { int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec; if (days > 0) { - printf("%ldd %02ld:%02ld:%02ld", days, hours, minutes, seconds); - } else { - printf("%02ld:%02ld:%02ld", hours, minutes, seconds); + printf("%lldd ", days); } + printf("%02lld:%02lld:%02lld", hours, minutes, seconds); } void opt_callback(void * vdata, float * sched) { @@ -2696,8 +2696,10 @@ int main(int argc, char ** argv) { int64_t t1 = ggml_time_ms(); int64_t d = t1-t0; - double dd = (double) d * 1e-3; - printf("%s: total training time=%f seconds\n", __func__, dd); + float fd = (float) d * 1e-3; + printf("%s: total training ", __func__); + print_duration(fd); + printf("\n"); int new_iters = opt->iter - opt_cb_data.last_save_iter; if (new_iters > 0) { From bdb7092e82cafd88870d34ef330f634048828af8 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 3 Sep 2023 20:04:03 +0200 Subject: [PATCH 157/235] add missing gguf_free in load_checkpoint_lora_file --- examples/finetune/finetune.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index a29e3bdf07ad9..b9a5b0447b94a 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1556,6 +1556,7 @@ bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * mo load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, opt); + gguf_free(fctx); return true; } From 50589ed6bea71fbf1a367a3dcebe3461f4bc112b Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 3 Sep 2023 20:05:54 +0200 Subject: [PATCH 158/235] load default rms_norm and rope parameters from base model --- examples/finetune/finetune.cpp | 69 ++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 4 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index b9a5b0447b94a..bba1bdb7c877c 100644 --- a/examples/finetune/finetune.cpp +++
b/examples/finetune/finetune.cpp @@ -383,6 +383,9 @@ void print_lora_params(struct my_llama_lora_hparams * params) { printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); + printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps); + printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base); + printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale); } void init_model(struct llama_model * input, struct my_llama_model * model, uint32_t n_ctx) { @@ -1238,6 +1241,37 @@ void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, co } } +void load_default_lora_params_from_base_model(const char * fn_base_model, struct my_llama_lora_hparams * lora_params) { + if (strlen(fn_base_model) == 0) { + return; + } + struct gguf_init_params params; + params.no_alloc = false; + params.ctx = NULL; + struct gguf_context * fctx = gguf_init_from_file(fn_base_model, params); + if (fctx == NULL) { + return; + } + + const char * arch = "llama"; + std::vector keybuf; + keybuf.resize(512); + auto kv = [arch, &keybuf](const char * key) -> const char * { + snprintf(keybuf.data(), keybuf.size(), key, arch); + return keybuf.data(); + }; + + float rope_freq_scale = 1.0f; + GGUF_GET_KEY(fctx, lora_params->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + GGUF_GET_KEY(fctx, lora_params->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + if (rope_freq_scale != 1.0f) { + lora_params->rope_freq_scale = 1.0f / rope_freq_scale; + } + + gguf_free(fctx); +} + void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read @@ -1697,12 +1731,18 @@ struct train_params { int n_threads; int n_batch; + bool custom_n_ctx; + bool only_write_lora; float f_norm_rms_eps; float rope_freq_base; float rope_freq_scale; + bool custom_f_norm_rms_eps; + bool custom_rope_freq_base; + bool custom_rope_freq_scale; + int32_t lora_r; int32_t lora_alpha; @@ -1765,12 +1805,18 @@ struct train_params get_default_train_params() { params.n_threads = 6; params.n_batch = 8; + params.custom_n_ctx = false; + params.only_write_lora = false; params.f_norm_rms_eps = 1e-5f; params.rope_freq_base = 10000.0f; params.rope_freq_scale = 1.0f; + params.custom_f_norm_rms_eps = false; + params.custom_rope_freq_base = false; + params.custom_rope_freq_scale = false; + params.lora_alpha = 4; params.lora_r = 4; @@ -1956,6 +2002,7 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_ctx = std::stoi(argv[i]); + params->custom_n_ctx = true; } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; @@ -1974,18 +2021,21 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->f_norm_rms_eps = std::stof(argv[i]); + params->custom_f_norm_rms_eps = true; } else if (arg == "--rope-freq-base") { if (++i >= argc) { invalid_param = true; break; } params->rope_freq_base = std::stof(argv[i]); + params->custom_rope_freq_base = true; } else if (arg == "--rope-freq-scale") { if (++i >= 
argc) { invalid_param = true; break; } params->rope_freq_scale = std::stof(argv[i]); + params->custom_rope_freq_scale = true; } else if (arg == "--lora-alpha") { if (++i >= argc) { invalid_param = true; @@ -2411,10 +2461,18 @@ int main(int argc, char ** argv) { opt->ctx = NULL; + load_default_lora_params_from_base_model(params.fn_model_base, &lora.hparams); + // set lora params from command line - lora.hparams.f_norm_rms_eps = params.f_norm_rms_eps; - lora.hparams.rope_freq_base = params.rope_freq_base; - lora.hparams.rope_freq_scale = params.rope_freq_scale; + if (params.custom_f_norm_rms_eps) { + lora.hparams.f_norm_rms_eps = params.f_norm_rms_eps; + } + if (params.custom_rope_freq_base) { + lora.hparams.rope_freq_base = params.rope_freq_base; + } + if (params.custom_rope_freq_scale) { + lora.hparams.rope_freq_scale = params.rope_freq_scale; + } lora.hparams.lora_r = params.lora_r; lora.hparams.lora_alpha = params.lora_alpha; lora.hparams.n_rank_attention_norm = params.n_rank_attention_norm; @@ -2465,7 +2523,10 @@ int main(int argc, char ** argv) { bool existed = load_checkpoint_lora_file(params.fn_checkpoint_in, &model, &lora, opt); if (existed) { - model.hparams.n_ctx = params.n_ctx; + // overwrite last n_ctx with user provided n_ctx + if (params.custom_n_ctx) { + model.hparams.n_ctx = params.n_ctx; + } const bool opt_param_count_changed = ( (lora.hparams.n_rank_attention_norm != params.n_rank_attention_norm) From c1c3b0e0c2ba9f17d4546c702abffdd4687fd2a5 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 5 Sep 2023 01:09:06 +0200 Subject: [PATCH 159/235] add gradient accumulation specify number accumulation steps with '--grad-acc N'. this will simulate a bigger batch size of grad_acc*batch. --- examples/finetune/finetune.cpp | 193 ++++++++++-------- .../train-text-from-scratch.cpp | 122 ++++++----- ggml.c | 150 ++++++++------ ggml.h | 5 +- 4 files changed, 268 insertions(+), 202 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index bba1bdb7c877c..6f133ac5f0d14 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1730,6 +1730,7 @@ struct train_params { int n_ctx; int n_threads; int n_batch; + int n_gradient_accumulation; bool custom_n_ctx; @@ -1804,6 +1805,7 @@ struct train_params get_default_train_params() { params.n_ctx = 128; params.n_threads = 6; params.n_batch = 8; + params.n_gradient_accumulation = 1; params.custom_n_ctx = false; @@ -1880,6 +1882,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); + fprintf(stderr, " --grad-acc N Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation); fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); @@ -2015,6 +2018,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_batch = std::stoi(argv[i]); + } else if (arg == "--grad-acc") { + if (++i >= argc) { + invalid_param = true; + break; + 
} + params->n_gradient_accumulation = std::stoi(argv[i]); } else if (arg == "--norm-rms-eps") { if (++i >= argc) { invalid_param = true; @@ -2299,83 +2308,85 @@ void print_duration(float fmillis) { printf("%02lld:%02lld:%02lld", hours, minutes, seconds); } -void opt_callback(void * vdata, float * sched) { +void opt_callback(void * vdata, int accum_step, float * sched) { struct opt_callback_data * data = (struct opt_callback_data *) vdata; struct train_params * params = data->params; struct ggml_opt_context * opt = data->opt; int n_batch = params->n_batch; int n_ctx = params->n_ctx; - int64_t now = ggml_time_ms(); - if (now > data->last_time) { - float dt = now - data->last_time; - if (data->time_per_iter == 0) { - data->time_per_iter = dt; - } else { - const float gain = 0.7f; - data->time_per_iter = data->time_per_iter*(1.0f-gain) + dt*gain; + if (accum_step == 0) { + int64_t now = ggml_time_ms(); + if (now > data->last_time) { + float dt = now - data->last_time; + if (data->time_per_iter == 0) { + data->time_per_iter = dt; + } else { + const float gain = 0.7f; + data->time_per_iter = data->time_per_iter*(1.0f-gain) + dt*gain; + } } - } - data->last_time = now; - float remaining_time = 0; - if (data->time_per_iter > 0) { - const int n_iter = params->use_adam ? params->adam_n_iter : params->lbfgs_n_iter; - const int done_iter = opt->iter - data->first_iter; - const int remaining_iter = n_iter - done_iter; - remaining_time = remaining_iter * data->time_per_iter; - } - - const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); - if (save_now) { - int new_iters = opt->iter - data->last_save_iter; - data->lora->train_its += new_iters; - data->lora->train_samples += new_iters * n_batch; - data->lora->train_tokens += new_iters * n_batch * n_ctx; - - if (strlen(params->fn_checkpoint_out) > 0) { - save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, opt, params->pattern_fn_it, opt->iter, params->fn_latest); - save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, opt, params->pattern_fn_it, -1, params->fn_latest); + data->last_time = now; + float remaining_time = 0; + if (data->time_per_iter > 0) { + const int n_iter = params->use_adam ? 
params->adam_n_iter : params->lbfgs_n_iter; + const int done_iter = opt->iter - data->first_iter; + const int remaining_iter = n_iter - done_iter; + remaining_time = remaining_iter * data->time_per_iter; + } + + const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); + if (save_now) { + int new_iters = opt->iter - data->last_save_iter; + data->lora->train_its += new_iters; + data->lora->train_samples += new_iters * n_batch; + data->lora->train_tokens += new_iters * n_batch * n_ctx; + + if (strlen(params->fn_checkpoint_out) > 0) { + save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, opt, params->pattern_fn_it, opt->iter, params->fn_latest); + save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, opt, params->pattern_fn_it, -1, params->fn_latest); + } + if (strlen(params->fn_lora_out) > 0) { + save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, opt->iter, params->fn_latest); + save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, -1, params->fn_latest); + } + data->last_save_iter = opt->iter; } - if (strlen(params->fn_lora_out) > 0) { - save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, opt->iter, params->fn_latest); - save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, -1, params->fn_latest); + + *sched = (opt->iter < params->warmup) + ? (float) opt->iter / (float) params->warmup + : cosine_decay_restart( + params->cos_decay_steps, + params->cos_decay_min, + opt->iter - params->warmup, + params->cos_decay_restart, + params->enable_restart); + float min_sched = params->adam_min_alpha / params->adam_alpha; + *sched = min_sched + *sched * (1.0f - min_sched); + + int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); + if (impr_plot > 0) impr_plot = 0; + if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0; + printf("%s: iter=%*d sched=%f loss=%f", + __func__, 6, opt->iter, *sched, opt->loss_after); + if (data->time_per_iter > 0) { + printf(" dt="); + print_duration(data->time_per_iter); + printf(" eta="); + print_duration(remaining_time); } - data->last_save_iter = opt->iter; - } - - *sched = (opt->iter < params->warmup) - ? 
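// [editor's note, inserted as a comment] the schedule computed here is
// linear warmup followed by cosine decay (optionally with restarts):
//     sched = iter / warmup                                          for iter < warmup
//     sched = cosine_decay_restart(steps, min, iter - warmup, ...)   afterwards
// and it is then mapped into [min_sched, 1] via
//     min_sched = adam_min_alpha / adam_alpha
//     sched     = min_sched + sched * (1 - min_sched)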
(float) opt->iter / (float) params->warmup - : cosine_decay_restart( - params->cos_decay_steps, - params->cos_decay_min, - opt->iter - params->warmup, - params->cos_decay_restart, - params->enable_restart); - float min_sched = params->adam_min_alpha / params->adam_alpha; - *sched = min_sched + *sched * (1.0f - min_sched); - - int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); - if (impr_plot > 0) impr_plot = 0; - if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0; - printf("%s: iter=%*d sched=%f loss=%f", - __func__, 6, opt->iter, *sched, opt->loss_after); - if (data->time_per_iter > 0) { - printf(" dt="); - print_duration(data->time_per_iter); - printf(" eta="); - print_duration(remaining_time); - } - - float improvement = opt->loss_before - opt->loss_after; - const float plot_scale = 10.0f; - int bar_len = (int)(1 + improvement*plot_scale + 0.5); - printf(" |"); - for (int i=0; i"); - // printf("improvement: %*d>", impr_plot, (int)0); - printf("\n"); + + float improvement = opt->loss_before - opt->loss_after; + const float plot_scale = 10.0f; + int bar_len = (int)(1 + improvement*plot_scale + 0.5); + printf(" |"); + for (int i=0; i"); + // printf("improvement: %*d>", impr_plot, (int)0); + printf("\n"); + } if (data->shuffle_countdown < n_batch) { printf("%s: reshuffle samples\n", __func__); @@ -2491,30 +2502,32 @@ int main(int argc, char ** argv) { // set opt params from command line if (params.use_adam) { opt->params = ggml_opt_default_params(GGML_OPT_ADAM); - opt->params.print_forward_graph = false; - opt->params.print_backward_graph = false; - opt->params.n_threads = params.n_threads; - opt->params.past = params.opt_past; - opt->params.delta = params.opt_delta; - opt->params.max_no_improvement = params.opt_max_no_improvement; - opt->params.adam.n_iter = params.adam_n_iter; - opt->params.adam.sched = 1.0f; - opt->params.adam.alpha = params.adam_alpha; - opt->params.adam.decay = params.adam_decay; - opt->params.adam.decay_min_ndim = params.adam_decay_min_ndim; - opt->params.adam.beta1 = params.adam_beta1; - opt->params.adam.beta2 = params.adam_beta2; - opt->params.adam.gclip = params.adam_gclip; - opt->params.adam.eps_f = params.adam_eps_f; + opt->params.print_forward_graph = false; + opt->params.print_backward_graph = false; + opt->params.n_threads = params.n_threads; + opt->params.past = params.opt_past; + opt->params.delta = params.opt_delta; + opt->params.max_no_improvement = params.opt_max_no_improvement; + opt->params.n_gradient_accumulation = params.n_gradient_accumulation; + opt->params.adam.n_iter = params.adam_n_iter; + opt->params.adam.sched = 1.0f; + opt->params.adam.alpha = params.adam_alpha; + opt->params.adam.decay = params.adam_decay; + opt->params.adam.decay_min_ndim = params.adam_decay_min_ndim; + opt->params.adam.beta1 = params.adam_beta1; + opt->params.adam.beta2 = params.adam_beta2; + opt->params.adam.gclip = params.adam_gclip; + opt->params.adam.eps_f = params.adam_eps_f; } else { opt->params = ggml_opt_default_params(GGML_OPT_LBFGS); - opt->params.print_forward_graph = false; - opt->params.print_backward_graph = false; - opt->params.n_threads = params.n_threads; - opt->params.past = params.opt_past; - opt->params.delta = params.opt_delta; - opt->params.max_no_improvement = params.opt_max_no_improvement; - opt->params.lbfgs.n_iter = params.lbfgs_n_iter; + opt->params.print_forward_graph = false; + opt->params.print_backward_graph = false; + opt->params.n_threads = params.n_threads; + opt->params.past = 
params.opt_past; + opt->params.delta = params.opt_delta; + opt->params.max_no_improvement = params.opt_max_no_improvement; + opt->params.n_gradient_accumulation = params.n_gradient_accumulation; + opt->params.lbfgs.n_iter = params.lbfgs_n_iter; } ggml_allocr * alloc = NULL; diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index f31427a9984ca..21dacfebaa8c6 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1299,8 +1299,9 @@ struct train_params { int n_ff; int n_threads; - int n_batch; int n_examples; + int n_batch; + int n_gradient_accumulation; float f_norm_rms_eps; float rope_freq_base; @@ -1362,8 +1363,9 @@ struct train_params get_default_train_params() { params.n_ff = 768; params.n_threads = 6; - params.n_batch = 8; params.n_examples = 1; + params.n_batch = 8; + params.n_gradient_accumulation = 1; params.f_norm_rms_eps = 1e-5f; params.rope_freq_base = 10000.0f; @@ -1428,8 +1430,9 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); - fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); + fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); + fprintf(stderr, " --grad-acc N Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation); fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? 
"on" : "off"); fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); @@ -1591,6 +1594,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_batch = std::stoi(argv[i]); + } else if (arg == "--grad-acc") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_gradient_accumulation = std::stoi(argv[i]); } else if (arg == "-n" || arg == "--examples") { if (++i >= argc) { invalid_param = true; @@ -1779,45 +1788,48 @@ struct opt_callback_data { struct ggml_tensor * target_probs; }; -void opt_callback(void * vdata, float * sched) { +void opt_callback(void * vdata, int accum_step, float * sched) { struct opt_callback_data * data = (struct opt_callback_data *) vdata; struct train_params * params = data->params; struct ggml_opt_context * opt = data->opt; int n_batch = params->n_batch; int n_ctx = params->n_ctx; - const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); - if (save_now) { - int new_iters = opt->iter - data->last_save_iter; - data->model->train_its += new_iters; - data->model->train_samples += new_iters * n_batch; - data->model->train_tokens += new_iters * n_batch * n_ctx; + if (accum_step == 0) { + const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); + if (save_now) { + int new_iters = opt->iter - data->last_save_iter; + data->model->train_its += new_iters; + data->model->train_samples += new_iters * n_batch; + data->model->train_tokens += new_iters * n_batch * n_ctx; - if (strlen(params->fn_checkpoint_out) > 0) { - save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, opt, params->pattern_fn_it, opt->iter, params->fn_latest); - save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, opt, params->pattern_fn_it, -1, params->fn_latest); + if (strlen(params->fn_checkpoint_out) > 0) { + save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, opt, params->pattern_fn_it, opt->iter, params->fn_latest); + save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, opt, params->pattern_fn_it, -1, params->fn_latest); + } + if (strlen(params->fn_model_out) > 0) { + save_llama_model_file(params->fn_model_out, params->fn_vocab_model, data->model, params->pattern_fn_it, opt->iter, params->fn_latest); + save_llama_model_file(params->fn_model_out, params->fn_vocab_model, data->model, params->pattern_fn_it, -1, params->fn_latest); + } + data->last_save_iter = opt->iter; } - if (strlen(params->fn_model_out) > 0) { - save_llama_model_file(params->fn_model_out, params->fn_vocab_model, data->model, params->pattern_fn_it, opt->iter, params->fn_latest); - save_llama_model_file(params->fn_model_out, params->fn_vocab_model, data->model, params->pattern_fn_it, -1, params->fn_latest); - } - data->last_save_iter = opt->iter; - } - *sched = (opt->iter < params->warmup) - ? (float) opt->iter / (float) params->warmup - : cosine_decay_restart( - params->cos_decay_steps, - params->cos_decay_min, - opt->iter - params->warmup, - params->cos_decay_restart, - params->enable_restart); - float min_sched = params->adam_min_alpha / params->adam_alpha; - *sched = min_sched + *sched * (1.0f - min_sched); + *sched = (opt->iter < params->warmup) + ? 
(float) opt->iter / (float) params->warmup + : cosine_decay_restart( + params->cos_decay_steps, + params->cos_decay_min, + opt->iter - params->warmup, + params->cos_decay_restart, + params->enable_restart); + float min_sched = params->adam_min_alpha / params->adam_alpha; + *sched = min_sched + *sched * (1.0f - min_sched); - int impr_plot = std::isnan(opt->loss_after) ? 0 : -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); - printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); + int impr_plot = std::isnan(opt->loss_after) ? 0 : -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); + printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); + + } if (data->shuffle_countdown < n_batch) { printf("%s: reshuffle samples\n", __func__); @@ -1917,29 +1929,31 @@ int main(int argc, char ** argv) { struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); - opt_params_adam.print_forward_graph = false; - opt_params_adam.print_backward_graph = false; - opt_params_adam.n_threads = params.n_threads; - opt_params_adam.past = params.opt_past; - opt_params_adam.delta = params.opt_delta; - opt_params_adam.max_no_improvement = params.opt_max_no_improvement; - opt_params_adam.adam.n_iter = params.adam_n_iter; - opt_params_adam.adam.sched = 1.0f; - opt_params_adam.adam.alpha = params.adam_alpha; - opt_params_adam.adam.decay = params.adam_decay; - opt_params_adam.adam.decay_min_ndim = params.adam_decay_min_ndim; - opt_params_adam.adam.beta1 = params.adam_beta1; - opt_params_adam.adam.beta2 = params.adam_beta2; - opt_params_adam.adam.gclip = params.adam_gclip; - opt_params_adam.adam.eps_f = params.adam_eps_f; - - opt_params_lbfgs.print_forward_graph = false; - opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = params.n_threads; - opt_params_adam.past = params.opt_past; - opt_params_adam.delta = params.opt_delta; - opt_params_adam.max_no_improvement = params.opt_max_no_improvement; - opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; + opt_params_adam.print_forward_graph = false; + opt_params_adam.print_backward_graph = false; + opt_params_adam.n_threads = params.n_threads; + opt_params_adam.past = params.opt_past; + opt_params_adam.delta = params.opt_delta; + opt_params_adam.max_no_improvement = params.opt_max_no_improvement; + opt_params_adam.n_gradient_accumulation = params.n_gradient_accumulation; + opt_params_adam.adam.n_iter = params.adam_n_iter; + opt_params_adam.adam.sched = 1.0f; + opt_params_adam.adam.alpha = params.adam_alpha; + opt_params_adam.adam.decay = params.adam_decay; + opt_params_adam.adam.decay_min_ndim = params.adam_decay_min_ndim; + opt_params_adam.adam.beta1 = params.adam_beta1; + opt_params_adam.adam.beta2 = params.adam_beta2; + opt_params_adam.adam.gclip = params.adam_gclip; + opt_params_adam.adam.eps_f = params.adam_eps_f; + + opt_params_lbfgs.print_forward_graph = false; + opt_params_lbfgs.print_backward_graph = false; + opt_params_lbfgs.n_threads = params.n_threads; + opt_params_lbfgs.past = params.opt_past; + opt_params_lbfgs.delta = params.opt_delta; + opt_params_lbfgs.max_no_improvement = params.opt_max_no_improvement; + opt_params_lbfgs.n_gradient_accumulation = params.n_gradient_accumulation; + 
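// [editor's sketch, inserted as a comment] n_gradient_accumulation makes each
// optimizer iteration average gradients over several forward/backward passes,
// simulating an effective batch size of n_batch * n_gradient_accumulation.
// Roughly, per iteration (cf. the ggml.c hunks below):
//
//     const float accum_norm = 1.0f / (float) n_accum;
//     ggml_set_zero(opt->adam.g);                       // clear accumulator g
//     for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
//         callback(callback_data, accum_step, &sched);  // callback loads the next sub-batch
//         ggml_set_f32(f->grad, 1.0f);
//         ggml_graph_compute(gb, &cplan);               // forward + backward
//         ggml_opt_acc_grad(np, ps, g, accum_norm);     // g += grad * 1/n_accum
//         fx += ggml_get_f32_1d(f, 0);
//     }
//     fx *= accum_norm;                                 // averaged loss
//
// after which a single Adam (or L-BFGS) update is applied using the averaged g.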
opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; opt->ctx = model.ctx; opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; diff --git a/ggml.c b/ggml.c index f5419c34e6a6b..d7dc3cb441fbf 100644 --- a/ggml.c +++ b/ggml.c @@ -19112,7 +19112,7 @@ static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * } static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) { - int i = 0; + int64_t i = 0; for (int p = 0; p < np; ++p) { const int64_t ne = ggml_nelements(ps[p]) ; // TODO: add function to get all elements at once @@ -19122,6 +19122,17 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g } } +static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g, float scale) { + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int64_t j = 0; j < ne; ++j) { + g[i++] += ggml_get_f32_1d(ps[p]->grad, j) * scale; + } + } +} + // // ADAM // @@ -19170,26 +19181,37 @@ static enum ggml_opt_result ggml_opt_adam( const float eps = params.adam.eps; const float gclip = params.adam.gclip; const int decay_min_ndim = params.adam.decay_min_ndim; + const int n_accum = MAX(1, params.n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + float * g = opt->adam.g->data; // gradients float * m = opt->adam.m->data; // first moment float * v = opt->adam.v->data; // second moment float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values - if (callback) { - callback(callback_data, &sched); - } - - // compute the function value - // ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - ggml_graph_compute(gb, &cplan); - opt->adam.fx_prev = ggml_get_f32_1d(f, 0); + + // compute the function value + + float fx = 0; + ggml_set_zero(opt->adam.g); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + callback(callback_data, accum_step, &sched); + } + // ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(gb, &cplan); + ggml_opt_acc_grad(np, ps, g, accum_norm); + fx += ggml_get_f32_1d(f, 0); + } + fx *= accum_norm; + + opt->adam.fx_prev = fx; opt->adam.fx_best = opt->adam.fx_prev; if (pf) { pf[opt->iter % params.past] = opt->adam.fx_prev; @@ -19234,12 +19256,8 @@ static enum ggml_opt_result ggml_opt_adam( if (gclip > 0.0f) { // gradient clipping ggml_float sum = 0.0; - for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]); - for (int64_t j = 0; j < ne; ++j) { - float g = ggml_get_f32_1d(ps[p]->grad, j); - sum += (ggml_float)(g*g); - } + for (int64_t i = 0; i < nx; ++i) { + sum += (ggml_float)(g[i]*g[i]); } ggml_float norm = sqrt(sum); if (norm > (ggml_float) gclip) { @@ -19253,10 +19271,10 @@ static enum ggml_opt_result ggml_opt_adam( const int64_t ne = ggml_nelements(ps[p]); const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? 
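// [editor's note, inserted as a comment] weight decay is applied only to
// parameters with at least decay_min_ndim dimensions, so 1-d tensors such as
// norm weights can be exempted from decay; the effective per-tensor decay is
// additionally scaled by the schedule `sched`.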
decay : 0.0f) * sched; for (int64_t j = 0; j < ne; ++j) { - float x = ggml_get_f32_1d(ps[p], j); - float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm; - m[i] = m[i]*beta1 + g*(1.0f - beta1); - v[i] = v[i]*beta2 + g*g*(1.0f - beta2); + float x = ggml_get_f32_1d(ps[p], j); + float g_ = g[i]*gnorm; + m[i] = m[i]*beta1 + g_*(1.0f - beta1); + v[i] = v[i]*beta2 + g_*g_*(1.0f - beta2); float mh = m[i]*beta1h; float vh = v[i]*beta2h; vh = sqrtf(vh) + eps; @@ -19267,16 +19285,20 @@ static enum ggml_opt_result ggml_opt_adam( } } - if (callback) { - callback(callback_data, &sched); + fx = 0; + ggml_set_zero(opt->adam.g); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + callback(callback_data, accum_step, &sched); + } + // ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(gb, &cplan); + ggml_opt_acc_grad(np, ps, g, accum_norm); + fx += ggml_get_f32_1d(f, 0); } + fx *= accum_norm; - // ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - - ggml_graph_compute(gb, &cplan); - - const float fx = ggml_get_f32_1d(f, 0); opt->loss_after = fx; @@ -19373,6 +19395,9 @@ static enum ggml_opt_result linesearch_backtracking( const float dec = 0.5f; const float inc = 2.1f; + const int n_accum = MAX(1, params->n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + if (*step <= 0.f) { return GGML_LINESEARCH_INVALID_PARAMETERS; } @@ -19390,12 +19415,6 @@ static enum ggml_opt_result linesearch_backtracking( dgtest = params->lbfgs.ftol*dginit; while (true) { - if (callback) { - // LBFG-S does not support learning rate -> ignore learning schedule - float sched = 0; - callback(callback_data, &sched); - } - ggml_vec_cpy_f32(nx, x, xp); ggml_vec_mad_f32(nx, x, d, *step); @@ -19403,14 +19422,22 @@ static enum ggml_opt_result linesearch_backtracking( { ggml_opt_set_params(np, ps, x); - //ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - - ggml_graph_compute(gb, cplan); - - ggml_opt_get_grad(np, ps, g); + *fx = 0; + memset(g, 0, sizeof(float)*nx); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, accum_step, &sched); + } + // ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(gb, cplan); + ggml_opt_acc_grad(np, ps, g, accum_norm); + *fx += ggml_get_f32_1d(f, 0); + } + *fx *= accum_norm; - *fx = ggml_get_f32_1d(f, 0); } ++count; @@ -19512,6 +19539,9 @@ static enum ggml_opt_result ggml_opt_lbfgs( float * pf = params.past > 0 ? 
opt->lbfgs.pf->data : NULL; // past function values + const int n_accum = MAX(1, params.n_gradient_accumulation); + const float accum_norm = 1.0f / (float) n_accum; + float fx = 0.0f; // cost function value float xnorm = 0.0f; // ||x|| float gnorm = 0.0f; // ||g|| @@ -19525,24 +19555,25 @@ static enum ggml_opt_result ggml_opt_lbfgs( float * lm_s = opt->lbfgs.lms->data; float * lm_y = opt->lbfgs.lmy->data; - if (callback) { - // LBFG-S does not support learning rate -> ignore learning schedule - float sched = 0; - callback(callback_data, &sched); - } - // evaluate the function value and its gradient { ggml_opt_set_params(np, ps, x); - //ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - - ggml_graph_compute(gb, &cplan); - - ggml_opt_get_grad(np, ps, g); - - fx = ggml_get_f32_1d(f, 0); + fx = 0; + memset(g, 0, sizeof(float)*nx); + for (int accum_step = 0; accum_step < n_accum; ++accum_step) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, accum_step, &sched); + } + // ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(gb, &cplan); + ggml_opt_acc_grad(np, ps, g, accum_norm); + fx += ggml_get_f32_1d(f, 0); + } + fx *= accum_norm; opt->loss_before = fx; opt->loss_after = fx; @@ -19729,6 +19760,8 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .print_forward_graph = true, .print_backward_graph = true, + .n_gradient_accumulation = 1, + .adam = { .n_iter = 10000, .sched = 1.000f, @@ -19757,6 +19790,8 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .print_forward_graph = true, .print_backward_graph = true, + .n_gradient_accumulation = 1, + .lbfgs = { .m = 6, .n_iter = 100, @@ -19790,7 +19825,7 @@ GGML_API void ggml_opt_init( if (opt->ctx == NULL) { struct ggml_init_params ctx_opt_params; if (opt->params.type == GGML_OPT_ADAM) { - ctx_opt_params.mem_size = GGML_MEM_ALIGN*2 + ggml_tensor_overhead()*2 + ggml_type_size(GGML_TYPE_F32)*nx*2; + ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3; if (opt->params.past > 0) { ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; } @@ -19808,6 +19843,7 @@ GGML_API void ggml_opt_init( switch (opt->params.type) { case GGML_OPT_ADAM: { + opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->adam.v = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->adam.pf = params.past > 0 diff --git a/ggml.h b/ggml.h index 0ab8cedeabff8..0d38a7110cd03 100644 --- a/ggml.h +++ b/ggml.h @@ -1708,7 +1708,7 @@ extern "C" { GGML_LINESEARCH_INVALID_PARAMETERS, }; - typedef void (*ggml_opt_callback)(void * data, float * sched); + typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched); // optimization parameters // @@ -1739,6 +1739,8 @@ extern "C" { bool print_forward_graph; bool print_backward_graph; + int n_gradient_accumulation; + // ADAM parameters struct { int n_iter; @@ -1784,6 +1786,7 @@ extern "C" { float loss_after; struct { + struct ggml_tensor * g; // current gradient struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment struct ggml_tensor * pf; // past function values From d07b6aac7790c1cfc0292a91e7c42588ca29b3e0 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 5 Sep 2023 02:18:17 +0200 Subject: [PATCH 160/235] fix tracking of train_samples and 
train_tokens --- examples/finetune/finetune.cpp | 8 ++++---- .../train-text-from-scratch/train-text-from-scratch.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 6f133ac5f0d14..d205367b32e17 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -2339,8 +2339,8 @@ void opt_callback(void * vdata, int accum_step, float * sched) { if (save_now) { int new_iters = opt->iter - data->last_save_iter; data->lora->train_its += new_iters; - data->lora->train_samples += new_iters * n_batch; - data->lora->train_tokens += new_iters * n_batch * n_ctx; + data->lora->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; + data->lora->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx; if (strlen(params->fn_checkpoint_out) > 0) { save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, opt, params->pattern_fn_it, opt->iter, params->fn_latest); @@ -2779,8 +2779,8 @@ int main(int argc, char ** argv) { int new_iters = opt->iter - opt_cb_data.last_save_iter; if (new_iters > 0) { lora.train_its += new_iters; - lora.train_samples += new_iters * n_batch; - lora.train_tokens += new_iters * n_batch * n_tokens; + lora.train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; + lora.train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; if (strlen(params.fn_checkpoint_out) > 0) { save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, opt, params.pattern_fn_it, opt->iter, params.fn_latest); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 21dacfebaa8c6..549302a81d6db 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1800,8 +1800,8 @@ void opt_callback(void * vdata, int accum_step, float * sched) { if (save_now) { int new_iters = opt->iter - data->last_save_iter; data->model->train_its += new_iters; - data->model->train_samples += new_iters * n_batch; - data->model->train_tokens += new_iters * n_batch * n_ctx; + data->model->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; + data->model->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx; if (strlen(params->fn_checkpoint_out) > 0) { save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, opt, params->pattern_fn_it, opt->iter, params->fn_latest); @@ -2122,8 +2122,8 @@ int main(int argc, char ** argv) { int new_iters = opt->iter - opt_cb_data.last_save_iter; model.train_its += new_iters; - model.train_samples += new_iters * n_batch; - model.train_tokens += new_iters * n_batch * n_tokens; + model.train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; + model.train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; if (params.n_examples > 0) { save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt, params.pattern_fn_it, opt->iter, params.fn_latest); From 786e786061daaef387d5d6f5a72039c81b462c0b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 5 Sep 2023 12:02:19 +0300 Subject: [PATCH 161/235] build : fix compile warnings --- examples/finetune/finetune.cpp | 24 ++++++++++++------------ ggml.c | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git 
a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index d205367b32e17..52c9c0eb4685c 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1747,18 +1747,18 @@ struct train_params {
     int32_t lora_r;
     int32_t lora_alpha;

-    int n_rank_attention_norm;
-    int n_rank_wq;
-    int n_rank_wk;
-    int n_rank_wv;
-    int n_rank_wo;
-    int n_rank_ffn_norm;
-    int n_rank_w1;
-    int n_rank_w2;
-    int n_rank_w3;
-    int n_rank_tok_embeddings;
-    int n_rank_norm;
-    int n_rank_output;
+    uint32_t n_rank_attention_norm;
+    uint32_t n_rank_wq;
+    uint32_t n_rank_wk;
+    uint32_t n_rank_wv;
+    uint32_t n_rank_wo;
+    uint32_t n_rank_ffn_norm;
+    uint32_t n_rank_w1;
+    uint32_t n_rank_w2;
+    uint32_t n_rank_w3;
+    uint32_t n_rank_tok_embeddings;
+    uint32_t n_rank_norm;
+    uint32_t n_rank_output;

     bool samples_start_after_nl;
     bool use_adam;
diff --git a/ggml.c b/ggml.c
index d7dc3cb441fbf..e9fdb0d10e77f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16476,7 +16476,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
     ggml_format_name(clone, "%s (clone)", ggml_get_name(node));

     return clone;
-};
+}

 void ggml_build_backward_gradient_checkpointing(
         struct ggml_context * ctx,

From d375b8f3aa60e355fc242ea8a5a03b7b263b904e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 5 Sep 2023 12:05:13 +0300
Subject: [PATCH 162/235] ggml : fix L-BFGS linesearch loop

---
 ggml.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index e9fdb0d10e77f..84b66603adafd 100644
--- a/ggml.c
+++ b/ggml.c
@@ -19467,7 +19467,6 @@ static enum ggml_opt_result linesearch_backtracking(
                 // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
                 return count;
             }
-            return count;
         }
     }

From 8c2d7e37f9832eb64b42e97e6134659b73252a67 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Wed, 6 Sep 2023 18:06:24 +0200
Subject: [PATCH 163/235] improve finetune time measurement

fix printf warnings on systems where int64_t is (long int).
change time datatypes to double because values get big with long training times.
exclude file saving from time measurement.
converge faster to the actual time per iteration by removing the very small first duration measured before the first iteration was performed.
fix bug in output of total training time: the reported value was 1000 times too small.
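The smoothing used for the per-iteration time below is an exponential moving average seeded by the first measured duration. A standalone sketch of the estimator, assuming the same 0.7 gain as the patch (struct and member names are illustrative only):

    // ETA estimation via an exponential moving average over per-iteration
    // wall time, in milliseconds.
    struct eta_estimator {
        double millis_per_iter = 0.0;

        void update(double dt_millis) {
            const double gain = 0.7;   // weight of the newest measurement
            millis_per_iter = (millis_per_iter == 0.0)
                ? dt_millis            // first sample seeds the average
                : millis_per_iter*(1.0 - gain) + dt_millis*gain;
        }

        double remaining_millis(int remaining_iters) const {
            return remaining_iters * millis_per_iter;
        }
    };

Skipping the duration measured before the first iteration and taking the timestamp after checkpoint saving keep one-off costs out of this average, which is why the estimate converges quickly to the true time per iteration.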
---
 examples/finetune/finetune.cpp | 54 +++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 52c9c0eb4685c..a2ad711fb82f1 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -2283,12 +2283,12 @@ struct opt_callback_data {
     struct ggml_tensor * target_probs;
     int first_iter;
     int64_t last_time;
-    float time_per_iter;
+    double millis_per_iter;
 };

-void print_duration(float fmillis) {
+void print_duration(double fmillis) {
     if (fmillis < 1000.0f) {
-        printf("%.1fms", fmillis);
+        printf("%.1fms", (float) fmillis);
         return;
     }
     const int64_t one_sec = 1000;
@@ -2296,16 +2296,17 @@
     const int64_t one_min = one_sec * 60;
     const int64_t one_hour = one_min * 60;
     const int64_t one_day = one_hour * 24;

-    int64_t millis = fmillis;
+    int64_t millis = (int64_t) fmillis;
     int64_t days = millis/one_day;
     int64_t hours = (millis - days*one_day)/one_hour;
     int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min;
     int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec;

+    // to print int64_t either cast to (long long int) or use macro PRId64 from <inttypes.h>
     if (days > 0) {
-        printf("%lldd ", days);
+        printf("%lldd ", (long long int) days);
     }
-    printf("%02lld:%02lld:%02lld", hours, minutes, seconds);
+    printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds);
 }

 void opt_callback(void * vdata, int accum_step, float * sched) {
@@ -2316,25 +2317,27 @@ void opt_callback(void * vdata, int accum_step, float * sched) {
     int n_ctx = params->n_ctx;

     if (accum_step == 0) {
+        // time measurement
        int64_t now = ggml_time_ms();
-        if (now > data->last_time) {
-            float dt = now - data->last_time;
-            if (data->time_per_iter == 0) {
-                data->time_per_iter = dt;
+        if (now > data->last_time && opt->iter > data->first_iter) {
+            double dt = now - data->last_time;
+            if (data->millis_per_iter == 0.0) {
+                data->millis_per_iter = dt;
            } else {
-                const float gain = 0.7f;
-                data->time_per_iter = data->time_per_iter*(1.0f-gain) + dt*gain;
+                const double gain = 0.7;
+                data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain;
            }
        }
-        data->last_time = now;
-        float remaining_time = 0;
-        if (data->time_per_iter > 0) {
+
+        double remaining_millis = 0.0;
+        if (data->millis_per_iter > 0.0) {
            const int n_iter = params->use_adam ? params->adam_n_iter : params->lbfgs_n_iter;
            const int done_iter = opt->iter - data->first_iter;
            const int remaining_iter = n_iter - done_iter;
-            remaining_time = remaining_iter * data->time_per_iter;
+            remaining_millis = remaining_iter * data->millis_per_iter;
        }

+        // file saving
        const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
        if (save_now) {
            int new_iters = opt->iter - data->last_save_iter;
@@ -2353,6 +2356,9 @@ void opt_callback(void * vdata, int accum_step, float * sched) {
        data->last_save_iter = opt->iter;
    }

+    // exclude file saving from time measurement, by measuring last_time after saving
+    data->last_time = ggml_time_ms();
+
    *sched = (opt->iter < params->warmup) ?
(float) opt->iter / (float) params->warmup : cosine_decay_restart( @@ -2369,11 +2375,13 @@ void opt_callback(void * vdata, int accum_step, float * sched) { if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0; printf("%s: iter=%*d sched=%f loss=%f", __func__, 6, opt->iter, *sched, opt->loss_after); - if (data->time_per_iter > 0) { + + + if (data->millis_per_iter > 0) { printf(" dt="); - print_duration(data->time_per_iter); + print_duration(data->millis_per_iter); printf(" eta="); - print_duration(remaining_time); + print_duration(remaining_millis); } float improvement = opt->loss_before - opt->loss_after; @@ -2747,7 +2755,7 @@ int main(int argc, char ** argv) { opt_cb_data.target_probs = target_probs; opt_cb_data.first_iter = opt->iter; opt_cb_data.last_time = ggml_time_ms(); - opt_cb_data.time_per_iter = 0; + opt_cb_data.millis_per_iter = 0.0; // measure required memory for work buffer size_t max_work_size = ggml_graph_plan(gb, params.n_threads).work_size + GGML_OBJECT_SIZE; @@ -2770,10 +2778,8 @@ int main(int argc, char ** argv) { ggml_free(ctx_input); int64_t t1 = ggml_time_ms(); - int64_t d = t1-t0; - float fd = (float) d * 1e-3; - printf("%s: total training ", __func__); - print_duration(fd); + printf("%s: total training time: ", __func__); + print_duration((double) (t1 - t0)); printf("\n"); int new_iters = opt->iter - opt_cb_data.last_save_iter; From c08fcf59479126b660e63eedcd0261dff34930d6 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 6 Sep 2023 20:11:22 +0200 Subject: [PATCH 164/235] specify default lora rank with '--lora-r N' '--lora-r N' will specify default rank for all tensors '--rank-wq N', etc. will override this default rank for specific tensor types. --- examples/finetune/finetune.cpp | 130 +++++++++++++++++++++++---------- 1 file changed, 90 insertions(+), 40 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index a2ad711fb82f1..632392970a568 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1760,6 +1760,19 @@ struct train_params { uint32_t n_rank_norm; uint32_t n_rank_output; + bool custom_n_rank_attention_norm; + bool custom_n_rank_wq; + bool custom_n_rank_wk; + bool custom_n_rank_wv; + bool custom_n_rank_wo; + bool custom_n_rank_ffn_norm; + bool custom_n_rank_w1; + bool custom_n_rank_w2; + bool custom_n_rank_w3; + bool custom_n_rank_tok_embeddings; + bool custom_n_rank_norm; + bool custom_n_rank_output; + bool samples_start_after_nl; bool use_adam; bool use_flash; @@ -1835,6 +1848,19 @@ struct train_params get_default_train_params() { params.n_rank_norm = 1; params.n_rank_output = 4; + params.custom_n_rank_attention_norm = false; + params.custom_n_rank_wq = false; + params.custom_n_rank_wk = false; + params.custom_n_rank_wv = false; + params.custom_n_rank_wo = false; + params.custom_n_rank_ffn_norm = false; + params.custom_n_rank_w1 = false; + params.custom_n_rank_w2 = false; + params.custom_n_rank_w3 = false; + params.custom_n_rank_tok_embeddings = false; + params.custom_n_rank_norm = false; + params.custom_n_rank_output = false; + params.samples_start_after_nl = false; params.use_adam = true; params.use_flash = true; @@ -1887,19 +1913,19 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); fprintf(stderr, " --lora-alpha N LORA 
alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha); - fprintf(stderr, " --lora-r N LORA r : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_r); - fprintf(stderr, " --rank-att-norm N LORA rank for attention norm tensor (default %d)\n", params->n_rank_attention_norm); - fprintf(stderr, " --rank-ffn-norm N LORA rank for feed-forward norm tensor (default %d)\n", params->n_rank_ffn_norm); - fprintf(stderr, " --rank-out-norm N LORA rank for output norm tensor (default %d)\n", params->n_rank_norm); - fprintf(stderr, " --rank-tok-embd N LORA rank for token embeddings tensor (default %d)\n", params->n_rank_tok_embeddings); - fprintf(stderr, " --rank-out N LORA rank for output tensor (default %d)\n", params->n_rank_output); - fprintf(stderr, " --rank-wq N LORA rank for wq tensor (default %d)\n", params->n_rank_wq); - fprintf(stderr, " --rank-wk N LORA rank for wk tensor (default %d)\n", params->n_rank_wk); - fprintf(stderr, " --rank-wv N LORA rank for wv tensor (default %d)\n", params->n_rank_wv); - fprintf(stderr, " --rank-wo N LORA rank for wo tensor (default %d)\n", params->n_rank_wo); - fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor (default %d)\n", params->n_rank_w1); - fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor (default %d)\n", params->n_rank_w2); - fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor (default %d)\n", params->n_rank_w3); + fprintf(stderr, " --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default %d)\n", params->lora_r); + fprintf(stderr, " --rank-att-norm N LORA rank for attention norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); + fprintf(stderr, " --rank-ffn-norm N LORA rank for feed-forward norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); + fprintf(stderr, " --rank-out-norm N LORA rank for output norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); + fprintf(stderr, " --rank-tok-embd N LORA rank for token embeddings tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-out N LORA rank for output tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-wq N LORA rank for wq tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n"); fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? 
"on" : "off"); fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); @@ -2063,72 +2089,84 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_rank_attention_norm = std::stoi(argv[i]); + params->custom_n_rank_attention_norm = true; } else if (arg == "--rank-ffn-norm") { if (++i >= argc) { invalid_param = true; break; } params->n_rank_ffn_norm = std::stoi(argv[i]); + params->custom_n_rank_ffn_norm = true; } else if (arg == "--rank-out-norm") { if (++i >= argc) { invalid_param = true; break; } params->n_rank_norm = std::stoi(argv[i]); + params->custom_n_rank_norm = true; } else if (arg == "--rank-tok-embd") { if (++i >= argc) { invalid_param = true; break; } params->n_rank_tok_embeddings = std::stoi(argv[i]); + params->custom_n_rank_tok_embeddings = true; } else if (arg == "--rank-out") { if (++i >= argc) { invalid_param = true; break; } params->n_rank_output = std::stoi(argv[i]); + params->custom_n_rank_output = true; } else if (arg == "--rank-wq") { if (++i >= argc) { invalid_param = true; break; } params->n_rank_wq = std::stoi(argv[i]); + params->custom_n_rank_wq = true; } else if (arg == "--rank-wk") { if (++i >= argc) { invalid_param = true; break; } params->n_rank_wk = std::stoi(argv[i]); + params->custom_n_rank_wk = true; } else if (arg == "--rank-wv") { if (++i >= argc) { invalid_param = true; break; } params->n_rank_wv = std::stoi(argv[i]); + params->custom_n_rank_wv = true; } else if (arg == "--rank-wo") { if (++i >= argc) { invalid_param = true; break; } params->n_rank_wo = std::stoi(argv[i]); + params->custom_n_rank_wo = true; } else if (arg == "--rank-w1") { if (++i >= argc) { invalid_param = true; break; } params->n_rank_w1 = std::stoi(argv[i]); + params->custom_n_rank_w1 = true; } else if (arg == "--rank-w2") { if (++i >= argc) { invalid_param = true; break; } params->n_rank_w2 = std::stoi(argv[i]); + params->custom_n_rank_w2 = true; } else if (arg == "--rank-w3") { if (++i >= argc) { invalid_param = true; break; } params->n_rank_w3 = std::stoi(argv[i]); + params->custom_n_rank_w3 = true; } else if (arg == "--samples-after-nl") { params->samples_start_after_nl = true; } else if (arg == "--use-lbfgs") { @@ -2494,18 +2532,30 @@ int main(int argc, char ** argv) { } lora.hparams.lora_r = params.lora_r; lora.hparams.lora_alpha = params.lora_alpha; - lora.hparams.n_rank_attention_norm = params.n_rank_attention_norm; - lora.hparams.n_rank_wq = params.n_rank_wq; - lora.hparams.n_rank_wk = params.n_rank_wk; - lora.hparams.n_rank_wv = params.n_rank_wv; - lora.hparams.n_rank_wo = params.n_rank_wo; - lora.hparams.n_rank_ffn_norm = params.n_rank_ffn_norm; - lora.hparams.n_rank_w1 = params.n_rank_w1; - lora.hparams.n_rank_w2 = params.n_rank_w2; - lora.hparams.n_rank_w3 = params.n_rank_w3; - lora.hparams.n_rank_tok_embeddings = params.n_rank_tok_embeddings; - lora.hparams.n_rank_norm = params.n_rank_norm; - lora.hparams.n_rank_output = params.n_rank_output; + int n_rank_attention_norm = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1; + int n_rank_wq = params.custom_n_rank_wq ? params.n_rank_wq : params.lora_r; + int n_rank_wk = params.custom_n_rank_wk ? params.n_rank_wk : params.lora_r; + int n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; + int n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; + int n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? 
params.n_rank_ffn_norm : 1; + int n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; + int n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; + int n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; + int n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; + int n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; + int n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; + lora.hparams.n_rank_attention_norm = n_rank_attention_norm; + lora.hparams.n_rank_wq = n_rank_wq; + lora.hparams.n_rank_wk = n_rank_wk; + lora.hparams.n_rank_wv = n_rank_wv; + lora.hparams.n_rank_wo = n_rank_wo; + lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; + lora.hparams.n_rank_w1 = n_rank_w1; + lora.hparams.n_rank_w2 = n_rank_w2; + lora.hparams.n_rank_w3 = n_rank_w3; + lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; + lora.hparams.n_rank_norm = n_rank_norm; + lora.hparams.n_rank_output = n_rank_output; // set opt params from command line if (params.use_adam) { @@ -2550,30 +2600,30 @@ int main(int argc, char ** argv) { } const bool opt_param_count_changed = ( - (lora.hparams.n_rank_attention_norm != params.n_rank_attention_norm) - || (lora.hparams.n_rank_wq != params.n_rank_wq) - || (lora.hparams.n_rank_wk != params.n_rank_wk) - || (lora.hparams.n_rank_wv != params.n_rank_wv) - || (lora.hparams.n_rank_wo != params.n_rank_wo) - || (lora.hparams.n_rank_ffn_norm != params.n_rank_ffn_norm) - || (lora.hparams.n_rank_w1 != params.n_rank_w1) - || (lora.hparams.n_rank_w2 != params.n_rank_w2) - || (lora.hparams.n_rank_w3 != params.n_rank_w3) - || (lora.hparams.n_rank_tok_embeddings != params.n_rank_tok_embeddings) - || (lora.hparams.n_rank_norm != params.n_rank_norm) - || (lora.hparams.n_rank_output != params.n_rank_output) + (lora.hparams.n_rank_attention_norm != n_rank_attention_norm) + || (lora.hparams.n_rank_wq != n_rank_wq) + || (lora.hparams.n_rank_wk != n_rank_wk) + || (lora.hparams.n_rank_wv != n_rank_wv) + || (lora.hparams.n_rank_wo != n_rank_wo) + || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) + || (lora.hparams.n_rank_w1 != n_rank_w1) + || (lora.hparams.n_rank_w2 != n_rank_w2) + || (lora.hparams.n_rank_w3 != n_rank_w3) + || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) + || (lora.hparams.n_rank_norm != n_rank_norm) + || (lora.hparams.n_rank_output != n_rank_output) ); const bool opt_past_changed = opt->params.past != params.opt_past; - GGML_ASSERT(opt_param_count_changed == false); - GGML_ASSERT(opt_past_changed == false); - if (opt_param_count_changed) { + print_lora_params(&lora.hparams); + GGML_ASSERT(!"Provided rank differs from checkpoint file. To use different rank start finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting."); // need to discard previous optimizer gradient statistics and opt_init with new shapes // TODO } if (opt_past_changed) { + GGML_ASSERT(!"Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. 
Aborting"); // need to discard previous optimizer past function value statistics and opt_init with new shapes // TODO } From de6170d8184137ec13496af5ac1aff77e1d083b2 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 6 Sep 2023 21:35:21 +0200 Subject: [PATCH 165/235] fix gradient accumulation bug where the same batch was used for each microstep --- examples/finetune/finetune.cpp | 4 ++-- examples/train-text-from-scratch/train-text-from-scratch.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 632392970a568..ed6bd87934dc5 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -2049,7 +2049,7 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { invalid_param = true; break; } - params->n_gradient_accumulation = std::stoi(argv[i]); + params->n_gradient_accumulation = std::max(1, std::stoi(argv[i])); } else if (arg == "--norm-rms-eps") { if (++i >= argc) { invalid_param = true; @@ -2449,7 +2449,7 @@ void opt_callback(void * vdata, int accum_step, float * sched) { data->samples_size, data->tokens_data, data->tokens_size, - opt->iter, + opt->iter * params->n_gradient_accumulation, data->tokens_input, data->target_probs); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 549302a81d6db..0a486f5538036 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1599,7 +1599,7 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { invalid_param = true; break; } - params->n_gradient_accumulation = std::stoi(argv[i]); + params->n_gradient_accumulation = std::max(1, std::stoi(argv[i])); } else if (arg == "-n" || arg == "--examples") { if (++i >= argc) { invalid_param = true; @@ -1846,7 +1846,7 @@ void opt_callback(void * vdata, int accum_step, float * sched) { data->samples_size, data->tokens_data, data->tokens_size, - opt->iter, + opt->iter * params->n_gradient_accumulation, data->tokens_input, data->target_logits, data->target_probs); From 0c2c9c7545e687538172478de98b605c3233223a Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 6 Sep 2023 22:45:36 +0200 Subject: [PATCH 166/235] fix gradient accumulation bug where the same batch was used for each microstep --- examples/finetune/finetune.cpp | 2 +- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index ed6bd87934dc5..d7c0a3360c1b5 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -2449,7 +2449,7 @@ void opt_callback(void * vdata, int accum_step, float * sched) { data->samples_size, data->tokens_data, data->tokens_size, - opt->iter * params->n_gradient_accumulation, + opt->iter*params->n_gradient_accumulation + accum_step, data->tokens_input, data->target_probs); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 0a486f5538036..bfe788a79c339 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1846,7 +1846,7 @@ void opt_callback(void * vdata, int accum_step, float * sched) { data->samples_size, data->tokens_data, data->tokens_size, - opt->iter * 
params->n_gradient_accumulation,
+            opt->iter*params->n_gradient_accumulation + accum_step,
             data->tokens_input,
             data->target_logits,
             data->target_probs);

From d7aade7d8a68596c2fb56f881367f28331139cef Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sat, 9 Sep 2023 17:01:54 +0200
Subject: [PATCH 167/235] support grouped-query-attention in ggml_flash_attn
 and ggml_flash_attn_back

k and v can now be repeated in q along ne[2].

in the forward pass just use modulo to compute k and v indices, like ik2 = iq2 % nek2.

in the backward pass this won't work as easily, because multiple threads will compete to accumulate to the same k->grad[:,ik1,ik2,ik3] and v->grad[:,iv1,iv2,iv3].
so we change the parallelization over q rows to be over k rows. this ensures non-overlapping (ik2,ik3) across threads.
in each thread we then iterate over the number of repetitions of k/v in q to compute iq2 as iq2 = ik2 + irep*nek2.

since ne2 is not the same for q, k and v we also change how the gradients are concatenated into the result tensor.
additionally the offsets of gradq, gradk and gradv in the result tensor are now memory aligned.

we also simplify the compute_backward part of flash_attn to use ggml_reshape instead of switching over the number of dimensions.
this needs a small change to ggml_reshape, removing the assertion that the second argument be contiguous. since only the shape (ne) of the second reshape argument is of relevance, its memory layout (nb) is irrelevant -> it can very well be non-contiguous.

change test-grad0 to also test for repeated k/v in q.
this changes the rng and now results in small gradient differences in softmax. these solely come from using the f16 exp table lookup in forward softmax: when temporarily changing softmax to use the actual exp function, the reported gradient differences go away. gradient differences coming solely from the f16 table lookup are acceptable.
added a note to explain this.

---
 ggml.c               | 700 ++++++++++++++++++++-----------------------
 tests/test-grad0.cpp |  47 +--
 2 files changed, 344 insertions(+), 403 deletions(-)

diff --git a/ggml.c b/ggml.c
index 84b66603adafd..5b1c3c79ca63d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -6585,7 +6585,7 @@ struct ggml_tensor * ggml_reshape(
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
     GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_is_contiguous(b));
+    // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous.
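The memory-aligned concatenation of gradq, gradk and gradv described in the message is computed with GGML_PAD in the hunks that follow. A standalone sketch of the same offset arithmetic, assuming an alignment of 16 bytes in place of GGML_MEM_ALIGN and made-up element counts:

    #include <cstddef>
    #include <cstdio>

    // Round size up to the next multiple of align (a power of two), like GGML_PAD.
    static std::size_t pad(std::size_t size, std::size_t align) {
        return (size + align - 1) & ~(align - 1);
    }

    int main() {
        const std::size_t tsize  = sizeof(float);  // GGML_TYPE_F32, block size 1
        const std::size_t align  = 16;             // stand-in for GGML_MEM_ALIGN
        const std::size_t elem_q = 64*32;          // example tensor sizes
        const std::size_t elem_k = 64*48;
        const std::size_t elem_v = 48*64;

        const std::size_t offs_q = 0;
        const std::size_t offs_k = offs_q + pad(elem_q*tsize, align);
        const std::size_t offs_v = offs_k + pad(elem_k*tsize, align);
        const std::size_t end    = offs_v + pad(elem_v*tsize, align);

        // gradq, gradk and gradv each start at an aligned offset of one flat
        // buffer, so the backward pass can slice them back out with
        // ggml_view_1d + ggml_reshape.
        std::printf("offs_k=%zu offs_v=%zu total=%zu bytes\n", offs_k, offs_v, end);
        return 0;
    }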
GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); bool is_node = false; @@ -7655,27 +7655,30 @@ struct ggml_tensor * ggml_flash_attn_back( // d shape [D,N,ne2,ne3] // q shape [D,N,ne2,ne3] - // k shape [D,M,ne2,ne3] - // v shape [M,D,ne2,ne3] + // k shape [D,M,kvne2,ne3] + // v shape [M,D,kvne2,ne3] - const int64_t D = q->ne[0]; - const int64_t N = q->ne[1]; - const int64_t M = k->ne[1]; - const int64_t ne2 = q->ne[2]; - const int64_t ne3 = q->ne[3]; + const int64_t D = q->ne[0]; + const int64_t N = q->ne[1]; + const int64_t M = k->ne[1]; + const int64_t ne2 = q->ne[2]; + const int64_t ne3 = q->ne[3]; + const int64_t kvne2 = k->ne[2]; GGML_ASSERT(k->ne[0] == D); GGML_ASSERT(v->ne[0] == M); GGML_ASSERT(v->ne[1] == D); GGML_ASSERT(d->ne[0] == D); GGML_ASSERT(d->ne[1] == N); - GGML_ASSERT(k->ne[2] == ne2); + GGML_ASSERT(k->ne[2] == kvne2); GGML_ASSERT(k->ne[3] == ne3); - GGML_ASSERT(v->ne[2] == ne2); + GGML_ASSERT(v->ne[2] == kvne2); GGML_ASSERT(v->ne[3] == ne3); GGML_ASSERT(d->ne[2] == ne2); GGML_ASSERT(d->ne[3] == ne3); + GGML_ASSERT(ne2 % kvne2 == 0); + bool is_node = false; if (q->grad || k->grad || v->grad) { @@ -7685,14 +7688,23 @@ struct ggml_tensor * ggml_flash_attn_back( } // store gradients of q, k and v as continuous tensors concatenated in result. - // q shape[D,N,ne2,ne3] ; k shape [D,M,ne2,ne3] ; v shape [M,D,ne2,ne3] - // gradq->data = result->data - // gradk->data = result->data + nb0*D*N*ne2*ne3 - // gradv->data = result->data + nb0*D*N*ne2*ne3 + nb0*D*M*ne2*ne3 // note: v and gradv are actually transposed, i.e. v->ne[0] != D. - int64_t ne[4] = {D,M+N+M,ne2,ne3}; + const int64_t elem_q = ggml_nelements(q); + const int64_t elem_k = ggml_nelements(k); + const int64_t elem_v = ggml_nelements(v); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + enum ggml_type result_type = GGML_TYPE_F32; + GGML_ASSERT(ggml_blck_size(result_type) == 1); + const size_t tsize = ggml_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); + const size_t end = offs_v + GGML_PAD(elem_v * tsize, GGML_MEM_ALIGN); + + const size_t nelements = (end + tsize - 1)/tsize; + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nelements); int32_t masked_i = masked ? 1 : 0; ggml_set_op_params(result, &masked_i, sizeof(masked_i)); @@ -14425,7 +14437,7 @@ static void ggml_compute_forward_flash_attn_f32( for (int64_t ic = 0; ic < nek1; ++ic) { // k indices const int ik3 = iq3; - const int ik2 = iq2; + const int ik2 = iq2 % nek2; const int ik1 = ic; // S indices @@ -14449,6 +14461,8 @@ static void ggml_compute_forward_flash_attn_f32( } // softmax + // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero. 
+ // dont forget to set their SW values to zero { float max = -INFINITY; ggml_vec_max_f32(M, &max, S); @@ -14509,9 +14523,13 @@ static void ggml_compute_forward_flash_attn_f32( const int i2 = iq2; const int i3 = iq3; - ggml_vec_dot_f32(nek1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + ggml_vec_dot_f32(nev0, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), S); } } @@ -14608,7 +14626,7 @@ static void ggml_compute_forward_flash_attn_f16( for (int64_t ic = 0; ic < nek1; ++ic) { // k indices const int ik3 = iq3; - const int ik2 = iq2; + const int ik2 = iq2 % nek2; const int ik1 = ic; // S indices @@ -14623,7 +14641,7 @@ static void ggml_compute_forward_flash_attn_f16( for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) { // k indices const int ik3 = iq3; - const int ik2 = iq2; + const int ik2 = iq2 % nek2; const int ik1 = ic; // S indices @@ -14648,6 +14666,8 @@ static void ggml_compute_forward_flash_attn_f16( } // softmax + // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero. + // dont forget to set their S values to zero { float max = -INFINITY; ggml_vec_max_f32(M, &max, S); @@ -14704,6 +14724,7 @@ static void ggml_compute_forward_flash_attn_f16( S16[i] = GGML_FP32_TO_FP16(S[i]); } + // todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16). if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) { for (int64_t ic = 0; ic < nev1; ++ic) { // dst indices @@ -14711,9 +14732,13 @@ static void ggml_compute_forward_flash_attn_f16( const int i2 = iq2; const int i3 = iq3; - ggml_vec_dot_f16(nek1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + ggml_vec_dot_f16(nev0, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), S16); } } else { @@ -14723,9 +14748,13 @@ static void ggml_compute_forward_flash_attn_f16( const int i2 = iq2; const int i3 = iq3; - ggml_vec_dot_f16_unroll(nek1, nbv1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + // v indices + const int iv2 = iq2 % nev2; + const int iv3 = iq3; + + ggml_vec_dot_f16_unroll(nev0, nbv1, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), S16); } } @@ -14984,10 +15013,38 @@ static void ggml_compute_forward_flash_attn_back_f32( return; } - // parallelize by q rows using ggml_vec_dot_f32 + const int64_t elem_q = ggml_nelements(q); + const int64_t elem_k = ggml_nelements(k); + const int64_t elem_v = ggml_nelements(v); - // total rows in q - const int nr = neq2*neq3; + enum ggml_type result_type = dst->type; + GGML_ASSERT(ggml_blck_size(result_type) == 1); + const size_t tsize = ggml_type_size(result_type); + + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); + + void * grad_q = (char *) dst->data; + void * grad_k = (char *) dst->data + offs_k; + void * grad_v = (char *) 
dst->data + offs_v; + + const size_t nbgq1 = nb0*neq0; + const size_t nbgq2 = nb0*neq0*neq1; + const size_t nbgq3 = nb0*neq0*neq1*neq2; + + const size_t nbgk1 = nb0*nek0; + const size_t nbgk2 = nb0*nek0*nek1; + const size_t nbgk3 = nb0*nek0*nek1*neq2; + + const size_t nbgv1 = nb0*nev0; + const size_t nbgv2 = nb0*nev0*nev1; + const size_t nbgv3 = nb0*nev0*nev1*neq2; + + // parallelize by k rows using ggml_vec_dot_f32 + + // total rows in k + const int nr = nek2*nek3; // rows per thread const int dr = (nr + nth - 1)/nth; @@ -15000,268 +15057,248 @@ static void ggml_compute_forward_flash_attn_back_f32( //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + // how often k2 (and v2) is repeated in q2 + int nrep = neq2/nek2; + for (int ir = ir0; ir < ir1; ++ir) { // q indices - const int iq3 = ir/(neq2); - const int iq2 = ir - iq3*neq2; - for ( int iq1 = 0; iq1 < neq1; ++iq1) { + const int ik3 = ir/(nek2); + const int ik2 = ir - ik3*nek2; + const int iq3 = ik3; + const int id3 = ik3; + const int iv3 = ik3; + const int iv2 = ik2; - // not sure about CACHE_LINE_SIZE_F32.. - // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? - float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); - float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); + for (int irep = 0; irep < nrep; ++irep) { + const int iq2 = ik2 + irep*nek2; + const int id2 = iq2; - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; - } + // (ik2 + irep*nek2) % nek2 == ik2 + for (int iq1 = 0; iq1 < neq1; ++iq1) { + const int id1 = iq1; - for (int64_t ic = 0; ic < nek1; ++ic) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2; - const int ik1 = ic; + // not sure about CACHE_LINE_SIZE_F32.. + // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? + float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); + float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); - // S indices - const int i1 = ik1; + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } - ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); - } + for (int64_t ic = 0; ic < nek1; ++ic) { + // k indices + const int ik1 = ic; - // scale - ggml_vec_scale_f32(nek1, S, scale); + // S indices + const int i1 = ik1; - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; - } + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); } - } - // softmax - { - float max = -INFINITY; - ggml_vec_max_f32(M, &max, S); + // scale + ggml_vec_scale_f32(nek1, S, scale); - ggml_float sum = 0.0; + if (masked) { + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } + } + } + + // softmax + // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero. 
+ // dont forget to set their SM values to zero { + float max = -INFINITY; + ggml_vec_max_f32(M, &max, S); + + ggml_float sum = 0.0; + { #ifdef GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(SM, 1, &max, SM, 1, Mup); - vvexpf(SM, SM, &Mup); - ggml_vec_sum_f32(Mup, &sum, SM); + max = -max; + vDSP_vsadd(SM, 1, &max, SM, 1, Mup); + vvexpf(SM, SM, &Mup); + ggml_vec_sum_f32(Mup, &sum, SM); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); - ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; - for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { - float * SR = S + i; - float * SW = SM + i; + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + float * SR = S + i; + float * SW = SM + i; - for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { - if (SR[j] == -INFINITY) { - SW[j] = 0.0f; - } else { + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (SR[j] == -INFINITY) { + SW[j] = 0.0f; + } else { #ifndef GGML_FLASH_ATTN_EXP_FP16 - const float val = expf(SR[j] - max); + const float val = expf(SR[j] - max); #else - ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); #endif - sump[j] += (ggml_float)val; - SW[j] = val; + sump[j] += (ggml_float)val; + SW[j] = val; + } } } - } - for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } #endif - } - - assert(sum > 0.0); - - sum = 1.0/sum; - ggml_vec_scale_f32(M, SM, sum); + } - } + assert(sum > 0.0); - // step-by-step explanation - { - // forward-process shape grads from backward process - // parallel_for iq2,iq3: - // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,iq2,iq3] += grad[kcur] - // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] - // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iq2,iq3] += grad[vcur] - // for iq1: - // kcur = k[:D,:M,iq2,iq3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur - // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur - // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 - // S0 = -Inf [D,1,1,1] - // ~S1[i] = dot(kcur[:D,i], qcur) - // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale - // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) - // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur - // ~S5[i] = dot(vcur[:,i], S4) - // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] - // ~dst[i,iq1,iq2,iq3] = S5[i] ^ - // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] - // dst backward-/ grad[dst] = d - // - // output gradients with their dependencies: - // - // grad[kcur] = grad[S1].T @ qcur - // grad[S1] = diag_mask_zero(grad[S3], P) * scale - // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // grad[S4] = grad[S5] @ vcur - // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur - // grad[qcur] = grad[S1] @ kcur - // grad[vcur] = grad[S5].T @ S4 - // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 - // - // in post-order: - // - // S1 = qcur @ kcur.T - // S2 = S1 * scale - // S3 = diag_mask_inf(S2, P) - // S4 = softmax(S3) - // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur - // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) - // grad[S1] = 
diag_mask_zero(grad[S3], P) * scale - // grad[qcur] = grad[S1] @ kcur - // grad[kcur] = grad[S1].T @ qcur - // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 - // - // using less variables (SM=S4): - // - // S = diag_mask_inf(qcur @ kcur.T * scale, P) - // SM = softmax(S) - // S = d[:D,iq1,iq2,iq3] @ vcur - // dot_SM_gradSM = dot(SM, S) - // S = SM * (S - dot(SM, S)) - // S = diag_mask_zero(S, P) * scale - // - // grad[q][:D,iq1,iq2,iq3] += S @ kcur - // grad[k][:D,:M,iq2,iq3] += S.T @ qcur - // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM - } + sum = 1.0/sum; + ggml_vec_scale_f32(M, SM, sum); - // S = gradSM = d[:D,iq1,iq2,iq3] @ vcur - // S = d[:D,iq1,iq2,iq3] @ vcur - // S[:M] += vcur[:M,ic] * d[ic,iq1,iq2,iq3] - ggml_vec_set_f32(M, S, 0); - for (int64_t ic = 0; ic < D; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + } - ggml_vec_mad_f32(M, - S, - (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), - *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); - } + // step-by-step explanation + { + // forward-process shape grads from backward process + // parallel_for ik2,ik3: + // for irep: + // iq2 = ik2 + irep*nek2 + // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,ik2,ik3] += grad[kcur] + // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] + // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iv2,iv3] += grad[vcur] + // for iq1: + // kcur = k[:D,:M,ik2,ik3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur + // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur + // vcur = v[:M,:D,iv2,iv3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 + // S0 = -Inf [D,1,1,1] + // ~S1[i] = dot(kcur[:D,i], qcur) + // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale + // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) + // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur + // ~S5[i] = dot(vcur[:,i], S4) + // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,id1,id2,id3] + // ~dst[i,iq1,iq2,iq3] = S5[i] ^ + // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3] + // dst backward-/ grad[dst] = d + // + // output gradients with their dependencies: + // + // grad[kcur] = grad[S1].T @ qcur + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S4] = grad[S5] @ vcur + // grad[S4] = d[:D,id1,id2,id3] @ vcur + // grad[qcur] = grad[S1] @ kcur + // grad[vcur] = grad[S5].T @ S4 + // grad[vcur] = d[:D,id1,id2,id3].T @ S4 + // + // in post-order: + // + // S1 = qcur @ kcur.T + // S2 = S1 * scale + // S3 = diag_mask_inf(S2, P) + // S4 = softmax(S3) + // grad[S4] = d[:D,id1,id2,id3] @ vcur + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[qcur] = grad[S1] @ kcur + // grad[kcur] = grad[S1].T @ qcur + // grad[vcur] = d[:D,id1,id2,id3].T @ S4 + // + // using less variables (SM=S4): + // + // S = diag_mask_inf(qcur @ kcur.T * scale, P) + // SM = softmax(S) + // S = d[:D,iq1,iq2,iq3] @ vcur + // dot_SM_gradSM = dot(SM, S) + // S = SM * (S - dot(SM, S)) + // S = diag_mask_zero(S, P) * scale + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[k][:D,:M,ik2,ik3] += S.T @ qcur + // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM + } - // S = SM * (S - dot(SM, S)) - float dot_SM_gradSM = 0; - ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); - ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); - ggml_vec_mul_f32 (M, S, S, SM); + 
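The index bookkeeping in the comment block above is the heart of the grouped-query-attention support: the forward pass maps query head iq2 to kv head iq2 % nek2, while the backward pass hands each kv head ik2 the disjoint set of query heads ik2 + irep*nek2, so no two threads accumulate into the same k/v gradient rows. A toy consistency check of that mapping, assuming illustrative head counts:

    #include <cassert>

    int main() {
        const int neq2 = 8;             // query heads
        const int nek2 = 2;             // kv heads (grouped-query attention)
        const int nrep = neq2 / nek2;   // how often each kv head repeats in q

        // each kv head owns a disjoint set of query heads, and the backward
        // mapping agrees with the forward modulo lookup
        for (int ik2 = 0; ik2 < nek2; ++ik2) {
            for (int irep = 0; irep < nrep; ++irep) {
                const int iq2 = ik2 + irep*nek2;
                assert(iq2 % nek2 == ik2);
            }
        }
        return 0;
    }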
// S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] + // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] + // for ic: + // S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3] + ggml_vec_set_f32(M, S, 0); + for (int64_t ic = 0; ic < D; ++ic) { + ggml_vec_mad_f32(M, + S, + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), + *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + } - // S = diag_mask_zero(S, P) * scale - if (masked) { - // for (int64_t i = P + iq1 + 1; i < M; i++) { - // S[i] = 0; - // } - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = 0; + // S = SM * (S - dot(SM, S)) + float dot_SM_gradSM = 0; + ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); + ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); + ggml_vec_mul_f32 (M, S, S, SM); + + // S = diag_mask_zero(S, P) * scale + if (masked) { + // for (int64_t i = P + iq1 + 1; i < M; i++) { + // S[i] = 0; + // } + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = 0; + } } } - } - ggml_vec_scale_f32(M, S, scale); - - void * grad_q = (char *) dst->data; - void * grad_k = (char *) dst->data + nb0*D*N*neq2*neq3; - void * grad_v = (char *) dst->data + nb0*D*N*neq2*neq3 + nb0*D*M*neq2*neq3; - - const size_t nbgq1 = nb0*neq0; - const size_t nbgq2 = nb0*neq0*neq1; - const size_t nbgq3 = nb0*neq0*neq1*neq2; - - const size_t nbgk1 = nb0*nek0; - const size_t nbgk2 = nb0*nek0*nek1; - const size_t nbgk3 = nb0*nek0*nek1*neq2; - - const size_t nbgv1 = nb0*nev0; - const size_t nbgv2 = nb0*nev0*nev1; - const size_t nbgv3 = nb0*nev0*nev1*neq2; - - // S shape [M,1] - // SM shape [M,1] - // kcur shape [D,M] - // qcur shape [D,1] - // vcur shape [M,D] - // - // grad[q][:D,iq1,iq2,iq3] += S @ kcur - // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] - // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic] - // - //// grad[q][ic,iq1,iq2,iq3] += dot(kcur[:,ic],S.T) - //// grad[q][ic,iq1,iq2,iq3] += dot(k[:D,ic,iq2,iq3],S.T) - for (int64_t ic = 0; ic < M; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - - ggml_vec_mad_f32(D, - (float *) ((char *) grad_q + (i1*nbgq1 + i2*nbgq2 + i3*nbgq3)), - (float *) ((char *) k->data + (ic*nbk1 + i2*nbk2 + i3*nbk3)), - S[ic]); - } - - // grad[k][:D,:M,iq2,iq3] += S.T @ qcur - // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] - // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] - for (int64_t ic = 0; ic < M; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - - // ggml_vec_set_f32(D, - // (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), - // 0); - ggml_vec_mad_f32(D, - (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), - (float *) ((char *) q->data + (i1*nbq1 + i2*nbq2 + i3*nbq3)), - S[ic]); - } + // todo: exclude known zero S[..] values from operation + ggml_vec_scale_f32(M, S, scale); + + // S shape [M,1] + // SM shape [M,1] + // kcur shape [D,M] + // qcur shape [D,1] + // vcur shape [M,D] + + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] + // for ic: + // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3] + // todo: exclude known zero S[..] 
values from loop + for (int64_t ic = 0; ic < M; ++ic) { + ggml_vec_mad_f32(D, + (float *) ((char *) grad_q + (iq1*nbgq1 + iq2*nbgq2 + iq3*nbgq3)), + (float *) ((char *) k->data + (ic*nbk1 + ik2*nbk2 + ik3*nbk3)), + S[ic]); + } - // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM - // grad[v][:M,ic,iq2,iq3] += d[:D,iq1,iq2,iq3].T[0,ic] * SM[:M] - // grad[v][:M,ic,iq2,iq3] += d[ic,iq1,iq2,iq3] * SM[:M] - for (int64_t ic = 0; ic < D; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // for ic: + // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] + // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] + // todo: exclude known zero S[..] values from loop + for (int64_t ic = 0; ic < M; ++ic) { + ggml_vec_mad_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + ik2*nbgk2 + ik3*nbgk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), + S[ic]); + } - // ggml_vec_set_f32(M, - // (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), - // 0); - ggml_vec_mad_f32(M, - (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), - SM, - *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); + // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T @ SM + // for ic: + // grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M] + // grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3] * SM[:M] + // todo: exclude known zero SM[..] values from mad + for (int64_t ic = 0; ic < D; ++ic) { + ggml_vec_mad_f32(M, + (float *) ((char *) grad_v + ( ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)), + SM, + *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); + } } } } @@ -17166,143 +17203,40 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor masked); } - if (src0->grad) { - struct ggml_tensor * grad_q = NULL; - const size_t nb0 = flash_grad->nb[0]; - const size_t offset = 0; - switch(src0->n_dims) { - case 2: - { - grad_q = ggml_view_2d(ctx, - flash_grad, - src0->ne[0], - src0->ne[1], - nb0*src0->ne[0], - offset); - } break; - case 3: - { - grad_q = ggml_view_3d(ctx, - flash_grad, - src0->ne[0], - src0->ne[1], - src0->ne[2], - nb0*src0->ne[0], - nb0*src0->ne[0]*src0->ne[1], - offset); - } break; - case 4: - { - grad_q = ggml_view_4d(ctx, - flash_grad, - src0->ne[0], - src0->ne[1], - src0->ne[2], - src0->ne[3], - nb0*src0->ne[0], - nb0*src0->ne[0]*src0->ne[1], - nb0*src0->ne[0]*src0->ne[1]*src0->ne[2], - offset); - } break; - } + struct ggml_tensor * src2 = tensor->src[2]; + const int64_t elem_q = ggml_nelements(src0); + const int64_t elem_k = ggml_nelements(src1); + const int64_t elem_v = ggml_nelements(src2); + + enum ggml_type result_type = flash_grad->type; + GGML_ASSERT(ggml_blck_size(result_type) == 1); + const size_t tsize = ggml_type_size(result_type); + const size_t offs_q = 0; + const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN); + const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN); + + if (src0->grad) { + struct ggml_tensor * view_q = ggml_view_1d(ctx, flash_grad, elem_q, offs_q); + struct ggml_tensor * grad_q = ggml_reshape(ctx, view_q, src0); src0->grad = ggml_add_or_set(ctx, src0->grad, grad_q, zero_table); } - if (src1->grad) { - struct ggml_tensor * grad_k = NULL; - const size_t nb0 = flash_grad->nb[0]; - const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3]; - switch(src1->n_dims) { - case 2: - { - grad_k = ggml_view_2d(ctx, - flash_grad, - src1->ne[0], - src1->ne[1], - 
                                nb0*src1->ne[0],
-                                offset);
-                        } break;
-                    case 3:
-                        {
-                            grad_k = ggml_view_3d(ctx,
-                                    flash_grad,
-                                    src1->ne[0],
-                                    src1->ne[1],
-                                    src1->ne[2],
-                                    nb0*src1->ne[0],
-                                    nb0*src1->ne[0]*src1->ne[1],
-                                    offset);
-                        } break;
-                    case 4:
-                        {
-                            grad_k = ggml_view_4d(ctx,
-                                    flash_grad,
-                                    src1->ne[0],
-                                    src1->ne[1],
-                                    src1->ne[2],
-                                    src1->ne[3],
-                                    nb0*src1->ne[0],
-                                    nb0*src1->ne[0]*src1->ne[1],
-                                    nb0*src1->ne[0]*src1->ne[1]*src1->ne[2],
-                                    offset);
-                        } break;
-                }
-
+                struct ggml_tensor * view_k = ggml_view_1d(ctx, flash_grad, elem_k, offs_k);
+                struct ggml_tensor * grad_k = ggml_reshape(ctx, view_k, src1);
                src1->grad = ggml_add_or_set(ctx,
                        src1->grad,
                        grad_k,
                        zero_table);
            }
-
-            struct ggml_tensor * opt0 = tensor->src[2];
-
-            if (opt0->grad) {
-                struct ggml_tensor * grad_v = NULL;
-                const size_t nb0 = flash_grad->nb[0];
-                const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3]
-                                    + nb0*src1->ne[0]*src1->ne[1]*src1->ne[2]*src1->ne[3];
-                switch(opt0->n_dims) {
-                    case 2:
-                        {
-                            grad_v = ggml_view_2d(ctx,
-                                    flash_grad,
-                                    opt0->ne[0],
-                                    opt0->ne[1],
-                                    nb0*opt0->ne[0],
-                                    offset);
-                        } break;
-                    case 3:
-                        {
-                            grad_v = ggml_view_3d(ctx,
-                                    flash_grad,
-                                    opt0->ne[0],
-                                    opt0->ne[1],
-                                    opt0->ne[2],
-                                    nb0*opt0->ne[0],
-                                    nb0*opt0->ne[0]*opt0->ne[1],
-                                    offset);
-                        } break;
-                    case 4:
-                        {
-                            grad_v = ggml_view_4d(ctx,
-                                    flash_grad,
-                                    opt0->ne[0],
-                                    opt0->ne[1],
-                                    opt0->ne[2],
-                                    opt0->ne[3],
-                                    nb0*opt0->ne[0],
-                                    nb0*opt0->ne[0]*opt0->ne[1],
-                                    nb0*opt0->ne[0]*opt0->ne[1]*opt0->ne[2],
-                                    offset);
-                        } break;
-                }
-
-                opt0->grad = ggml_add_or_set(ctx,
-                        opt0->grad,
+            if (src2->grad) {
+                struct ggml_tensor * view_v = ggml_view_1d(ctx, flash_grad, elem_v, offs_v);
+                struct ggml_tensor * grad_v = ggml_reshape(ctx, view_v, src2);
+                src2->grad = ggml_add_or_set(ctx,
+                        src2->grad,
                        grad_v,
                        zero_table);
            }
diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp
index 6a81c940e8146..34032a9081e9f 100644
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@@ -1359,6 +1359,10 @@ int main(int argc, const char ** argv) {
                             ggml_new_f32(ctx0, eps))));
                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
+                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
+                // this may result in gradients that differ from finite differences.
+                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
+                // if only the table lookup causes gradients to differ this is acceptable.
            }
        }
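As a point of reference for the NOTE above, and independent of ggml, the following minimal standalone C sketch performs the comparison the NOTE describes: the analytic softmax gradient dx = y*(dy - dot(y, dy)) (the same identity the flash-attn backward pass applies to SM further up) against a central finite difference computed with exact expf. All names and the toy loss are illustrative, not from the patch:

#include <math.h>
#include <stdio.h>

#define N 4

// numerically stable softmax using exact expf
static void softmax(const float * x, float * y) {
    float mx = x[0];
    for (int i = 1; i < N; ++i) if (x[i] > mx) mx = x[i];
    float sum = 0.0f;
    for (int i = 0; i < N; ++i) { y[i] = expf(x[i] - mx); sum += y[i]; }
    for (int i = 0; i < N; ++i) y[i] /= sum;
}

int main(void) {
    const float x0[N] = { 0.1f, -0.3f, 0.7f, 0.2f };
    float y[N];
    softmax(x0, y);

    // toy loss L = softmax(x)[2], so dL/dy is the unit vector e_2
    float dy[N] = { 0.0f, 0.0f, 1.0f, 0.0f };

    // analytic backward: dx = y * (dy - dot(y, dy))
    float dot = 0.0f;
    for (int i = 0; i < N; ++i) dot += y[i]*dy[i];
    for (int i = 0; i < N; ++i) {
        const float dx_analytic = y[i]*(dy[i] - dot);

        // central finite difference of L(x) = softmax(x)[2] w.r.t. x[i]
        const float eps = 1e-3f;
        float xp[N], xm[N], yp[N], ym[N];
        for (int k = 0; k < N; ++k) { xp[k] = x0[k]; xm[k] = x0[k]; }
        xp[i] += eps; xm[i] -= eps;
        softmax(xp, yp); softmax(xm, ym);
        const float dx_numeric = (yp[2] - ym[2]) / (2.0f*eps);

        printf("dx[%d]: analytic=%f numeric=%f\n", i, dx_analytic, dx_numeric);
    }
    return 0;
}

If the forward pass substitutes a coarser f16 table lookup for expf, these pairs can disagree by more than the test thresholds even though the backward formula itself is correct, which is the acceptable failure mode the NOTE describes.

@@ -1474,28 +1478,31 @@ int main(int argc, const char ** argv) {
        for (int masked = 0; masked <= 1; ++masked) {
            for (int ndims = 2; ndims <= 4; ++ndims) {
-                int64_t neq[4] = { D, N, B, ne[3] };
-                int64_t nek[4] = { D, M, B, ne[3] };
-                int64_t nev[4] = { M, D, B, ne[3] };
-                if (ndims == 2) {
-                    neq[2] = 1; neq[3] = 1;
-                    nek[2] = 1; nek[3] = 1;
-                    nev[2] = 1; nev[3] = 1;
-                } else if (ndims == 3) {
-                    neq[3] = 1;
-                    nek[3] = 1;
-                    nev[3] = 1;
-                }
-                x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                ggml_set_param(ctx0, x[0]);
-                ggml_set_param(ctx0, x[1]);
-                ggml_set_param(ctx0, x[2]);
+                int max_nrep = (ndims >= 3) ? 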
2 : 1; + for (int nrep = 1; nrep < max_nrep; ++nrep) { + int64_t neq[4] = { D, N, B*nrep, ne[3] }; + int64_t nek[4] = { D, M, B, ne[3] }; + int64_t nev[4] = { M, D, B, ne[3] }; + if (ndims == 2) { + neq[2] = 1; neq[3] = 1; + nek[2] = 1; nek[3] = 1; + nev[2] = 1; nev[3] = 1; + } else if (ndims == 3) { + neq[3] = 1; + nek[3] = 1; + nev[3] = 1; + } + x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f); + x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f); + x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f); + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); + ggml_set_param(ctx0, x[2]); - struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); + struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); - check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); + check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); + } } } } From 833a56c14459644bd202cd32a62ea71997108ba0 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 9 Sep 2023 17:07:54 +0200 Subject: [PATCH 168/235] add llama API functions to get grouped-query-attention n_head parameter 'n_head_kv'. --- llama.cpp | 8 ++++++++ llama.h | 16 +++++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/llama.cpp b/llama.cpp index 705b196511a54..55aa955d71c29 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5669,6 +5669,10 @@ int llama_n_head(const struct llama_context * ctx) { return ctx->model.hparams.n_head; } +int llama_n_head_kv(const struct llama_context * ctx) { + return ctx->model.hparams.n_head_kv; +} + int llama_n_rot(const struct llama_context * ctx) { return ctx->model.hparams.n_rot; } @@ -5701,6 +5705,10 @@ int llama_model_n_head(const struct llama_model * model) { return model->hparams.n_head; } +int llama_model_n_head_kv(const struct llama_model * model) { + return model->hparams.n_head_kv; +} + int llama_model_n_rot(const struct llama_model * model) { return model->hparams.n_rot; } diff --git a/llama.h b/llama.h index f702c54d8a342..c930a48d0d3de 100644 --- a/llama.h +++ b/llama.h @@ -245,13 +245,14 @@ extern "C" { LLAMA_API bool llama_mmap_supported (void); LLAMA_API bool llama_mlock_supported(void); - LLAMA_API int llama_n_vocab(const struct llama_context * ctx); - LLAMA_API int llama_n_ctx (const struct llama_context * ctx); - LLAMA_API int llama_n_embd (const struct llama_context * ctx); - LLAMA_API int llama_n_ff (const struct llama_context * ctx); - LLAMA_API int llama_n_head (const struct llama_context * ctx); - LLAMA_API int llama_n_rot (const struct llama_context * ctx); - LLAMA_API int llama_n_layer(const struct llama_context * ctx); + LLAMA_API int llama_n_vocab (const struct llama_context * ctx); + LLAMA_API int llama_n_ctx (const struct llama_context * ctx); + LLAMA_API int llama_n_embd (const struct llama_context * ctx); + LLAMA_API int llama_n_ff (const struct llama_context * ctx); + LLAMA_API int llama_n_head (const struct llama_context * ctx); + LLAMA_API int llama_n_head_kv(const struct llama_context * ctx); + LLAMA_API int llama_n_rot (const struct llama_context * ctx); + LLAMA_API int llama_n_layer (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx); @@ -260,6 +261,7 @@ extern "C" { LLAMA_API int llama_model_n_embd (const struct llama_model * model); LLAMA_API int llama_model_n_ff (const struct llama_model * model); LLAMA_API int 
llama_model_n_head (const struct llama_model * model); + LLAMA_API int llama_model_n_head_kv(const struct llama_model * model); LLAMA_API int llama_model_n_rot (const struct llama_model * model); LLAMA_API int llama_model_n_layer(const struct llama_model * model); From 35260f7d74bc745241d7a506f642bf420eff676d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 9 Sep 2023 17:10:23 +0200 Subject: [PATCH 169/235] fix finetune to support grouped-query-attention (using flash-attention) note: ggml changes to ggml_out_prod are necessary to support grouped-query-attention without flash-attention. --- examples/finetune/finetune.cpp | 161 ++++++++++++++++++--------------- 1 file changed, 90 insertions(+), 71 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index d7c0a3360c1b5..4d3f47fb4ca07 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -157,13 +157,26 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc } struct my_llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? - uint32_t n_embd = 4096; - uint32_t n_ff = 11008; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; + uint32_t n_embd = 4096; + uint32_t n_ff = 11008; + uint32_t n_head = 32; + uint32_t n_head_kv = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + + uint32_t n_gqa() const { + return n_head/n_head_kv; + } + + uint32_t n_embd_head() const { + return n_embd/n_head; + } + + uint32_t n_embd_gqa() const { + return n_embd/n_gqa(); + } bool operator!=(const my_llama_hparams& other) const { return memcmp(this, &other, sizeof(other)); @@ -404,13 +417,14 @@ void init_model(struct llama_model * input, struct my_llama_model * model, uint3 return tn_buf.data(); }; - hparams.n_vocab = llama_model_n_vocab(input); - hparams.n_ctx = n_ctx; - hparams.n_embd = llama_model_n_embd(input); - hparams.n_ff = llama_model_n_ff(input); - hparams.n_head = llama_model_n_head(input); - hparams.n_layer = llama_model_n_layer(input); - hparams.n_rot = llama_model_n_rot(input); + hparams.n_vocab = llama_model_n_vocab(input); + hparams.n_ctx = n_ctx; + hparams.n_embd = llama_model_n_embd(input); + hparams.n_ff = llama_model_n_ff(input); + hparams.n_head = llama_model_n_head(input); + hparams.n_head_kv = llama_model_n_head_kv(input); + hparams.n_layer = llama_model_n_layer(input); + hparams.n_rot = llama_model_n_rot(input); model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD)); model->norm = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM)); @@ -472,10 +486,11 @@ void set_param_lora(struct my_llama_lora * lora) { void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) { const auto & lparams = lora->hparams; - const uint32_t n_embd = model->hparams.n_embd; - const uint32_t n_layer = model->hparams.n_layer; - const uint32_t n_vocab = model->hparams.n_vocab; - const uint32_t n_ff = model->hparams.n_ff; + const uint32_t n_embd = model->hparams.n_embd; + const uint32_t n_embd_gqa = model->hparams.n_embd_gqa(); + const uint32_t n_layer = model->hparams.n_layer; + const uint32_t n_vocab = model->hparams.n_vocab; + const uint32_t n_ff = model->hparams.n_ff; lora->train_its = 0; lora->train_samples = 0; @@ -527,9 +542,9 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) layer.wq_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); layer.wq_b 
= ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); layer.wk_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd); - layer.wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd); + layer.wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd_gqa); layer.wv_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd); - layer.wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd); + layer.wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd_gqa); layer.wo_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); layer.wo_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); @@ -770,19 +785,23 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( ggml_set_scratch(ctx, { 0, 0, nullptr, }); const int n_past = 0; const int N = n_tokens; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = hparams.n_ff; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_head_kv = hparams.n_head_kv; + const int n_rot = hparams.n_rot; + const int n_ff = hparams.n_ff; + const int n_embd_head = hparams.n_embd_head(); + const int n_embd_gqa = hparams.n_embd_gqa(); const float rms_norm_eps = lora->hparams.f_norm_rms_eps; const float rope_freq_base = lora->hparams.rope_freq_base; const float rope_freq_scale = lora->hparams.rope_freq_scale; GGML_ASSERT((size_t) n_layer == lora->layers.size()); + GGML_ASSERT(n_embd_head == n_rot); auto set_name = [](struct ggml_tensor * t, const char * n) { ggml_set_name(t, n); @@ -853,64 +872,64 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b)); struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b)); - struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); - struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); - struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); - struct ggml_tensor * t05 = ggml_mul_mat (ctx, wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); - struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t08 = ggml_mul_mat (ctx, wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch); - struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); 
assert_shape_2d(t02, n_embd, N*n_batch); + struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); + struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); + struct ggml_tensor * t05 = ggml_mul_mat (ctx, wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); + struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd_head, n_head, N, n_batch); + struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd_head, n_head, N, n_batch); + struct ggml_tensor * t08 = ggml_mul_mat (ctx, wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd_gqa, N*n_batch); + struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd_head, n_head_kv, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd_head, n_head_kv, N, n_batch); + struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd_head, n_head_kv, N, n_batch); struct ggml_tensor * t11; if (ggml_is_quantized(wv->type)) { - struct ggml_tensor * t11_1 = ggml_mul_mat (ctx, wv, t04); set_name(t11_1, "t11_1"); assert_shape_2d(t11_1, n_embd, N*n_batch); - struct ggml_tensor * t11_2 = ggml_transpose(ctx, t11_1); set_name(t11_2, "t11_2"); assert_shape_2d(t11_2, N*n_batch, n_embd); - t11 = ggml_cont (ctx, t11_2); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd); + struct ggml_tensor * t11_1 = ggml_mul_mat (ctx, wv, t04); set_name(t11_1, "t11_1"); assert_shape_2d(t11_1, n_embd_gqa, N*n_batch); + struct ggml_tensor * t11_2 = ggml_transpose(ctx, t11_1); set_name(t11_2, "t11_2"); assert_shape_2d(t11_2, N*n_batch, n_embd_gqa); + t11 = ggml_cont (ctx, t11_2); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa); } else { - t11 = ggml_mul_mat (ctx, t04, wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd); + t11 = ggml_mul_mat (ctx, t04, wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa); } - struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd_head, n_head_kv); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd_head, n_head_kv); + struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd_head, N, n_head, n_batch); + struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd_head, N, n_head_kv, n_batch); + struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch); struct ggml_tensor * t16; if (enable_flash_attn) { - t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + t16 = ggml_flash_attn(ctx, t13, t14, t15, true); 
set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); } else { - struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); - struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); - struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); - struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); - t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); + struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); + struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); + struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); + t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); } - struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); - struct ggml_tensor * t20 = ggml_mul_mat (ctx, wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); - struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); - struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); - struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); - struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); - struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); - struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); - struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); + struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd_head, n_head, N, n_batch); + struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd_head, n_head, N, n_batch); + struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); 
set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); + struct ggml_tensor * t20 = ggml_mul_mat (ctx, wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); + struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); + struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); + struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); + struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); + struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); cur = t30; if (enable_checkpointing) { checkpoints.push_back(cur); } } - struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = ggml_repeat (ctx, norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); - struct ggml_tensor * t34 = ggml_mul_mat (ctx, output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); + struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = ggml_repeat (ctx, norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); + struct ggml_tensor * t34 = ggml_mul_mat (ctx, output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); if (enable_checkpointing) { checkpoints.push_back(t31); From aea8b6be745b2763acf0159d49d37e8c9801966a Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 9 Sep 2023 18:37:45 +0200 Subject: [PATCH 170/235] support broadcastable a in out_prod(a, b) and backward pass of broadcasting mul_mat(a, b) --- ggml.c | 87 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/ggml.c b/ggml.c index 5b1c3c79ca63d..2a8d95ec8282b 100644 --- a/ggml.c +++ b/ggml.c @@ -4363,10 +4363,9 @@ static inline bool ggml_can_mul_mat(const 
struct ggml_tensor * t0, const struct static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return - (t0->ne[1] == t1->ne[1]) && - (t0->ne[2] == t1->ne[2]) && - (t0->ne[3] == t1->ne[3]); + return (t0->ne[1] == t1->ne[1]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); } enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { @@ -6358,7 +6357,8 @@ struct ggml_tensor * ggml_out_prod( is_node = true; } - const int64_t ne[4] = { a->ne[0], b->ne[0], a->ne[2], b->ne[3] }; + // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3] + const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); result->op = GGML_OP_OUT_PROD; @@ -16832,36 +16832,81 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix // ds1 = t.T.dot(dt) - // tensor.shape [m,p] - // src0.shape [n,m] - // src1.shape [n,p] + // tensor.shape [m,p,qq,rr] + // src0.shape [n,m,q1,r1] + // src1.shape [n,p,qq,rr] // necessary for llama if (src0->grad) { + struct ggml_tensor * s1_tg = + ggml_out_prod(ctx, // [n,m,qq,rr] + src1, // [n,p,qq,rr] + tensor->grad); // [m,p,qq,rr] + const int64_t n = s1_tg->ne[0]; + const int64_t m = s1_tg->ne[1]; + const int64_t qq = s1_tg->ne[2]; + const int64_t rr = s1_tg->ne[3]; + const int64_t q1 = src0->ne[2]; + const int64_t r1 = src0->ne[3]; + const int64_t nq = qq/q1; + const int64_t nr = rr/r1; + GGML_ASSERT(qq % q1 == 0); + GGML_ASSERT(rr % r1 == 0); + const bool ne2_broadcasted = qq > q1; + const bool ne3_broadcasted = rr > r1; + // handling broadcasted will create a lot of overhead. + // this could be greatly reduced if we had a ggml_sum_repetitions function. + // ggml_sum_repetitions(ctx, u with ne=[a,b,c,d], v with ne=[A,B,C,D]) -> ne=[A,B,C,D] + // with a % A == 0, b % B == 0, etc. 
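+            // (illustration, not from the patch: with hypothetical shapes
+            //  u ne=[4,6,8,2] and v ne=[4,6,2,2], the 8/2 = 4 repetitions along ne[2]
+            //  would be summed, reducing u to ne=[4,6,2,2] -- exactly the reduction
+            //  needed to fold the gradient of a broadcast tensor back into its
+            //  original shape.)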
+ // TODO: implement such function if necessary, it should be quite trivial + if (ne2_broadcasted) { + printf("%s: ne2_broadcasted\n", __func__); + // sum ne2 repetitions + // s1_tg->ne == [n,m,qq=nq*q1,rr] + s1_tg = ggml_reshape_4d(ctx, s1_tg, n*m, nq, q1, rr); // [n*m,nq,q1,rr] + s1_tg = ggml_transpose(ctx, s1_tg); // [nq,n*m,q1,rr] + s1_tg = ggml_cont(ctx, s1_tg); // [nq,n*m,q1,rr] + s1_tg = ggml_sum_rows(ctx, s1_tg); // [1,n*m,q1,rr] + // due to following reshape we can omit this: + // s1_tg = ggml_reshape_4d(ctx, s1_tg, n, m, q1, rr); // [n,m,q1,rr] + } + if (ne3_broadcasted) { + printf("%s: ne3_broadcasted\n", __func__); + // sum ne3 repetitions + // s1_tg->ne == [n,m,q1,rr=nr*r1] + s1_tg = ggml_reshape_4d(ctx, s1_tg, n*m, q1, nr, r1); // [n*m,q1,nr,r1] + s1_tg = ggml_permute(ctx, s1_tg, 1, 2, 0, 3); // [nr,n*m,q1,r1] + s1_tg = ggml_cont(ctx, s1_tg); // [nr,n*m,q1,r1] + s1_tg = ggml_sum_rows(ctx, s1_tg); // [1,n*m,q1,r1] + // due to following reshape we can omit this: + // s1_tg = ggml_reshape_4d(ctx, s1_tg, n, m, q1, r1); // [n,m,q1,r1] + } + if (ne2_broadcasted || ne3_broadcasted) { + // make sure ne and n_dims match + s1_tg = ggml_reshape(ctx, s1_tg, src0); + } src0->grad = ggml_add_or_set(ctx, - src0->grad, - ggml_out_prod(ctx, // [n,m] - src1, // [n,p] - tensor->grad), // [m,p] + src0->grad, // [n,m,q1,r1] + s1_tg, // [n,m,q1,r1] zero_table); } if (src1->grad) { src1->grad = ggml_add_or_set(ctx, - src1->grad, - // ggml_mul_mat(ctx, // [n,p] - // ggml_cont(ctx, // [m,n] - // ggml_transpose(ctx, src0)), // [m,n] - // tensor->grad), // [m,p] + src1->grad, // [n,p,qq,rr] + // ggml_mul_mat(ctx, // [n,p,qq,rr] + // ggml_cont(ctx, // [m,n,q1,r1] + // ggml_transpose(ctx, src0)), // [m,n,q1,r1] + // tensor->grad), // [m,p,qq,rr] // // when src0 is bigger than tensor->grad (this is mostly the case in llama), // // avoid transpose of src0, rather transpose smaller tensor->grad // // and then use ggml_out_prod - ggml_out_prod(ctx, // [n,p] - src0, // [n,m] - ggml_transpose(ctx, // [p,m] - tensor->grad)), // [m,p] + ggml_out_prod(ctx, // [n,p,qq,rr] + src0, // [n,m,q1,r1] + ggml_transpose(ctx, // [p,m,qq,rr] + tensor->grad)), // [m,p,qq,rr] zero_table); } } break; From dd3278619d046e1836d456736744d95202414ff7 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 9 Sep 2023 18:38:29 +0200 Subject: [PATCH 171/235] test broadcasting mul_mat backward pass --- tests/test-grad0.cpp | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 34032a9081e9f..6d731e2208776 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -749,25 +749,35 @@ int main(int argc, const char ** argv) { { const int nargs = 2; - for (int ndims = 2; ndims <= 2; ++ndims) { + for (int ndims = 2; ndims <= 4; ++ndims) { + int max_nrep = (ndims >= 3) ? 
2 : 1; x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - { - int64_t ne2[4]; - get_random_dims(ne2, 4); - ne2[0] = ne[0]; - x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); - } + for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) { + for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) { + { + int64_t ne2[4]; + get_random_dims(ne2, 4); + ne2[0] = ne[0]; + ne2[2] = nrep2 * ne[2]; + ne2[3] = nrep3 * ne[3]; + x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); + } - ggml_set_param(ctx0, x[0]); - ggml_set_param(ctx0, x[1]); + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); - struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]); - struct ggml_tensor * f = ggml_sum(ctx0, m); + struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]); + struct ggml_tensor * f = ggml_sum(ctx0, m); - GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims); + GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims); - check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - check_mat_mul(m, x[1], x[0]); + check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + if (ndims == 2) { + // check_mat_mul does not support ndims > 2 + check_mat_mul(m, x[1], x[0]); + } + } + } } } From 97385268997005bf342b42d345a7d2500f9697eb Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 9 Sep 2023 18:46:35 +0200 Subject: [PATCH 172/235] decouple random number generator of each operation test when changing one test the rng of others tests is not influenced anymore --- tests/test-grad0.cpp | 54 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 6d731e2208776..717d334ecc8f8 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -407,6 +407,7 @@ int main(int argc, const char ** argv) { } } + unsigned seed_iter = 1; // original loop: 1000 int niter = 4; @@ -418,6 +419,10 @@ int main(int argc, const char ** argv) { niter = atoi(argv[1]); } for (int iter = 0; iter < niter; ++iter) { + srand(seed_iter); + seed_iter = rand(); + unsigned seed = rand(); + printf("test-grad0: iter:%d/%d\n", iter, niter); struct ggml_context * ctx0 = ggml_init(params); @@ -427,6 +432,7 @@ int main(int argc, const char ** argv) { // add f32 { + srand(seed); const int nargs = 2; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -443,6 +449,7 @@ int main(int argc, const char ** argv) { // add f16 { + srand(seed); const int nargs = 2; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -459,6 +466,7 @@ int main(int argc, const char ** argv) { // sub { + srand(seed); const int nargs = 2; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -475,6 +483,7 @@ int main(int argc, const char ** argv) { // mul { + srand(seed); const int nargs = 2; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -491,6 +500,7 @@ int main(int argc, const char ** argv) { // div { + srand(seed); const int nargs = 2; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -507,6 +517,7 @@ int main(int argc, const char ** argv) { // sqr { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 2; ++ndims) { @@ -523,6 +534,7 @@ int main(int argc, const char ** argv) { // sqrt { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 2; ++ndims) { @@ -539,6 +551,7 @@ int main(int argc, const char ** argv) { // log { + srand(seed); const int 
nargs = 1; for (int ndims = 1; ndims <= 2; ++ndims) { @@ -555,6 +568,7 @@ int main(int argc, const char ** argv) { // sum { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 2; ++ndims) { @@ -572,6 +586,7 @@ int main(int argc, const char ** argv) { // sum_rows { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -589,6 +604,7 @@ int main(int argc, const char ** argv) { // mean, not yet fully implemented if(0) { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -606,6 +622,7 @@ int main(int argc, const char ** argv) { // argmax if (0) { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -622,6 +639,7 @@ int main(int argc, const char ** argv) { // repeat { + srand(seed); int64_t ne2[4]; get_random_dims(ne2, 4); @@ -644,6 +662,7 @@ int main(int argc, const char ** argv) { // repeat back { + srand(seed); int64_t ne2[4]; get_random_dims(ne2, 4); @@ -682,6 +701,7 @@ int main(int argc, const char ** argv) { // sgn { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -698,6 +718,7 @@ int main(int argc, const char ** argv) { // neg { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -714,6 +735,7 @@ int main(int argc, const char ** argv) { // step { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -731,6 +753,7 @@ int main(int argc, const char ** argv) { // tanh, not yet fully implemented if(0) { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -747,6 +770,7 @@ int main(int argc, const char ** argv) { // mul_mat { + srand(seed); const int nargs = 2; for (int ndims = 2; ndims <= 4; ++ndims) { @@ -784,6 +808,7 @@ int main(int argc, const char ** argv) { // elu, not yet fully implemented if(0) { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -800,6 +825,7 @@ int main(int argc, const char ** argv) { // relu { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -817,6 +843,7 @@ int main(int argc, const char ** argv) { // gelu, not yet fully implemented if(0) { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -833,6 +860,7 @@ int main(int argc, const char ** argv) { // silu { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 2; ++ndims) { @@ -854,6 +882,7 @@ int main(int argc, const char ** argv) { // rms_norm { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 2; ++ndims) { @@ -870,6 +899,7 @@ int main(int argc, const char ** argv) { // scale { + srand(seed); const int nargs = 2; int64_t ne2[4]; @@ -890,6 +920,7 @@ int main(int argc, const char ** argv) { // cpy f32 { + srand(seed); const int nargs = 2; for (int ndims = 1; ndims <= 2; ++ndims) { @@ -907,6 +938,7 @@ int main(int argc, const char ** argv) { // cpy f16 { + srand(seed); const int nargs = 2; for (int ndims = 1; ndims <= 2; ++ndims) { @@ -924,6 +956,7 @@ int main(int argc, const char ** argv) { // reshape (1d->nd) { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 2; ++ndims) { @@ -947,6 +980,7 @@ int main(int argc, const char ** argv) { // reshape (nd->1d) { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 2; ++ndims) { @@ -970,6 +1004,7 @@ int main(int argc, const char ** argv) { // acc 1d { + srand(seed); int64_t ne2[4] = { 1, 1, 1, 1 }; const int nargs = 2; @@ -997,6 +1032,7 @@ int main(int argc, const char ** argv) { // acc 2d { + srand(seed); 
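            // (not part of the patch, summarizing the scheme added above: the srand(seed)
            //  lines inserted throughout this file reset the rng to the same per-iteration
            //  state at the top of every test block,
            //      srand(seed_iter); seed_iter = rand(); unsigned seed = rand(); // once per iteration
            //      srand(seed);                                                  // at the start of each test
            //  so editing one test no longer shifts the random dims and data of the others.)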
int64_t ne2[4] = { 1, 1, 1, 1 }; int64_t max_offsets[4] = { 0, 0, 0, 0 }; int64_t offsets[4] = { 0, 0, 0, 0 }; @@ -1029,6 +1065,7 @@ int main(int argc, const char ** argv) { // acc 3d { + srand(seed); int64_t ne2[4] = { 1, 1, 1, 1 }; int64_t max_offsets[4] = { 0, 0, 0, 0 }; int64_t offsets[4] = { 0, 0, 0, 0 }; @@ -1063,6 +1100,7 @@ int main(int argc, const char ** argv) { // acc 4d { + srand(seed); int64_t ne2[4] = { 1, 1, 1, 1 }; int64_t max_offsets[4] = { 0, 0, 0, 0 }; int64_t offsets[4] = { 0, 0, 0, 0 }; @@ -1099,6 +1137,7 @@ int main(int argc, const char ** argv) { // set_1d { + srand(seed); int64_t ne2[4]; const int nargs = 2; @@ -1126,6 +1165,7 @@ int main(int argc, const char ** argv) { // set_2d { + srand(seed); int64_t ne2[4]; int64_t max_offsets[4] = { 0, 0, 0, 0 }; int64_t offsets[4] = { 0, 0, 0, 0 }; @@ -1158,6 +1198,7 @@ int main(int argc, const char ** argv) { // view_1d { + srand(seed); const int nargs = 1; for (int ndims = 1; ndims <= 4; ++ndims) { @@ -1181,6 +1222,7 @@ int main(int argc, const char ** argv) { // view_2d { + srand(seed); int64_t ne2[4]; int64_t nb2[4]; @@ -1211,6 +1253,7 @@ int main(int argc, const char ** argv) { // view_3d { + srand(seed); int64_t ne2[4] = {1,1,1,1}; int64_t nb2[4] = {0,0,0,0}; @@ -1242,6 +1285,7 @@ int main(int argc, const char ** argv) { // permute { + srand(seed); int64_t ne2[4]; const int nargs = 1; @@ -1275,6 +1319,7 @@ int main(int argc, const char ** argv) { // transpose { + srand(seed); int64_t ne2[4]; const int nargs = 1; @@ -1302,6 +1347,7 @@ int main(int argc, const char ** argv) { // get_rows { + srand(seed); int64_t ne2[4] = {ne[0], ne[1], 1, 1}; int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1}; const int nargs = 1; @@ -1318,6 +1364,7 @@ int main(int argc, const char ** argv) { // diag_mask_inf { + srand(seed); const int nargs = 1; const int ndims = 2; @@ -1333,6 +1380,7 @@ int main(int argc, const char ** argv) { // diag_mask_zero { + srand(seed); const int nargs = 1; const int ndims = 2; @@ -1348,6 +1396,7 @@ int main(int argc, const char ** argv) { // softmax { + srand(seed); const int nargs = 1; int64_t ne2[4]; @@ -1378,6 +1427,7 @@ int main(int argc, const char ** argv) { // cross_entropy_loss { + srand(seed); const int nargs = 1; int64_t ne2[4]; @@ -1408,6 +1458,7 @@ int main(int argc, const char ** argv) { // rope f32 { + srand(seed); const int nargs = 1; int64_t ne2[4]; @@ -1442,6 +1493,7 @@ int main(int argc, const char ** argv) { // rope f16 { + srand(seed); const int nargs = 1; int64_t ne2[4]; @@ -1476,6 +1528,7 @@ int main(int argc, const char ** argv) { // flash_attn f32 { + srand(seed); const int nargs = 3; int64_t ne2[4]; @@ -1520,6 +1573,7 @@ int main(int argc, const char ** argv) { // flash_attn f16, not yet fully implemented if(0) { + srand(seed); const int nargs = 3; int64_t ne2[4]; From d3aaf0876afec3fc3ec466daa73f08e787861ac2 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 9 Sep 2023 18:47:27 +0200 Subject: [PATCH 173/235] add comment briefly describing what ggml_repeat_back does --- ggml.h | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml.h b/ggml.h index 0d38a7110cd03..20e67f4fd80f5 100644 --- a/ggml.h +++ b/ggml.h @@ -836,6 +836,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // sums repetitions in a into shape of b GGML_API struct ggml_tensor * ggml_repeat_back( struct ggml_context * ctx, struct ggml_tensor * a, From d3f1b438a84bda322ee9d2c674b7c10393f1d940 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 9 Sep 2023 18:55:18 +0200 Subject: [PATCH 174/235] simplify broadcasting 
mul_mat backward using ggml_repeat_back --- ggml.c | 37 ++----------------------------------- 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/ggml.c b/ggml.c index 2a8d95ec8282b..f6dca255f66db 100644 --- a/ggml.c +++ b/ggml.c @@ -16842,48 +16842,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_out_prod(ctx, // [n,m,qq,rr] src1, // [n,p,qq,rr] tensor->grad); // [m,p,qq,rr] - const int64_t n = s1_tg->ne[0]; - const int64_t m = s1_tg->ne[1]; const int64_t qq = s1_tg->ne[2]; const int64_t rr = s1_tg->ne[3]; const int64_t q1 = src0->ne[2]; const int64_t r1 = src0->ne[3]; - const int64_t nq = qq/q1; - const int64_t nr = rr/r1; - GGML_ASSERT(qq % q1 == 0); - GGML_ASSERT(rr % r1 == 0); const bool ne2_broadcasted = qq > q1; const bool ne3_broadcasted = rr > r1; - // handling broadcasted will create a lot of overhead. - // this could be greatly reduced if we had a ggml_sum_repetitions function. - // ggml_sum_repetitions(ctx, u with ne=[a,b,c,d], v with ne=[A,B,C,D]) -> ne=[A,B,C,D] - // with a % A == 0, b % B == 0, etc. - // TODO: implement such function if necessary, it should be quite trivial - if (ne2_broadcasted) { - printf("%s: ne2_broadcasted\n", __func__); - // sum ne2 repetitions - // s1_tg->ne == [n,m,qq=nq*q1,rr] - s1_tg = ggml_reshape_4d(ctx, s1_tg, n*m, nq, q1, rr); // [n*m,nq,q1,rr] - s1_tg = ggml_transpose(ctx, s1_tg); // [nq,n*m,q1,rr] - s1_tg = ggml_cont(ctx, s1_tg); // [nq,n*m,q1,rr] - s1_tg = ggml_sum_rows(ctx, s1_tg); // [1,n*m,q1,rr] - // due to following reshape we can omit this: - // s1_tg = ggml_reshape_4d(ctx, s1_tg, n, m, q1, rr); // [n,m,q1,rr] - } - if (ne3_broadcasted) { - printf("%s: ne3_broadcasted\n", __func__); - // sum ne3 repetitions - // s1_tg->ne == [n,m,q1,rr=nr*r1] - s1_tg = ggml_reshape_4d(ctx, s1_tg, n*m, q1, nr, r1); // [n*m,q1,nr,r1] - s1_tg = ggml_permute(ctx, s1_tg, 1, 2, 0, 3); // [nr,n*m,q1,r1] - s1_tg = ggml_cont(ctx, s1_tg); // [nr,n*m,q1,r1] - s1_tg = ggml_sum_rows(ctx, s1_tg); // [1,n*m,q1,r1] - // due to following reshape we can omit this: - // s1_tg = ggml_reshape_4d(ctx, s1_tg, n, m, q1, r1); // [n,m,q1,r1] - } if (ne2_broadcasted || ne3_broadcasted) { - // make sure ne and n_dims match - s1_tg = ggml_reshape(ctx, s1_tg, src0); + // sum broadcast repetitions of s1_tg into shape of src0 + s1_tg = ggml_repeat_back(ctx, s1_tg, src0); } src0->grad = ggml_add_or_set(ctx, From 917d2870b4f2774f4849d83ffb3ce8f9155d944a Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 9 Sep 2023 20:52:53 +0200 Subject: [PATCH 175/235] add cgraph evaluation order member and corresponding enum type this controls in which order ggml_build_forward visits source nodes. by default the nodes are visited left to right, i.e. src[0] first. in some cases it is beneficial for ggml-alloc to visit in a different order. two possible orders are supported: left-to-right (src[0] first) and right-to-left (src[0] last). --- ggml.c | 10 ++++++++-- ggml.h | 8 ++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index f6dca255f66db..92c10e9957d85 100644 --- a/ggml.c +++ b/ggml.c @@ -17398,8 +17398,12 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } for (int i = 0; i < GGML_MAX_SRC; ++i) { - if (node->src[i]) { - ggml_visit_parents(cgraph, node->src[i]); + const int k = + (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i : + (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? 
(GGML_MAX_SRC-1-i) : + /* unknown order, just fall back to using i*/ i; + if (node->src[k]) { + ggml_visit_parents(cgraph, node->src[k]); } } @@ -17458,6 +17462,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, /*.hash_table =*/ { NULL }, + /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -17531,6 +17536,7 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) { /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, /*.hash_table =*/ { NULL }, + /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, diff --git a/ggml.h b/ggml.h index 20e67f4fd80f5..8eafa67a09640 100644 --- a/ggml.h +++ b/ggml.h @@ -516,6 +516,12 @@ extern "C" { // #define GGML_GRAPH_HASHTABLE_SIZE 16411 #define GGML_GRAPH_HASHTABLE_SIZE 32771 + enum ggml_cgraph_eval_order { + GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + GGML_CGRAPH_EVAL_ORDER_COUNT + }; + // computation graph struct ggml_cgraph { int n_nodes; @@ -527,6 +533,8 @@ extern "C" { void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE]; + enum ggml_cgraph_eval_order order; + // performance int perf_runs; int64_t perf_cycles; From ace90884a6bce071c8bfd05753ba486e7443104c Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 9 Sep 2023 21:00:25 +0200 Subject: [PATCH 176/235] measure max compute size for each cgraph eval order and use best order this can bring huge memory savings: e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8mb down to 4627.6 MB --- examples/finetune/finetune.cpp | 57 +++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 4d3f47fb4ca07..3547fca02ca25 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -2721,7 +2721,7 @@ int main(int argc, char ** argv) { NULL, // mem_buffer true, // no_alloc }; - struct ggml_context * ctx_compute = ggml_init(ctx_compute_params); + struct ggml_context * ctx_compute = NULL; struct ggml_tensor * loss = NULL; struct ggml_tensor * logits = NULL; @@ -2731,32 +2731,47 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gb_tmp = NULL; // measure required memory for compute tensors - alloc = ggml_allocr_new_measure(tensor_alignment); - gf = ggml_new_graph(ctx_compute); - gb = ggml_new_graph(ctx_compute); - gb_tmp = params.use_checkpointing - ? ggml_new_graph(ctx_compute) - : NULL; - loss = llama_build_lora_finetune_graphs( - &model, &lora, alloc, ctx_compute, - gf, gb, gb_tmp, - &logits, tokens_input, target_probs, - n_tokens, n_batch, - params.use_flash, - params.use_checkpointing - ); - size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; - ggml_allocr_free(alloc); + size_t best_compute_size = SIZE_MAX; + enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT; + // find best evaluation order + for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { + ctx_compute = ggml_init(ctx_compute_params); + alloc = ggml_allocr_new_measure(tensor_alignment); + gf = ggml_new_graph(ctx_compute); + gf->order = (enum ggml_cgraph_eval_order) order; + gb = ggml_new_graph(ctx_compute); + gb_tmp = params.use_checkpointing + ? 
ggml_new_graph(ctx_compute) + : NULL; + loss = llama_build_lora_finetune_graphs( + &model, &lora, alloc, ctx_compute, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.use_flash, + params.use_checkpointing + ); + size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; + if (max_compute_size < best_compute_size) { + best_compute_size = max_compute_size; + best_order = gf->order; + } + ggml_allocr_free(alloc); + ggml_free(ctx_compute); + } + size_t max_compute_size = best_compute_size; printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f)); - - // reset compute context - ggml_free(ctx_compute); - ctx_compute = ggml_init(ctx_compute_params); + printf("%s: evaluation order = %s\n", __func__, + (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" : + (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" : + "invalid"); // allocate compute tensors mem_compute_data.resize(max_compute_size); + ctx_compute = ggml_init(ctx_compute_params); alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); gf = ggml_new_graph(ctx_compute); + gf->order = best_order; gb = ggml_new_graph(ctx_compute); gb_tmp = params.use_checkpointing ? ggml_new_graph(ctx_compute) From 1cef45953b404b215f80ec6cceced73da8109abc Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 9 Sep 2023 21:58:36 +0200 Subject: [PATCH 177/235] remove unused command line options --- examples/finetune/finetune.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 3547fca02ca25..ebe89ae4c1bfa 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1952,8 +1952,6 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --use-flash Use flash attention (default)\n"); fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n"); - fprintf(stderr, " --no-alloc Don't use allocator\n"); - fprintf(stderr, " --use-alloc Use allocator (default)\n"); fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. 
Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); From 0e32932931a650c8dacb7f15f591506abf1623d4 Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 13 Sep 2023 15:26:25 +0200 Subject: [PATCH 178/235] add sample start patterns and options to force new or by default resume last shuffling --- common/common.h | 2 + examples/finetune/finetune.cpp | 630 ++++++++++++++++++++++++++------- 2 files changed, 505 insertions(+), 127 deletions(-) diff --git a/common/common.h b/common/common.h index c3bb6dfc018de..fafe850dc68c8 100644 --- a/common/common.h +++ b/common/common.h @@ -121,6 +121,8 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params); std::string gpt_random_prompt(std::mt19937 & rng); +void process_escapes(std::string& input); + // // Model utils // diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index ebe89ae4c1bfa..89d9156f9d508 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1,6 +1,7 @@ #include "ggml.h" #include "ggml-alloc.h" #include "llama.h" +#include "common.h" #include #include #include @@ -12,6 +13,7 @@ #include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -284,9 +286,16 @@ struct my_llama_lora { std::vector layers; - uint32_t train_its = 0; - uint32_t train_samples = 0; - uint32_t train_tokens = 0; + uint64_t train_its = 0; + uint64_t train_samples = 0; + uint64_t train_tokens = 0; + uint64_t train_epochs = 0; + + size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes) + std::string shuffle_rng_state_current; + std::string shuffle_rng_state_next; + size_t shuffle_sample_count; + size_t shuffle_next_sample; }; // gguf constants @@ -331,6 +340,12 @@ const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; +const char * LLM_KV_TRAINING_EPOCH_COUNT = "training.epoch_count"; +const char * LLM_KV_TRAINING_SAMPLES_HASH = "training.samples_hash"; +const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash"; +const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE = "training.shuffle.rng_state"; +const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count"; +const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE = "training.shuffle.next_sample"; const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"; const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"; @@ -1004,25 +1019,20 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( return t36; } -void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_probs) { - int n_tokens = tokens_input->ne[0]; - int n_vocab = target_probs->ne[0]; - - size_t sample = train_samples[example_id % n_train_samples]; - GGML_ASSERT(sample+n_tokens-1 < n_train_data); - - ggml_set_f32(target_probs, 0.0f); - ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx)); - for (int i=1; in_dims == 2); GGML_ASSERT(target_probs->n_dims == 3); int n_vocab = target_probs->ne[0]; @@ -1032,24 +1042,60 @@ void get_example_targets_batch(struct llama_context* lctx, const 
int * train_sam GGML_ASSERT(n_tokens == target_probs->ne[1]); GGML_ASSERT(n_batch == target_probs->ne[2]); + int used_samples = 0; + ggml_set_f32(target_probs, 0.0f); + int bos = llama_token_bos(lctx); + int eos = llama_token_eos(lctx); // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); for (int k=0; k= sample_size && fill_with_next_samples) { + if (!sample_separation_eos) { + // insert eos token to separate samples + sample_separation_eos = true; + } else if (!sample_separation_bos) { + // insert bos token to separate samples + sample_separation_bos = true; + token = bos; + } else { + // sample separation is done, continue with next sample + sample_separation_eos = !separate_with_eos; + sample_separation_bos = !separate_with_bos; + sample_offs = 0; + sample_idx = (example_id + used_samples) % samples_count; + sample_begin = samples_begin[sample_idx]; + sample_size = samples_size[sample_idx]; + ++used_samples; + } + } + // note: no else-if here + if (sample_offs < sample_size) { + token = clamp(train_data[sample_begin+sample_offs], 0, n_vocab-1); + ++sample_offs; + } + ggml_set_f32_nd(target_probs, token, i, k, 0, +1.0f); + if (i+1& out) { +static size_t utf8_len(char src) { + const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; +} + +// mark each byte with its utf8 unit number. +// returns the number of utf8 characters. +// e.g. when bytes == '\x61\xD0\xB0\x62', +// then utf8_units will become [0,0,1,0] +// utf8_nunits will become [1,2,2,1] and 3 is returned. +// bytes where utf8_units is zero, are the begin of an utf8 character. +static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) { + size_t offs = 0; + size_t count_utf8 = 0; + while(offs < count) { + size_t len = utf8_len(bytes[offs]); + for (size_t i=0; i & out_tokens, + std::vector & out_samples_begin, + std::vector & out_samples_size) { struct llama_file f(filename, "rb"); + if (f.size == 0) { + out_tokens.clear(); + out_samples_begin.clear(); + out_samples_size.clear(); + printf("%s: warning: empty or not existing training data file '%s'\n", + __func__, filename); + return out_tokens.size(); + } + std::vector buf; buf.resize(f.size+1); f.read_raw(buf.data(), f.size); buf[f.size] = '\0'; - out.resize(buf.size()); + std::vector utf8_units; + std::vector utf8_nunits; + utf8_units.resize(buf.size()); + utf8_nunits.resize(buf.size()); + size_t n_utf8_chars = mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size()); - int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); - if (n_tokens >= 0) { - out.resize(n_tokens); - } + if (sample_start.size() == 0) { + // tokenize all data at once + out_tokens.resize(buf.size()); - bool verify = false; - if (verify) { - const char * in = buf.data(); - const char * end = buf.data() + buf.size(); - for (int i = 0; i < (int) out.size(); ++i) { - const char * s = llama_token_get_text(lctx, out[i]); - int len = strlen(s); - if (in >= end) { - printf("%s: unexpected end of original text.\n", __func__); - break; + int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), buf.size(), false); + if (n_tokens < 0) { + out_tokens.resize(-n_tokens); + n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), buf.size(), false); + } + if (n_tokens >= 0) { + out_tokens.resize(n_tokens); + } + + // generate sample starts at all token positions + 
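        // (illustration, not from the patch: with out_tokens.size() == 8 and
        //  context_length == 4 the code below yields sample begins 0,1,2,3, each of
        //  size 4 -- one overlapping training sample per usable token position.)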
out_samples_begin.clear(); + out_samples_begin.push_back(0); + out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size())); + size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0; + for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) { + out_samples_begin.push_back(sample_begin); + out_samples_size.push_back(context_length); + } + } else { + // split data into samples and tokenize each sample + std::string data_str(buf.data(), buf.size()-1); + out_samples_begin.clear(); + out_samples_size.clear(); + out_tokens.clear(); + + // find all positions of pattern sample_start + size_t sample_begin = data_str.find(sample_start, 0); + while (sample_begin != std::string::npos) { + out_samples_begin.push_back(sample_begin); + const size_t search_start = sample_begin + sample_start.size(); + sample_begin = data_str.find(sample_start, search_start); + } + if (out_samples_begin.size() == 0) { + printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n", + __func__, sample_start.c_str()); + out_samples_begin.push_back(0); + } + + out_samples_size.resize(out_samples_begin.size(), 0); + + std::vector buf_sample; + std::vector tok_sample; + + const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size()); + size_t found_too_big_sample = 0; + size_t found_too_small_sample = 0; + size_t found_empty_sample = 0; + size_t found_min_sample_size = SIZE_MAX; + size_t found_max_sample_size = 0; + + size_t max_token_text_size = 0; + int n_vocab = llama_n_vocab(lctx); + for (llama_token token=0; token < n_vocab; ++token) { + max_token_text_size = std::max( + max_token_text_size, + strlen(llama_token_get_text(lctx, token))); + } + + // upper bound of context byte length. + // strings with this byte length should always tokenize to at least context_length tokens. + size_t context_byte_len = max_token_text_size*context_length; + + for (unsigned i=0; i 0) { + // sample end is in the middle of an utf8 character. + // advance sample_end to the begin of the next utf8 character. + sample_end += utf8_nunits[sample_end] - utf8_units[sample_end]; + } + size_t sample_size = sample_end - sample_begin; + if (sample_size == 0) { + ++found_empty_sample; } - const bool matches = (strncmp(in, s, len) == 0); - if (matches) { - in += len; + + if (sample_size > 0) { + // llama_tokenize expects zero terminated string, + // copy sample into buffer and zero terminate it. 
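                    // (not from the patch: as the retry below implies, llama_tokenize
                    //  returns the negative of the required token count when the output
                    //  buffer is too small; the buffer is resized to -n_tokens and the
                    //  call repeated, mirroring the sample_start.size() == 0 branch above.)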
+ buf_sample.resize(sample_size+1); + memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size); + buf_sample[sample_size] = '\0'; + + // printf("sample: '%s'\n", buf_sample.data()); + + // tokenize the sample + tok_sample.resize(sample_size); + int n_tokens = llama_tokenize(lctx, + buf_sample.data(), + tok_sample.data(), + sample_size, false); + if (n_tokens < 0) { + tok_sample.resize(-n_tokens); + n_tokens = llama_tokenize(lctx, + buf_sample.data(), + tok_sample.data(), + sample_size, false); + GGML_ASSERT(n_tokens >= 0); + } + GGML_ASSERT(n_tokens <= tok_sample.size()); + + if ((size_t) n_tokens > context_length) { + ++found_too_big_sample; + } else if ((size_t) n_tokens < context_length) { + ++found_too_small_sample; + } + found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens); + found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens); + + // write out tokens, start and size of sample + // overwrite the string start position with the token start position + out_samples_begin[i] = out_tokens.size(); + out_samples_size[i] = (size_t) n_tokens; + out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens); } else { - printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s); + out_samples_begin[i] = out_tokens.size(); + out_samples_size[i] = 0; } + + } + if (found_too_big_sample > 0) { + printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. samples will be cut off.\n", + __func__, found_too_big_sample, found_max_sample_size, context_length); + } + + if (found_too_small_sample > 0) { + printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n", + __func__, found_too_small_sample, found_min_sample_size, context_length); + } + + if (found_empty_sample) { + printf("%s: warning: found %zu empty samples.\n", + __func__, found_empty_sample); } } + printf("%s: total number of samples: %zu\n", + __func__, out_samples_begin.size()); + + GGML_ASSERT(out_samples_begin.size() == out_samples_size.size()); + + return out_tokens.size(); +} + +void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) { + std::stringstream s_rng_state; + s_rng_state.imbue(std::locale::classic()); + s_rng_state.exceptions(std::stringstream::eofbit | std::stringstream::failbit | std::stringstream::badbit); + s_rng_state.str(rng_state); + s_rng_state >> rng; +} - return n_tokens; +std::string mt19937_get_state(const std::mt19937& rng) { + std::stringstream s_rng_state; + s_rng_state.imbue(std::locale::classic()); + s_rng_state.exceptions(std::stringstream::badbit); + s_rng_state << rng; + return s_rng_state.str(); } -void shuffle_ints(int * begin, int * end) { - if (end <= begin) return; - int max=begin[0]; - for (int i=1; i max) { - max = begin[i]; +std::string mt19937_seed_to_state(unsigned seed) { + std::mt19937 rng(seed); + return mt19937_get_state(rng); +} + +std::string shuffle_samples(const std::string& rng_state, size_t * begins, size_t * sizes, size_t count) { + if (count == 0) return rng_state; + + std::mt19937 rng; + mt19937_set_state(rng, rng_state); + + // sort indices by random value for each index + std::vector idcs; + { + std::vector rnd; + idcs.resize(count); + rnd.resize(count); + for (unsigned i=0; i vals; - vals.resize(max+1); - for (int i=0; i reordered; + reordered.resize(count); + + for (unsigned i=0; itrain_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); - 
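mt19937_set_state and mt19937_get_state above round-trip the generator through a stringstream; the standard requires std::mt19937 to support operator<< and operator>>, which write and read the complete engine state as text. A minimal self-contained sketch of that round trip:

    #include <cassert>
    #include <locale>
    #include <random>
    #include <sstream>
    #include <string>

    int main() {
        std::mt19937 rng(42);

        // capture the full engine state as text (as mt19937_get_state does)
        std::stringstream out;
        out.imbue(std::locale::classic());
        out << rng;
        const std::string state = out.str();

        const auto expected = rng(); // advance the original engine by one draw

        // restore a second engine from the saved state (as mt19937_set_state does)
        std::mt19937 restored;
        std::stringstream in(state);
        in.imbue(std::locale::classic());
        in >> restored;

        assert(restored() == expected); // identical state -> identical next draw
        return 0;
    }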
GGUF_GET_KEY(fctx, lora->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); - GGUF_GET_KEY(fctx, lora->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); + if (file_version == 0) { + + GGUF_GET_KEY(fctx, lora->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, lora->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, lora->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); + + } else if (file_version == 1) { + + GGUF_GET_KEY(fctx, lora->train_its, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, lora->train_samples, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, lora->train_tokens, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_TOKEN_COUNT); + GGUF_GET_KEY(fctx, lora->train_epochs, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_EPOCH_COUNT); + + GGUF_GET_KEY(fctx, lora->shuffle_samples_hash, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH); + GGUF_GET_KEY(fctx, lora->shuffle_rng_state_current, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_SHUFFLE_RNG_STATE); + GGUF_GET_KEY(fctx, lora->shuffle_sample_count, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, lora->shuffle_next_sample, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE); + } load_opt_context_gguf(fctx, f_ggml_ctx, opt); } @@ -1588,11 +1866,17 @@ void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { save_llama_lora_gguf(fctx, model, lora); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 1); gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_ITERATION_COUNT, lora->train_its); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, lora->train_samples); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_TOKEN_COUNT, lora->train_tokens); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, lora->train_its); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, lora->train_samples); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT, lora->train_tokens); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_EPOCH_COUNT, lora->train_epochs); + + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH, (uint64_t) lora->shuffle_samples_hash); + gguf_set_val_str(fctx, LLM_KV_TRAINING_SHUFFLE_RNG_STATE, lora->shuffle_rng_state_current.c_str()); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT, (uint64_t) lora->shuffle_sample_count); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE, (uint64_t) lora->shuffle_next_sample); save_opt_context_gguf(fctx, opt); } @@ -1792,11 +2076,20 @@ struct train_params { bool custom_n_rank_norm; bool custom_n_rank_output; - bool samples_start_after_nl; bool use_adam; bool use_flash; bool use_checkpointing; + std::string sample_start; + bool include_sample_start; + bool escape; + bool overlapping_samples; + bool fill_with_next_samples; + bool separate_with_eos; + bool separate_with_bos; + + bool force_reshuffle; + // only adam 
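The value read and written under LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH above is a fingerprint of the sample layout, computed by chaining std::hash results over the file name and every sample begin/size (hash_combine and compute_samples_hash appear further down in this patch). A small sketch of that chaining, with a made-up file name and offsets:

    #include <cstdio>
    #include <functional>
    #include <string>
    #include <vector>

    // same combiner as hash_combine in this patch: xor with a shifted hash;
    // cheap, and good enough to detect changed training data
    static size_t hash_combine(size_t h1, size_t h2) {
        return h1 ^ (h2 << 1);
    }

    int main() {
        const std::string fn = "train.txt";                  // made-up file name
        const std::vector<size_t> begins = { 0, 128, 500 };  // made-up offsets
        const std::vector<size_t> sizes  = { 128, 372, 90 };

        std::hash<std::string> h_string;
        std::hash<unsigned long long> h_ull;

        size_t h = h_string(fn);
        h = hash_combine(h, h_ull((unsigned long long) begins.size()));
        for (size_t i = 0; i < begins.size(); ++i) {
            h = hash_combine(h, h_ull((unsigned long long) begins[i]));
            h = hash_combine(h, h_ull((unsigned long long) sizes[i]));
        }
        // a mismatch with the checkpoint's stored hash means the training
        // data changed, so the shuffled epoch is restarted
        printf("samples hash: %zu\n", h);
        return 0;
    }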
int warmup; int cos_decay_steps; @@ -1880,11 +2173,19 @@ struct train_params get_default_train_params() { params.custom_n_rank_norm = false; params.custom_n_rank_output = false; - params.samples_start_after_nl = false; params.use_adam = true; params.use_flash = true; params.use_checkpointing = true; + params.sample_start = ""; + params.include_sample_start = false; + params.escape = false; + params.overlapping_samples = false; + params.fill_with_next_samples = false; + params.separate_with_eos = false; + params.separate_with_bos = true; + params.force_reshuffle = false; + params.opt_past = 0; params.opt_delta = 1e-5f; params.opt_max_no_improvement = 0; @@ -1945,8 +2246,16 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n"); - fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off"); - fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); + fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty, use every token position as sample start. (default '%s')\n", params->sample_start.c_str()); + fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n"); + fprintf(stderr, " --escape process sample start escape sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); + fprintf(stderr, " --overlapping-samples Samples may overlap, will include sample-start of second and following samples. When off, samples will end at the beginning of the next sample. (default off)\n"); + fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n"); + fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : ""); + fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : ""); + fprintf(stderr, " --no-separate-with-eos When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : ""); + fprintf(stderr, " --no-separate-with-bos When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ?
" (default)" : ""); + fprintf(stderr, " --force-reshuffle Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n"); fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); fprintf(stderr, " --no-flash Don't use flash attention \n"); fprintf(stderr, " --use-flash Use flash attention (default)\n"); @@ -2184,8 +2493,30 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { } params->n_rank_w3 = std::stoi(argv[i]); params->custom_n_rank_w3 = true; - } else if (arg == "--samples-after-nl") { - params->samples_start_after_nl = true; + } else if (arg == "--sample-start") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->sample_start = std::string(argv[i]); + } else if (arg == "--escape") { + params->escape = true; + } else if (arg == "--include-sample-start") { + params->include_sample_start = true; + } else if (arg == "--overlapping-samples") { + params->overlapping_samples = true; + } else if (arg == "--fill-with-next-samples") { + params->fill_with_next_samples = true; + } else if (arg == "--separate-with-eos") { + params->separate_with_eos = true; + } else if (arg == "--separate-with-bos") { + params->separate_with_bos = true; + } else if (arg == "--no-separate-with-eos") { + params->separate_with_eos = false; + } else if (arg == "--no-separate-with-bos") { + params->separate_with_bos = false; + } else if (arg == "--force-reshuffle") { + params->force_reshuffle = true; } else if (arg == "--use-lbfgs") { params->use_adam = false; } else if (arg == "--use-adam") { @@ -2318,7 +2649,9 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { train_print_usage(argc, argv, &default_params); exit(1); } - + if (params->escape) { + process_escapes(params->sample_start); + } return true; } @@ -2331,9 +2664,9 @@ struct opt_callback_data { int last_save_iter; llama_token * tokens_data; size_t tokens_size; - int * samples_data; - size_t samples_size; - int shuffle_countdown; + size_t * samples_begin; + size_t * samples_size; + size_t samples_count; struct ggml_tensor * tokens_input; struct ggml_tensor * target_probs; int first_iter; @@ -2428,8 +2761,9 @@ void opt_callback(void * vdata, int accum_step, float * sched) { int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); if (impr_plot > 0) impr_plot = 0; if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0; - printf("%s: iter=%*d sched=%f loss=%f", - __func__, 6, opt->iter, *sched, opt->loss_after); + printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f", + __func__, opt->iter, std::min(1+data->lora->shuffle_next_sample, data->lora->shuffle_sample_count), data->lora->shuffle_sample_count, + *sched, opt->loss_after); if (data->millis_per_iter > 0) { @@ -2451,26 +2785,33 @@ void opt_callback(void * vdata, int accum_step, float * sched) { printf("\n"); } - if (data->shuffle_countdown < n_batch) { - printf("%s: reshuffle samples\n", __func__); - shuffle_ints(data->samples_data, data->samples_data + data->samples_size); - for (int i = 0; i < (int) data->samples_size; ++i) { - GGML_ASSERT(data->samples_data[i]+params->n_ctx-1 < (int) data->tokens_size); - } - data->shuffle_countdown = data->samples_size; - } - - get_example_targets_batch( + int used_samples = get_example_targets_batch( data->lctx, - data->samples_data, + data->samples_begin, data->samples_size, + data->samples_count, data->tokens_data, data->tokens_size, - opt->iter*params->n_gradient_accumulation + accum_step, + 
data->lora->shuffle_next_sample, data->tokens_input, - data->target_probs); - - data->shuffle_countdown -= n_batch; + data->target_probs, + params->separate_with_eos, + params->separate_with_bos, + params->fill_with_next_samples); + + data->lora->shuffle_next_sample += used_samples; + + if (data->lora->shuffle_next_sample >= data->lora->shuffle_sample_count) { + ++data->lora->train_epochs; + printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) data->lora->train_epochs); + // note: we may have used some samples from the current shuffling more than once + data->lora->shuffle_rng_state_next = shuffle_samples( + data->lora->shuffle_rng_state_next, + data->samples_begin, + data->samples_size, + data->samples_count); + data->lora->shuffle_next_sample = 0; + } } int64_t get_parameter_count(struct my_llama_lora* lora) { @@ -2506,6 +2847,22 @@ int64_t get_parameter_count(struct my_llama_lora* lora) { return nx; } +size_t hash_combine(size_t h1, size_t h2) { + return h1 ^ (h2 << 1); +} + +size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) { + std::hash h_string; + std::hash h_ull; + size_t h = h_string(std::string(fn)); + h = hash_combine(h, h_ull((unsigned long long) sample_count)); + for (size_t i=0; i< sample_count; ++i) { + h = hash_combine(h, h_ull((unsigned long long) samples_begin[i])); + h = hash_combine(h, h_ull((unsigned long long) samples_size[i])); + } + return h; +} + int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); @@ -2654,6 +3011,10 @@ int main(int argc, char ** argv) { print_params(&model.hparams); print_lora_params(&lora.hparams); + printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) lora.train_its); + printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) lora.train_samples); + printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) lora.train_tokens); + printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) lora.train_epochs); printf("%s: max_lora_size = %zu bytes (%.1f MB)\n", __func__, lora.data.size(), (float) lora.data.size() / (1024.0f*1024.0f)); printf("%s: max_opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); opt->iter = lora.train_its; @@ -2786,11 +3147,21 @@ int main(int argc, char ** argv) { // tokenize data std::vector train_tokens; + std::vector train_samples_begin; + std::vector train_samples_size; printf("%s: tokenize training data\n", __func__); - if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { - fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); - } - printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); + tokenize_file(lctx, + params.fn_train_data, + params.sample_start, + params.include_sample_start, + params.overlapping_samples, + n_tokens, + train_tokens, + train_samples_begin, + train_samples_size); + GGML_ASSERT(train_samples_begin.size() == train_samples_size.size()); + + printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size()); std::vector token_noccurs; token_noccurs.resize(model.hparams.n_vocab, 0); @@ -2804,20 +3175,25 @@ int main(int argc, char ** argv) { } printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); - // generate token positions of training samples - std::vector train_samples; - GGML_ASSERT(n_tokens < (int) 
train_tokens.size()); - train_samples.push_back(0); - for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) { - const bool is_valid_sample_start = !params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx)); - if (is_valid_sample_start) { - train_samples.push_back(i); - } + size_t shuffle_samples_hash = compute_samples_hash(params.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size()); + const bool changed_train_data = (shuffle_samples_hash != lora.shuffle_samples_hash) || (lora.shuffle_sample_count != train_samples_size.size()); + if (changed_train_data) { + printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__); + } + if (params.force_reshuffle) { + printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__); } - shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); - for (int i = 0; i < (int) train_samples.size(); ++i) { - GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); + if ((lora.shuffle_rng_state_current == "") || changed_train_data || params.force_reshuffle) { + lora.shuffle_rng_state_current = mt19937_seed_to_state(params.seed); + lora.shuffle_sample_count = train_samples_size.size(); + lora.shuffle_next_sample = 0; + lora.shuffle_samples_hash = shuffle_samples_hash; } + lora.shuffle_rng_state_next = shuffle_samples( + lora.shuffle_rng_state_current, + train_samples_begin.data(), + train_samples_size.data(), + train_samples_size.size()); printf("%s: begin training\n", __func__); @@ -2827,17 +3203,17 @@ int main(int argc, char ** argv) { opt_cb_data.model = &model; opt_cb_data.lora = &lora; opt_cb_data.lctx = lctx; - opt_cb_data.last_save_iter = opt->iter; - opt_cb_data.tokens_data = train_tokens.data(); - opt_cb_data.tokens_size = train_tokens.size(); - opt_cb_data.samples_data = train_samples.data(); - opt_cb_data.samples_size = train_samples.size(); - opt_cb_data.shuffle_countdown = train_samples.size(); - opt_cb_data.tokens_input = tokens_input; - opt_cb_data.target_probs = target_probs; - opt_cb_data.first_iter = opt->iter; - opt_cb_data.last_time = ggml_time_ms(); - opt_cb_data.millis_per_iter = 0.0; + opt_cb_data.last_save_iter = opt->iter; + opt_cb_data.tokens_data = train_tokens.data(); + opt_cb_data.tokens_size = train_tokens.size(); + opt_cb_data.samples_begin = train_samples_begin.data(); + opt_cb_data.samples_size = train_samples_size.data(); + opt_cb_data.samples_count = train_samples_size.size(); + opt_cb_data.tokens_input = tokens_input; + opt_cb_data.target_probs = target_probs; + opt_cb_data.first_iter = opt->iter; + opt_cb_data.last_time = ggml_time_ms(); + opt_cb_data.millis_per_iter = 0.0; // measure required memory for work buffer size_t max_work_size = ggml_graph_plan(gb, params.n_threads).work_size + GGML_OBJECT_SIZE; From 7898652dfb4ecd7a3676788da5dab06b557c1ffc Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 13 Sep 2023 16:20:50 +0200 Subject: [PATCH 179/235] update shuffle rng state on reshuffle --- examples/finetune/finetune.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 89d9156f9d508..9c1fd49439ae4 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -2805,8 +2805,9 @@ void opt_callback(void * vdata, int accum_step, float * sched) { ++data->lora->train_epochs; printf("%s: reshuffle samples. 
completed epochs: %llu\n", __func__, (long long unsigned) data->lora->train_epochs); // note: we may have used some samples from the current shuffling more than once + data->lora->shuffle_rng_state_current = data->lora->shuffle_rng_state_next; data->lora->shuffle_rng_state_next = shuffle_samples( - data->lora->shuffle_rng_state_next, + data->lora->shuffle_rng_state_current, data->samples_begin, data->samples_size, data->samples_count); From ec57689f6466bdd68480b274e118c95d6164366d Mon Sep 17 00:00:00 2001 From: xaedes Date: Wed, 13 Sep 2023 18:34:06 +0200 Subject: [PATCH 180/235] exclude known zero values from computations in flash_attn_f32 & flash_attn_back_f32 --- ggml.c | 85 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 41 insertions(+), 44 deletions(-) diff --git a/ggml.c b/ggml.c index b7dcb2c516d78..6296809a449c4 100644 --- a/ggml.c +++ b/ggml.c @@ -14475,20 +14475,17 @@ static void ggml_compute_forward_flash_attn_f32( // scale ggml_vec_scale_f32(nek1, S, scale); - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; - } - } + const int64_t masked_begin = masked ? (P + iq1 + 1) : M; + for (int64_t i = masked_begin; i < M; i++) { + S[i] = -INFINITY; } // softmax - // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero. + // exclude known -INF S[..] values from max and loop // dont forget to set their SW values to zero { float max = -INFINITY; - ggml_vec_max_f32(M, &max, S); + ggml_vec_max_f32(masked_begin, &max, S); ggml_float sum = 0.0; { @@ -14502,10 +14499,15 @@ static void ggml_compute_forward_flash_attn_f32( ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + if (i >= masked_begin) { + break; + } float * SS = S + i; for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { - if (SS[j] == -INFINITY) { + if (i + j >= masked_begin) { + break; + } else if (SS[j] == -INFINITY) { SS[j] = 0.0f; } else { #ifndef GGML_FLASH_ATTN_EXP_FP16 @@ -14530,10 +14532,10 @@ static void ggml_compute_forward_flash_attn_f32( assert(sum > 0.0); sum = 1.0/sum; - ggml_vec_scale_f32(M, S, sum); + ggml_vec_scale_f32(masked_begin, S, sum); #ifndef NDEBUG - for (int i = 0; i < M; ++i) { + for (int i = 0; i < masked_begin; ++i) { assert(!isnan(S[i])); assert(!isinf(S[i])); } @@ -14550,7 +14552,7 @@ static void ggml_compute_forward_flash_attn_f32( const int iv2 = iq2 % nev2; const int iv3 = iq3; - ggml_vec_dot_f32(nev0, + ggml_vec_dot_f32(masked_begin, (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), S); @@ -15126,20 +15128,17 @@ static void ggml_compute_forward_flash_attn_back_f32( // scale ggml_vec_scale_f32(nek1, S, scale); - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; - } - } + const int64_t masked_begin = masked ? (P + iq1 + 1) : M; + for (int64_t i = masked_begin; i < M; i++) { + S[i] = -INFINITY; } // softmax - // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero. + // exclude known -INF S[..] 
values from max and loop // dont forget to set their SM values to zero { float max = -INFINITY; - ggml_vec_max_f32(M, &max, S); + ggml_vec_max_f32(masked_begin, &max, S); ggml_float sum = 0.0; { @@ -15153,11 +15152,16 @@ static void ggml_compute_forward_flash_attn_back_f32( ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + if (i >= masked_begin) { + break; + } float * SR = S + i; float * SW = SM + i; for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { - if (SR[j] == -INFINITY) { + if (i + j >= masked_begin) { + break; + } else if (SR[j] == -INFINITY) { SW[j] = 0.0f; } else { #ifndef GGML_FLASH_ATTN_EXP_FP16 @@ -15182,7 +15186,7 @@ static void ggml_compute_forward_flash_attn_back_f32( assert(sum > 0.0); sum = 1.0/sum; - ggml_vec_scale_f32(M, SM, sum); + ggml_vec_scale_f32(masked_begin, SM, sum); } @@ -15253,9 +15257,10 @@ static void ggml_compute_forward_flash_attn_back_f32( // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3] // for ic: // S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3] - ggml_vec_set_f32(M, S, 0); + // exclude known future zero S[..] values from operation + ggml_vec_set_f32(masked_begin, S, 0); for (int64_t ic = 0; ic < D; ++ic) { - ggml_vec_mad_f32(M, + ggml_vec_mad_f32(masked_begin, S, (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); @@ -15263,23 +15268,15 @@ static void ggml_compute_forward_flash_attn_back_f32( // S = SM * (S - dot(SM, S)) float dot_SM_gradSM = 0; - ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); + ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S); ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); - ggml_vec_mul_f32 (M, S, S, SM); + ggml_vec_mul_f32 (masked_begin, S, S, SM); // S = diag_mask_zero(S, P) * scale - if (masked) { - // for (int64_t i = P + iq1 + 1; i < M; i++) { - // S[i] = 0; - // } - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = 0; - } - } - } - // todo: exclude known zero S[..] values from operation - ggml_vec_scale_f32(M, S, scale); + // already done by above ggml_vec_set_f32 + + // exclude known zero S[..] values from operation + ggml_vec_scale_f32(masked_begin, S, scale); // S shape [M,1] // SM shape [M,1] @@ -15291,8 +15288,8 @@ static void ggml_compute_forward_flash_attn_back_f32( // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] // for ic: // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3] - // todo: exclude known zero S[..] values from loop - for (int64_t ic = 0; ic < M; ++ic) { + // exclude known zero S[..] values from loop + for (int64_t ic = 0; ic < masked_begin; ++ic) { ggml_vec_mad_f32(D, (float *) ((char *) grad_q + (iq1*nbgq1 + iq2*nbgq2 + iq3*nbgq3)), (float *) ((char *) k->data + (ic*nbk1 + ik2*nbk2 + ik3*nbk3)), @@ -15303,8 +15300,8 @@ static void ggml_compute_forward_flash_attn_back_f32( // for ic: // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] - // todo: exclude known zero S[..] values from loop - for (int64_t ic = 0; ic < M; ++ic) { + // exclude known zero S[..] 
values from loop + for (int64_t ic = 0; ic < masked_begin; ++ic) { ggml_vec_mad_f32(D, (float *) ((char *) grad_k + (ic*nbgk1 + ik2*nbgk2 + ik3*nbgk3)), (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), @@ -15315,9 +15312,9 @@ static void ggml_compute_forward_flash_attn_back_f32( // for ic: // grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M] // grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3] * SM[:M] - // todo: exclude known zero SM[..] values from mad + // exclude known zero SM[..] values from mad for (int64_t ic = 0; ic < D; ++ic) { - ggml_vec_mad_f32(M, + ggml_vec_mad_f32(masked_begin, (float *) ((char *) grad_v + ( ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)), SM, *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3))); From 7f378a7561d82909122443330c44f01ec365db08 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 14 Sep 2023 00:21:05 +0200 Subject: [PATCH 181/235] remove probably unnecessary exception type flags from stringstream --- examples/finetune/finetune.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 9c1fd49439ae4..cf8094dcb69ea 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1418,7 +1418,7 @@ size_t tokenize_file( void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) { std::stringstream s_rng_state; s_rng_state.imbue(std::locale::classic()); - s_rng_state.exceptions(std::stringstream::eofbit | std::stringstream::failbit | std::stringstream::badbit); + s_rng_state.exceptions(std::stringstream::failbit); s_rng_state.str(rng_state); s_rng_state >> rng; } @@ -1426,7 +1426,6 @@ void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) { std::string mt19937_get_state(const std::mt19937& rng) { std::stringstream s_rng_state; s_rng_state.imbue(std::locale::classic()); - s_rng_state.exceptions(std::stringstream::badbit); s_rng_state << rng; return s_rng_state.str(); } From f627e2fe9cf7a79f82af5b1960c27bc0056bcdfc Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 14 Sep 2023 03:04:04 +0200 Subject: [PATCH 182/235] pass correct max number of tokens to llama_tokenize --- examples/finetune/finetune.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index cf8094dcb69ea..8563f80cd52af 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1266,10 +1266,10 @@ size_t tokenize_file( // tokenize all data at once out_tokens.resize(buf.size()); - int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), buf.size(), false); + int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false); if (n_tokens < 0) { out_tokens.resize(-n_tokens); - n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), buf.size(), false); + n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false); } if (n_tokens >= 0) { out_tokens.resize(n_tokens); @@ -1362,13 +1362,13 @@ size_t tokenize_file( int n_tokens = llama_tokenize(lctx, buf_sample.data(), tok_sample.data(), - sample_size, false); + tok_sample.size(), false); if (n_tokens < 0) { tok_sample.resize(-n_tokens); n_tokens = llama_tokenize(lctx, buf_sample.data(), tok_sample.data(), - sample_size, false); + tok_sample.size(), false); GGML_ASSERT(n_tokens >= 0); } GGML_ASSERT(n_tokens <= tok_sample.size()); From 2c59f7bea33c8c057836087911c3591c8855ec50 Mon Sep 17 00:00:00 2001 From: 
xaedes Date: Thu, 14 Sep 2023 10:48:38 +0200 Subject: [PATCH 183/235] account for possible leading whitespace that will be added by tokenizer e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12] --- examples/finetune/finetune.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 8563f80cd52af..f476813abcff6 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1250,6 +1250,10 @@ size_t tokenize_file( return out_tokens.size(); } + // account for possible leading whitespace that will be added by tokenizer + // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12] + const int n_max_tokens_overhead = 1; + std::vector<char> buf; buf.resize(f.size+1); @@ -1264,7 +1268,7 @@ size_t tokenize_file( if (sample_start.size() == 0) { // tokenize all data at once - out_tokens.resize(buf.size()); + out_tokens.resize(buf.size() + n_max_tokens_overhead); int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false); if (n_tokens < 0) { @@ -1358,7 +1362,7 @@ size_t tokenize_file( // printf("sample: '%s'\n", buf_sample.data()); // tokenize the sample - tok_sample.resize(sample_size); + tok_sample.resize(buf_sample.size() + n_max_tokens_overhead); int n_tokens = llama_tokenize(lctx, buf_sample.data(), tok_sample.data(), From 20cf1a4589a52af3f4485895ef561f75b2fb3a91 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 14 Sep 2023 14:27:34 +0200 Subject: [PATCH 184/235] use unrolled vec_mad in out_prod y is vec_mad result vec. x is vec_mad input vec. v is vec_mad input scalar. ggml_vec_mad_f32_unroll will internally loop over x and v with the same y. GGML_VEC_MAD_UNROLL is by default defined to 32. This value is empirically optimized using performance test runs of out-prod in openllama-3b finetune with 256 context length and batch size 1. It gives a 23% performance boost for out_prod.
Full measurements of out-prod runtime in ms:

      N   unroll_xv   unroll_yv
      1   67014.643   87826.469
      2   77117.552   89077.656
      4   72091.311  109121.657
      8   61077.543   88678.334
     16   56914.67    79514.947
     24   59024.595   84350.254
     28   55952.446   83368.73
     32   51476.658   85177.745
     36   55973.792   84659.92
     40   55139.616   93844.738
     48   60736.392   93330.267
     64   99856.878  116994.99

The unroll_yv column is when unrolling yv instead of xv.
--- ggml.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 82 insertions(+), 10 deletions(-) diff --git a/ggml.c b/ggml.c index 6296809a449c4..f88aba042e8d7 100644 --- a/ggml.c +++ b/ggml.c @@ -134,6 +134,7 @@ typedef void * thread_ret_t; #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 +#define GGML_VEC_MAD_UNROLL 32 // // logging @@ -3707,6 +3708,58 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float #endif } +// xs and vs are byte strides of x and v +inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { + + const float * restrict x[GGML_VEC_MAD_UNROLL]; + const float * restrict v[GGML_VEC_MAD_UNROLL]; + + for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { + x[i] = (const float *) ((const char *) xv + i*xs); + v[i] = (const float *) ((const char *) vv + i*vs); + } + +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL]; + + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + vx[k] = GGML_F32_VEC_SET1(v[k][0]); + } + + GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]); + } + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + for (int i = np; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } +#else + // scalar + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + for (int i = 0; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } +#endif +} + //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { #if defined(GGML_USE_ACCELERATE) @@ -11745,6 +11798,13 @@ static void ggml_compute_forward_out_prod_f32( return; } + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i1: + // for i01: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + // parallelize by last three dimensions // total rows in dst @@ -11757,13 +11817,6 @@ static void ggml_compute_forward_out_prod_f32( const int64_t ir0 = dr*ith; const int64_t ir1 = MIN(ir0 + dr, nr); - // dst[:,:,:,:] = 0 - // for i2,i3: - // for i1: - // for i01: - // for i0: - // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] - for (int64_t ir = ir0; ir < ir1; ++ir) { // dst indices const int64_t i3 = ir/(ne2*ne1); const int64_t i2 = (ir - i3*ne2*ne1)/ne1; const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); const int64_t i02 = i2; const int64_t i03 = i3; //const int64_t i10 = i1; const int64_t i12 = i2; const int64_t i13 = i3; +#if GGML_VEC_MAD_UNROLL > 2 + const int64_t ne01_unroll = ne01 - (ne01 % GGML_VEC_MAD_UNROLL); + for (int64_t i01 = 0; i01 < ne01_unroll; i01 += GGML_VEC_MAD_UNROLL) { + const int64_t i11 = i01; + + float * s0 = 
(float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1); + } + for (int64_t i01 = ne01_unroll; i01 < ne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + ggml_vec_mad_f32(ne0, d, s0, *s1); + } +#else for (int64_t i01 = 0; i01 < ne01; ++i01) { const int64_t i11 = i01; @@ -11785,12 +11858,11 @@ static void ggml_compute_forward_out_prod_f32( float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); ggml_vec_mad_f32(ne0, d, s0, *s1); - // for (int64_t i0 = 0; i0 < ne0; ++i0) { - // d[i0] += s0[i0] * s1[i1]; - // } } +#endif } + //int64_t t1 = ggml_perf_time_us(); //static int64_t acc = 0; //acc += t1 - t0; From 3a9c1d7f5a5a237d728479450ed25e9182ddb0d6 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 14 Sep 2023 17:58:31 +0200 Subject: [PATCH 185/235] set lora_alpha to value of lora_r if it is not set via command line otherwise only changing lora_r will change scaling of lora adapter used in prediction --- examples/finetune/finetune.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index f476813abcff6..260b89ce38b5a 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -2052,6 +2052,7 @@ struct train_params { int32_t lora_r; int32_t lora_alpha; + bool custom_lora_alpha; uint32_t n_rank_attention_norm; uint32_t n_rank_wq; @@ -2147,8 +2148,9 @@ struct train_params get_default_train_params() { params.custom_rope_freq_base = false; params.custom_rope_freq_scale = false; - params.lora_alpha = 4; params.lora_r = 4; + params.lora_alpha = 4; + params.custom_lora_alpha = false; params.n_rank_attention_norm = 1; params.n_rank_wq = 4; @@ -2406,6 +2408,7 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->lora_alpha = std::stoi(argv[i]); + params->custom_lora_alpha = true; } else if (arg == "--lora-r") { if (++i >= argc) { invalid_param = true; @@ -2909,7 +2912,7 @@ int main(int argc, char ** argv) { lora.hparams.rope_freq_scale = params.rope_freq_scale; } lora.hparams.lora_r = params.lora_r; - lora.hparams.lora_alpha = params.lora_alpha; + lora.hparams.lora_alpha = params.custom_lora_alpha ? params.lora_alpha : params.lora_r; int n_rank_attention_norm = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1; int n_rank_wq = params.custom_n_rank_wq ? params.n_rank_wq : params.lora_r; int n_rank_wk = params.custom_n_rank_wk ? 
params.n_rank_wk : params.lora_r; From 0971fee710fbd881ca2688b7cbdc9feac723ace1 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 14 Sep 2023 18:21:23 +0200 Subject: [PATCH 186/235] reshuffle original sample order instead of the previous shuffled order otherwise resumed reshuffle will not result in same sample order --- examples/finetune/finetune.cpp | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 260b89ce38b5a..37be889d36952 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1439,7 +1439,13 @@ std::string mt19937_seed_to_state(unsigned seed) { return mt19937_get_state(rng); } -std::string shuffle_samples(const std::string& rng_state, size_t * begins, size_t * sizes, size_t count) { +std::string shuffle_samples( + const std::string & rng_state, + const size_t * begins, + const size_t * sizes, + size_t * shuffled_begins, + size_t * shuffled_sizes, + size_t count) { if (count == 0) return rng_state; std::mt19937 rng; @@ -1463,18 +1469,13 @@ std::string shuffle_samples(const std::string& rng_state, size_t * begins, size_ } // reorder begins and sizes by sorted indices - std::vector reordered; - reordered.resize(count); - for (unsigned i=0; ilctx, - data->samples_begin, - data->samples_size, + data->shuffled_samples_begin, + data->shuffled_samples_size, data->samples_count, data->tokens_data, data->tokens_size, @@ -2816,6 +2819,8 @@ void opt_callback(void * vdata, int accum_step, float * sched) { data->lora->shuffle_rng_state_current, data->samples_begin, data->samples_size, + data->shuffled_samples_begin, + data->shuffled_samples_size, data->samples_count); data->lora->shuffle_next_sample = 0; } @@ -3196,10 +3201,16 @@ int main(int argc, char ** argv) { lora.shuffle_next_sample = 0; lora.shuffle_samples_hash = shuffle_samples_hash; } + std::vector train_shuffled_samples_begin; + std::vector train_shuffled_samples_size; + train_shuffled_samples_begin.resize(train_samples_begin.size()); + train_shuffled_samples_size.resize(train_samples_size.size()); lora.shuffle_rng_state_next = shuffle_samples( lora.shuffle_rng_state_current, train_samples_begin.data(), train_samples_size.data(), + train_shuffled_samples_begin.data(), + train_shuffled_samples_size.data(), train_samples_size.size()); printf("%s: begin training\n", __func__); @@ -3215,6 +3226,8 @@ int main(int argc, char ** argv) { opt_cb_data.tokens_size = train_tokens.size(); opt_cb_data.samples_begin = train_samples_begin.data(); opt_cb_data.samples_size = train_samples_size.data(); + opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data(); + opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); opt_cb_data.samples_count = train_samples_size.size(); opt_cb_data.tokens_input = tokens_input; opt_cb_data.target_probs = target_probs; From d88dae2980c7d01590eb60afe653cc04fe34ee96 Mon Sep 17 00:00:00 2001 From: xaedes Date: Thu, 14 Sep 2023 18:39:46 +0200 Subject: [PATCH 187/235] block tiling for out-prod inspired by mul-mat block sizes are empirically optimized roughly doubles the flops of out-prod --- ggml.c | 75 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/ggml.c b/ggml.c index f88aba042e8d7..e88c046aa6159 100644 --- a/ggml.c +++ b/ggml.c @@ -11817,49 +11817,60 @@ static void ggml_compute_forward_out_prod_f32( const int64_t ir0 = dr*ith; const int64_t ir1 = MIN(ir0 + dr, nr); - 
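The hunk below replaces the flat row loop with two levels of blocking, blck_1 = 16 rows by blck_0 = 32 inner indices, so the touched source rows stay cache-hot across iterations. The tiling pattern in isolation, as a generic sketch with placeholder data rather than ggml's real tensor strides:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // generic loop-tiling skeleton in the spirit of the out_prod change:
    // process rows in blocks of blck_1 and the inner dimension in blocks
    // of blck_0; the block sizes are the ones picked empirically in the patch
    static void tiled_accumulate(std::vector<float> & dst,
                                 const std::vector<float> & src,
                                 int64_t nr, int64_t ne01) {
        const int64_t blck_0 = 32;
        const int64_t blck_1 = 16;
        for (int64_t bir = 0; bir < nr; bir += blck_1) {
            const int64_t bir1 = std::min(bir + blck_1, nr);
            for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) {
                const int64_t bne01 = std::min(bi01 + blck_0, ne01);
                for (int64_t ir = bir; ir < bir1; ++ir) {
                    for (int64_t i01 = bi01; i01 < bne01; ++i01) {
                        dst[ir] += src[i01]; // stand-in for the ggml_vec_mad work
                    }
                }
            }
        }
    }

    int main() {
        std::vector<float> dst(100, 0.0f);
        std::vector<float> src(300, 1.0f);
        tiled_accumulate(dst, src, 100, 300);
        return dst[0] == 300.0f ? 0 : 1; // every row sums all 300 inner entries
    }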
for (int64_t ir = ir0; ir < ir1; ++ir) { - // dst indices - const int64_t i3 = ir/(ne2*ne1); - const int64_t i2 = (ir - i3*ne2*ne1)/ne1; - const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + // block-tiling attempt + const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32); + const int64_t blck_1 = 16; - const int64_t i02 = i2; - const int64_t i03 = i3; + for (int64_t bir = ir0; bir < ir1; bir += blck_1) { + const int64_t bir1 = MIN(bir + blck_1, ir1); + for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) { + const int64_t bne01 = MIN(bi01 + blck_0, ne01); + for (int64_t ir = bir; ir < bir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + + const int64_t i02 = i2; + const int64_t i03 = i3; + + //const int64_t i10 = i1; + const int64_t i12 = i2; + const int64_t i13 = i3; - //const int64_t i10 = i1; - const int64_t i12 = i2; - const int64_t i13 = i3; #if GGML_VEC_MAD_UNROLL > 2 - const int64_t ne01_unroll = ne01 - (ne01 % GGML_VEC_MAD_UNROLL); - for (int64_t i01 = 0; i01 < ne01_unroll; i01 += GGML_VEC_MAD_UNROLL) { - const int64_t i11 = i01; + const int64_t bne01_unroll = bne01 - (bne01 % GGML_VEC_MAD_UNROLL); + for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) { + const int64_t i11 = i01; - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1); - } - for (int64_t i01 = ne01_unroll; i01 < ne01; ++i01) { - const int64_t i11 = i01; + ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1); + } + for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) { + const int64_t i11 = i01; - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - ggml_vec_mad_f32(ne0, d, s0, *s1); - } + ggml_vec_mad_f32(ne0, d, s0, *s1); + } #else - for (int64_t i01 = 0; i01 < ne01; ++i01) { - const int64_t i11 = i01; + for (int64_t i01 = bi01; i01 < bne01; ++i01) { + const int64_t i11 = i01; - float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); - float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - ggml_vec_mad_f32(ne0, d, s0, *s1); - } + ggml_vec_mad_f32(ne0, d, s0, *s1); + } #endif + } + } } From 76804fab1ddb008b8bd4bd22d0f924dcd16163e0 Mon Sep 17 00:00:00 2001 From: xaedes 
Date: Thu, 14 Sep 2023 22:18:20 +0200 Subject: [PATCH 188/235] exclude some more known zero values from computations in flash_attn_f32 & flash_attn_back_f32 --- ggml.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index e88c046aa6159..35e067c689e89 100644 --- a/ggml.c +++ b/ggml.c @@ -14540,7 +14540,8 @@ static void ggml_compute_forward_flash_attn_f32( S[i] = -INFINITY; } - for (int64_t ic = 0; ic < nek1; ++ic) { + const int64_t masked_begin = masked ? (P + iq1 + 1) : M; + for (int64_t ic = 0; ic < masked_begin; ++ic) { // k indices const int ik3 = iq3; const int ik2 = iq2 % nek2; @@ -14556,9 +14557,8 @@ static void ggml_compute_forward_flash_attn_f32( } // scale - ggml_vec_scale_f32(nek1, S, scale); + ggml_vec_scale_f32(masked_begin, S, scale); - const int64_t masked_begin = masked ? (P + iq1 + 1) : M; for (int64_t i = masked_begin; i < M; i++) { S[i] = -INFINITY; } @@ -15195,7 +15195,8 @@ static void ggml_compute_forward_flash_attn_back_f32( S[i] = -INFINITY; } - for (int64_t ic = 0; ic < nek1; ++ic) { + const int64_t masked_begin = masked ? (P + iq1 + 1) : M; + for (int64_t ic = 0; ic < masked_begin; ++ic) { // k indices const int ik1 = ic; @@ -15209,9 +15210,8 @@ static void ggml_compute_forward_flash_attn_back_f32( } // scale - ggml_vec_scale_f32(nek1, S, scale); + ggml_vec_scale_f32(masked_begin, S, scale); - const int64_t masked_begin = masked ? (P + iq1 + 1) : M; for (int64_t i = masked_begin; i < M; i++) { S[i] = -INFINITY; } From 4f2ce91b9e050faa04ab7d73e2bcf7e2d8c6f667 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 15 Sep 2023 23:25:13 +0200 Subject: [PATCH 189/235] add static keywords --- examples/finetune/finetune.cpp | 272 +++++++++--------- .../train-text-from-scratch.cpp | 234 +++++++-------- 2 files changed, 253 insertions(+), 253 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 37be889d36952..584ada1817d43 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -33,39 +33,39 @@ struct random_uniform_distribution { std::uniform_real_distribution rd; }; -void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { +static void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { rnd->gen = std::mt19937(seed); rnd->rd = std::normal_distribution{mean, std}; rnd->min = min; rnd->max = max; } -void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { +static void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { rnd->gen = std::mt19937(seed); rnd->rd = std::uniform_real_distribution{min, max}; } -int clamp(const int v, const int min, const int max) { +static int clamp(const int v, const int min, const int max) { return ((v < min) ? (min) : (v > max) ? (max) : v); } -float fclamp(const float v, const float min, const float max) { +static float fclamp(const float v, const float min, const float max) { return ((v < min) ? (min) : (v > max) ? 
(max) : v); } -float frand() { +static float frand() { return (float)rand()/(float)RAND_MAX; } -float frand_normal(struct random_normal_distribution * rnd) { +static float frand_normal(struct random_normal_distribution * rnd) { return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); } -float frand_uniform(struct random_uniform_distribution * rnd) { +static float frand_uniform(struct random_uniform_distribution * rnd) { return rnd->rd(rnd->gen); } -struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { +static struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier switch (tensor->n_dims) { case 1: @@ -114,7 +114,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct return tensor; } -struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { +static struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { switch (tensor->n_dims) { case 1: for (int i0 = 0; i0 < tensor->ne[0]; i0++) { @@ -299,96 +299,96 @@ struct my_llama_lora { }; // gguf constants -const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; -const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; -const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; -const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; -const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; -const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; -const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; -const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; -const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; -const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; -const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; -const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; -const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; -const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; - -const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; -const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; -const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; - -const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = 
"optimizer.lbfgs.past_loss_values"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; - -const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; -const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; -const char * LLM_KV_TRAINING_TYPE = "training.type"; -const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; -const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; -const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; -const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; -const char * LLM_KV_TRAINING_EPOCH_COUNT = "training.epoch_count"; -const char * LLM_KV_TRAINING_SAMPLES_HASH = "training.samples_hash"; -const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash"; -const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE = "training.shuffle.rng_state"; -const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count"; -const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE = "training.shuffle.next_sample"; - -const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"; -const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"; -const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"; -const char * LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"; -const char * LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"; -const char * LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"; -const char * LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"; -const char * LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"; -const char * LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"; -const char * LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"; -const char * LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"; -const char * LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"; +static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; +static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; +static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; +static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; +static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; +static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; +static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; +static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; +static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; +static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; +static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; +static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; +static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = 
"optimizer.lbfgs.line_search_j"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; +static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; + +static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; +static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; +static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; + +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; + +static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; +static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; +static const char * LLM_KV_TRAINING_TYPE = "training.type"; +static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; +static const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; +static const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; +static const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; +static const char * LLM_KV_TRAINING_EPOCH_COUNT = "training.epoch_count"; +static const char * LLM_KV_TRAINING_SAMPLES_HASH = "training.samples_hash"; +static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash"; +static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE = "training.shuffle.rng_state"; +static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count"; +static const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE = "training.shuffle.next_sample"; + +static const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"; +static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"; +static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"; +static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"; +static const char * LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"; +static const char * LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"; +static 
const char * LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"; +static const char * LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"; // gguf constants (sync with gguf.py) -const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; -const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; - -const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; -const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; -const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; -const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; -const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; -const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; -const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; -const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp -const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; - -const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; -const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; -const char * LLM_TENSOR_OUTPUT = "output"; -const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; -const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; -const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; -const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; -const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; -const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; -const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; -const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; -const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; - -void print_params(struct my_llama_hparams * params) { +static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; +static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; + +static const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; +static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; +static const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; +static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; +static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; +static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; +static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; +static const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp +static const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; + +static const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; +static const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; +static const char * LLM_TENSOR_OUTPUT = "output"; +static const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; +static const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; +static const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; +static const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; +static const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; +static const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; +static const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; +static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; +static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; + +static void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %u\n", __func__, params->n_vocab); printf("%s: n_ctx: %u\n", __func__, params->n_ctx); printf("%s: n_embd: %u\n", __func__, params->n_embd); @@ -398,7 +398,7 @@ void print_params(struct my_llama_hparams * params) { printf("%s: n_rot: %u\n", __func__, 
params->n_rot); } -void print_lora_params(struct my_llama_lora_hparams * params) { +static void print_lora_params(struct my_llama_lora_hparams * params) { printf("%s: n_rank_attention_norm : %u\n", __func__, params->n_rank_attention_norm); printf("%s: n_rank_wq : %u\n", __func__, params->n_rank_wq); printf("%s: n_rank_wk : %u\n", __func__, params->n_rank_wk); @@ -416,7 +416,7 @@ void print_lora_params(struct my_llama_lora_hparams * params) { printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale); } -void init_model(struct llama_model * input, struct my_llama_model * model, uint32_t n_ctx) { +static void init_model(struct llama_model * input, struct my_llama_model * model, uint32_t n_ctx) { auto & hparams = model->hparams; std::vector tn_buf; @@ -462,7 +462,7 @@ void init_model(struct llama_model * input, struct my_llama_model * model, uint3 } } -void set_param_lora(struct my_llama_lora * lora) { +static void set_param_lora(struct my_llama_lora * lora) { const uint32_t n_layer = lora->layers.size(); struct ggml_context* ctx = lora->ctx; @@ -498,7 +498,7 @@ void set_param_lora(struct my_llama_lora * lora) { } } -void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) { +static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) { const auto & lparams = lora->hparams; const uint32_t n_embd = model->hparams.n_embd; @@ -716,7 +716,7 @@ void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) -void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { +static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { const uint32_t n_layer = lora->layers.size(); struct random_normal_distribution rnd; @@ -755,25 +755,25 @@ void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std } } -void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { +static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { GGML_ASSERT(tensor->n_dims == 1); GGML_ASSERT(tensor->ne[0] == ne0); } -void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { +static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { GGML_ASSERT(tensor->n_dims == 2); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); } -void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { +static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { GGML_ASSERT(tensor->n_dims == 3); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[2] == ne2); } -void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { +static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { GGML_ASSERT(tensor->n_dims == 4); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); @@ -781,7 +781,7 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 GGML_ASSERT(tensor->ne[3] == ne3); } -struct ggml_tensor * llama_build_lora_finetune_graphs( +static struct ggml_tensor * llama_build_lora_finetune_graphs( struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_allocr * alloc, @@ -1019,7 +1019,7 @@ struct ggml_tensor * llama_build_lora_finetune_graphs( return t36; } -int get_example_targets_batch( +static int 
get_example_targets_batch( struct llama_context * lctx, const size_t * samples_begin, const size_t * samples_size, @@ -1229,7 +1229,7 @@ static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nu return count_utf8; } -size_t tokenize_file( +static size_t tokenize_file( struct llama_context * lctx, const char * filename, const std::string & sample_start, @@ -1419,7 +1419,7 @@ size_t tokenize_file( return out_tokens.size(); } -void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) { +static void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) { std::stringstream s_rng_state; s_rng_state.imbue(std::locale::classic()); s_rng_state.exceptions(std::stringstream::failbit); @@ -1427,19 +1427,19 @@ void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) { s_rng_state >> rng; } -std::string mt19937_get_state(const std::mt19937& rng) { +static std::string mt19937_get_state(const std::mt19937& rng) { std::stringstream s_rng_state; s_rng_state.imbue(std::locale::classic()); s_rng_state << rng; return s_rng_state.str(); } -std::string mt19937_seed_to_state(unsigned seed) { +static std::string mt19937_seed_to_state(unsigned seed) { std::mt19937 rng(seed); return mt19937_get_state(rng); } -std::string shuffle_samples( +static std::string shuffle_samples( const std::string & rng_state, const size_t * begins, const size_t * sizes, @@ -1480,7 +1480,7 @@ std::string shuffle_samples( return mt19937_get_state(rng); } -std::string replace_str(const char * s, const char * needle, const char * replacement) { +static std::string replace_str(const char * s, const char * needle, const char * replacement) { std::string str = s; size_t pos = str.find(needle); if (pos != std::string::npos) { @@ -1504,7 +1504,7 @@ std::string replace_str(const char * s, const char * needle, const char * replac } \ } -bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { +static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { GGML_ASSERT(a != NULL); GGML_ASSERT(b != NULL); GGML_ASSERT(a->type == b->type); @@ -1514,7 +1514,7 @@ bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { return true; } -void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { +static void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { if (dst == NULL) { return; } @@ -1527,7 +1527,7 @@ void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, co } } -void load_default_lora_params_from_base_model(const char * fn_base_model, struct my_llama_lora_hparams * lora_params) { +static void load_default_lora_params_from_base_model(const char * fn_base_model, struct my_llama_lora_hparams * lora_params) { if (strlen(fn_base_model) == 0) { return; } @@ -1558,7 +1558,7 @@ void load_default_lora_params_from_base_model(const char * fn_base_model, struct gguf_free(fctx); } -void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { +static void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read uint32_t file_version; @@ -1617,7 +1617,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g } } -void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * 
opt) { +static void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); @@ -1684,7 +1684,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * } } -void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) { +static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read std::string arch; @@ -1767,7 +1767,7 @@ void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_gg } } -void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) { +static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) { const char * arch = "llama"; enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; @@ -1834,7 +1834,7 @@ void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * mo } } -void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { +static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora); uint32_t file_version; @@ -1867,7 +1867,7 @@ void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * load_opt_context_gguf(fctx, f_ggml_ctx, opt); } -void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { +static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { save_llama_lora_gguf(fctx, model, lora); gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 1); @@ -1885,7 +1885,7 @@ void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model save_opt_context_gguf(fctx, opt); } -bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { +static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { struct ggml_context * f_ggml_ctx; struct gguf_init_params params; params.no_alloc = false; @@ -1901,7 +1901,7 @@ bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * mo return true; } -void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * pattern_it, int iteration, const char * latest) { +static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * pattern_it, int iteration, const char * latest) { std::string sit = (iteration >= 0) ? 
std::to_string(iteration) : std::string(latest); std::string fn = replace_str(filename, pattern_it, sit.c_str()); printf("%s: saving to %s\n", __func__, fn.c_str()); @@ -1915,7 +1915,7 @@ void save_checkpoint_lora_file(const char * filename, struct my_llama_model * mo gguf_free(fctx); } -void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) { +static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) { if (tensor == NULL) { file->write_u32(0); file->write_u32(0); @@ -1941,7 +1941,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const c file->write_raw(tensor->data, ggml_nbytes(tensor)); } -void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, const char * pattern_it, int iteration, const char * latest) { +static void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, const char * pattern_it, int iteration, const char * latest) { std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest); std::string fn = replace_str(filename, pattern_it, sit.c_str()); printf("%s: saving to %s\n", __func__, fn.c_str()); @@ -2002,7 +2002,7 @@ void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, cons } } -float cosine_decay(const int decay_steps, const float minimum, int step) { +static float cosine_decay(const int decay_steps, const float minimum, int step) { if (step > decay_steps) { step = decay_steps; } @@ -2011,7 +2011,7 @@ float cosine_decay(const int decay_steps, const float minimum, int step) { return decay; } -float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) { +static float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) { if (enable_restart) { while (step > decay_steps) { step -= decay_steps; @@ -2118,7 +2118,7 @@ struct train_params { float adam_eps_f; }; -struct train_params get_default_train_params() { +static struct train_params get_default_train_params() { struct train_params params; params.fn_model_base = ""; params.fn_train_data = "shakespeare.txt"; @@ -2216,7 +2216,7 @@ struct train_params get_default_train_params() { return params; } -void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) { +static void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) { fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); @@ -2289,7 +2289,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, "\n"); } -bool train_params_parse(int argc, char ** argv, struct train_params * params) { +static bool train_params_parse(int argc, char ** argv, struct train_params * params) { bool invalid_param = false; std::string arg; struct train_params default_params = get_default_train_params(); @@ -2683,7 +2683,7 @@ struct opt_callback_data { double millis_per_iter; }; -void print_duration(double fmillis) { +static void print_duration(double fmillis) { if (fmillis < 1000.0f) { printf("%.1fms", (float) fmillis); return; @@ -2706,7 +2706,7 @@ void print_duration(double fmillis) { printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds); } -void opt_callback(void * vdata, int accum_step, float * sched) { +static void opt_callback(void * vdata, int accum_step, float * sched) { struct 
opt_callback_data * data = (struct opt_callback_data *) vdata; struct train_params * params = data->params; struct ggml_opt_context * opt = data->opt; @@ -2826,7 +2826,7 @@ void opt_callback(void * vdata, int accum_step, float * sched) { } } -int64_t get_parameter_count(struct my_llama_lora* lora) { +static int64_t get_parameter_count(struct my_llama_lora* lora) { int64_t nx = 0; nx += ggml_nelements(lora->tok_embeddings_a); nx += ggml_nelements(lora->tok_embeddings_b); @@ -2859,11 +2859,11 @@ int64_t get_parameter_count(struct my_llama_lora* lora) { return nx; } -size_t hash_combine(size_t h1, size_t h2) { +static size_t hash_combine(size_t h1, size_t h2) { return h1 ^ (h2 << 1); } -size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) { +static size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) { std::hash h_string; std::hash h_ull; size_t h = h_string(std::string(fn)); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 745f8f21f6c0a..aa8fbd6b6bbd2 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -30,39 +30,39 @@ struct random_uniform_distribution { std::uniform_real_distribution rd; }; -void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { +static void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { rnd->gen = std::mt19937(seed); rnd->rd = std::normal_distribution{mean, std}; rnd->min = min; rnd->max = max; } -void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { +static void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { rnd->gen = std::mt19937(seed); rnd->rd = std::uniform_real_distribution{min, max}; } -int clamp(const int v, const int min, const int max) { +static int clamp(const int v, const int min, const int max) { return ((v < min) ? (min) : (v > max) ? (max) : v); } -float fclamp(const float v, const float min, const float max) { +static float fclamp(const float v, const float min, const float max) { return ((v < min) ? (min) : (v > max) ? 
(max) : v); } -float frand() { +static float frand() { return (float)rand()/(float)RAND_MAX; } -float frand_normal(struct random_normal_distribution * rnd) { +static float frand_normal(struct random_normal_distribution * rnd) { return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); } -float frand_uniform(struct random_uniform_distribution * rnd) { +static float frand_uniform(struct random_uniform_distribution * rnd) { return rnd->rd(rnd->gen); } -struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { +static struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier switch (tensor->n_dims) { case 1: @@ -111,7 +111,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct return tensor; } -struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { +static struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { switch (tensor->n_dims) { case 1: for (int i0 = 0; i0 < tensor->ne[0]; i0++) { @@ -208,88 +208,88 @@ struct my_llama_model { }; // gguf constants -const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; -const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; -const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; -const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; -const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; -const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; -const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; -const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; -const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; -const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; -const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; -const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; -const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; -const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; -const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; - -const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; -const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; -const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; - -const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = 
"optimizer.lbfgs.past_loss_values"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; -const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; - -const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; -const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; -const char * LLM_KV_TRAINING_TYPE = "training.type"; -const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; -const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; -const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; -const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; +static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; +static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; +static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; +static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; +static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; +static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; +static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; +static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; +static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; +static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; +static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; +static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; +static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; +static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; + +static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; +static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; +static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; + +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; +static const char * 
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; + +static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; +static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; +static const char * LLM_KV_TRAINING_TYPE = "training.type"; +static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; +static const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; +static const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; +static const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; // gguf constants (sync with gguf.py) -const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; -const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; - -const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; -const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; -const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; -const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; -const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; -const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; -const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; -const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp -const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; - -const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model"; -const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens"; -const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"; -const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores"; -const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges"; -const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"; -const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"; -const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"; -const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"; -const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"; - -const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; -const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; -const char * LLM_TENSOR_OUTPUT = "output"; -const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; -const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; -const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; -const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; -const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; -const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; -const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; -const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; -const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; - -void print_params(struct my_llama_hparams * params) { +static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; +static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; + +static const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; +static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; +static const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; +static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; +static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; +static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; +static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; +static const char * 
LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp +static const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; + +static const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model"; +static const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens"; +static const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"; +static const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores"; +static const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges"; +static const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"; +static const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"; +static const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"; +static const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"; +static const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"; + +static const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; +static const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; +static const char * LLM_TENSOR_OUTPUT = "output"; +static const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; +static const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; +static const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; +static const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; +static const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; +static const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; +static const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; +static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; +static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; + +static void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_embd: %d\n", __func__, params->n_embd); @@ -299,7 +299,7 @@ void print_params(struct my_llama_hparams * params) { printf("%s: n_rot: %d\n", __func__, params->n_rot); } -void init_model(struct my_llama_model * model) { +static void init_model(struct my_llama_model * model) { const auto & hparams = model->hparams; const uint32_t n_embd = hparams.n_embd; @@ -366,7 +366,7 @@ void init_model(struct my_llama_model * model) { } } -void set_param_model(struct my_llama_model * model) { +static void set_param_model(struct my_llama_model * model) { const auto& hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; @@ -392,7 +392,7 @@ void set_param_model(struct my_llama_model * model) { } } -void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { +static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { const auto & hparams = model->hparams; const uint32_t n_layer = hparams.n_layer; @@ -421,25 +421,25 @@ void randomize_model(struct my_llama_model * model, int seed, float mean, float } } -void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { +static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { GGML_ASSERT(tensor->n_dims == 1); GGML_ASSERT(tensor->ne[0] == ne0); } -void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { +static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { GGML_ASSERT(tensor->n_dims == 2); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); } -void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { +static void assert_shape_3d(struct 
ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { GGML_ASSERT(tensor->n_dims == 3); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[2] == ne2); } -void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { +static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { GGML_ASSERT(tensor->n_dims == 4); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); @@ -447,7 +447,7 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 GGML_ASSERT(tensor->ne[3] == ne3); } -struct ggml_tensor * llama_build_train_graphs( +static struct ggml_tensor * llama_build_train_graphs( struct my_llama_model * model, struct ggml_allocr * alloc, struct ggml_context * ctx, @@ -623,7 +623,7 @@ struct ggml_tensor * llama_build_train_graphs( return t36; } -void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { +static void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { int n_tokens = tokens_input->ne[0]; int n_vocab = target_logits->ne[0]; @@ -643,7 +643,7 @@ void get_example_targets(struct llama_context * lctx, const int * train_samples, } } -void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { +static void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(target_logits->n_dims == 3); GGML_ASSERT(target_probs->n_dims == 3); @@ -678,7 +678,7 @@ void get_example_targets_batch(struct llama_context * lctx, const int * train_sa } } -int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { +static int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { FILE * fp = std::fopen(filename, "rb"); if (fp == NULL) { return 0; @@ -749,7 +749,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto return n_tokens; } -void shuffle_ints(int * begin, int * end) { +static void shuffle_ints(int * begin, int * end) { if (end <= begin) return; int max=begin[0]; for (int i=1; itype == b->type); @@ -802,7 +802,7 @@ bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { return true; } -void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { +static void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { if (dst == NULL) { return; } @@ -815,7 +815,7 @@ void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, co } } -void load_opt_context_gguf(struct gguf_context * fctx, struct 
ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { +static void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read uint32_t file_version; @@ -876,7 +876,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g } } -void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { +static void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); @@ -943,7 +943,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * } } -void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) { +static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read std::string arch; @@ -1014,7 +1014,7 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g } } -void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) { +static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) { const char * arch = "llama"; enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; @@ -1157,7 +1157,7 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod } } -void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, const char * pattern_it, int iteration, const char * latest) { +static void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, const char * pattern_it, int iteration, const char * latest) { std::string sit = (iteration >= 0) ? 
std::to_string(iteration) : std::string(latest); std::string fn = replace_str(filename, pattern_it, sit.c_str()); printf("%s: saving to %s\n", __func__, fn.c_str()); @@ -1171,7 +1171,7 @@ void save_llama_model_file(const char * filename, const char * fn_vocab_model, s gguf_free(fctx); } -void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct ggml_opt_context * opt) { +static void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct ggml_opt_context * opt) { load_llama_model_gguf(fctx, f_ggml_ctx, model); if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) >= 0) { @@ -1193,7 +1193,7 @@ void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_gg } } -void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { +static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { save_llama_model_gguf(fctx, fn_vocab_model, model); gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0); @@ -1205,7 +1205,7 @@ void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_mode save_opt_context_gguf(fctx, opt); } -bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct ggml_opt_context * opt) { +static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct ggml_opt_context * opt) { struct ggml_context * f_ggml_ctx; struct gguf_init_params params; params.no_alloc = false; @@ -1220,7 +1220,7 @@ bool load_checkpoint_file(const char * filename, struct my_llama_model * model, return true; } -void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt, const char * pattern_it, int iteration, const char * latest) { +static void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt, const char * pattern_it, int iteration, const char * latest) { std::string sit = (iteration >= 0) ? 
std::to_string(iteration) : std::string(latest);
     std::string fn = replace_str(filename, pattern_it, sit.c_str());
     printf("%s: saving to %s\n", __func__, fn.c_str());
@@ -1234,7 +1234,7 @@ void save_checkpoint_file(const char * filename, const char * fn_vocab_model, st
     gguf_free(fctx);
 }
 
-float cosine_decay(const int decay_steps, const float minimum, int step) {
+static float cosine_decay(const int decay_steps, const float minimum, int step) {
     if (step > decay_steps) {
         step = decay_steps;
     }
@@ -1243,7 +1243,7 @@ float cosine_decay(const int decay_steps, const float minimum, int step) {
     return decay;
 }
 
-float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) {
+static float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) {
     if (enable_restart) {
         while (step > decay_steps) {
             step -= decay_steps;
@@ -1381,7 +1381,7 @@ struct train_params get_default_train_params() {
     return params;
 }
 
-void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+static void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
@@ -1442,7 +1442,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
     fprintf(stderr, "\n");
 }
 
-bool train_params_parse(int argc, char ** argv, struct train_params * params) {
+static bool train_params_parse(int argc, char ** argv, struct train_params * params) {
     bool invalid_param = false;
     std::string arg;
     struct train_params default_params = get_default_train_params();
@@ -1762,7 +1762,7 @@ struct opt_callback_data {
     struct ggml_tensor * target_probs;
 };
 
-void opt_callback(void * vdata, int accum_step, float * sched) {
+static void opt_callback(void * vdata, int accum_step, float * sched) {
     struct opt_callback_data * data = (struct opt_callback_data *) vdata;
     struct train_params * params = data->params;
     struct ggml_opt_context * opt = data->opt;

From cc60b3f6398613106cc6a994100c3f4011d40ed8 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Fri, 15 Sep 2023 23:45:05 +0200
Subject: [PATCH 190/235] remove commented-out old code

---
 examples/finetune/finetune.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 584ada1817d43..6e09de776595a 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -2790,7 +2790,6 @@ static void opt_callback(void * vdata, int accum_step, float * sched) {
             printf("-");
         }
         printf(">");
-        // printf("improvement: %*d>", impr_plot, (int)0);
         printf("\n");
     }
 
From ab56b63b27bee5ed259fb24477a9c625640878e2 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Fri, 15 Sep 2023 23:45:54 +0200
Subject: [PATCH 191/235] update train-text-from-scratch with tokenization,
 sample selection and shuffling from finetune

---
 .../train-text-from-scratch.cpp               | 854 ++++++++++++++----
 1 file changed, 672 insertions(+), 182 deletions(-)

diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index aa8fbd6b6bbd2..8e06838ad3922 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -202,9 +202,16 @@ struct my_llama_model {
 
     std::vector<my_llama_layer> layers;
 
-    uint32_t train_its = 0;
-    uint32_t train_samples = 0;
-    uint32_t train_tokens = 0;
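+    // The fields added below track training progress and sample shuffling.
+    // A minimal sketch of how they are meant to evolve (illustration only;
+    // the exact bookkeeping happens in the training code that consumes them):
+    //
+    //   model->shuffle_next_sample += used_samples;   // after each batch
+    //   if (model->shuffle_next_sample >= model->shuffle_sample_count) {
+    //       ++model->train_epochs;                    // one full data pass
+    //       model->shuffle_rng_state_current = model->shuffle_rng_state_next;
+    //       // re-shuffle the sample order, then reset shuffle_next_sample to 0
+    //   }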
+ uint64_t train_its = 0; + uint64_t train_samples = 0; + uint64_t train_tokens = 0; + uint64_t train_epochs = 0; + + size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes) + std::string shuffle_rng_state_current; + std::string shuffle_rng_state_next; + size_t shuffle_sample_count; + size_t shuffle_next_sample; }; // gguf constants @@ -242,13 +249,19 @@ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer. static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; -static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; -static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; -static const char * LLM_KV_TRAINING_TYPE = "training.type"; -static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; -static const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; -static const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; -static const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; +static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; +static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; +static const char * LLM_KV_TRAINING_TYPE = "training.type"; +static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; +static const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; +static const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; +static const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; +static const char * LLM_KV_TRAINING_EPOCH_COUNT = "training.epoch_count"; +static const char * LLM_KV_TRAINING_SAMPLES_HASH = "training.samples_hash"; +static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash"; +static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE = "training.shuffle.rng_state"; +static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count"; +static const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE = "training.shuffle.next_sample"; // gguf constants (sync with gguf.py) @@ -312,6 +325,7 @@ static void init_model(struct my_llama_model * model) { model->train_its = 0; model->train_samples = 0; model->train_tokens = 0; + model->train_epochs = 0; std::vector tn_buf; tn_buf.resize(GGML_MAX_NAME); @@ -623,148 +637,450 @@ static struct ggml_tensor * llama_build_train_graphs( return t36; } -static void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { - int n_tokens = tokens_input->ne[0]; - int n_vocab = target_logits->ne[0]; - - size_t sample = train_samples[example_id % n_train_samples]; - GGML_ASSERT(sample+n_tokens-1 < n_train_data); +static int get_example_targets_batch( + struct llama_context * lctx, + const size_t * samples_begin, + const size_t * samples_size, + size_t samples_count, + const llama_token * train_data, + size_t n_train_data, + int example_id, + struct ggml_tensor * tokens_input, + struct ggml_tensor * target_probs, + bool separate_with_eos, + bool separate_with_bos, + bool fill_with_next_samples) { - ggml_set_f32(target_logits, -1.0f/n_vocab); - ggml_set_f32(target_probs, 0.0f); - ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx)); - 
for (int i=1; in_dims == 2); - GGML_ASSERT(target_logits->n_dims == 3); GGML_ASSERT(target_probs->n_dims == 3); - int n_vocab = target_logits->ne[0]; + int n_vocab = target_probs->ne[0]; int n_tokens = tokens_input->ne[0]; int n_batch = tokens_input->ne[1]; - GGML_ASSERT(n_tokens == target_logits->ne[1]); - GGML_ASSERT(n_batch == target_logits->ne[2]); GGML_ASSERT(n_vocab == target_probs->ne[0]); GGML_ASSERT(n_tokens == target_probs->ne[1]); GGML_ASSERT(n_batch == target_probs->ne[2]); - ggml_set_f32(target_logits, -1.0f/n_vocab); + int used_samples = 0; + ggml_set_f32(target_probs, 0.0f); + int bos = llama_token_bos(lctx); + int eos = llama_token_eos(lctx); // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); for (int k=0; k= sample_size && fill_with_next_samples) { + if (!sample_separation_eos) { + // insert eos token to separate samples + sample_separation_eos = true; + } else if (!sample_separation_bos) { + // insert bos token to separate samples + sample_separation_bos = true; + token = bos; + } else { + // sample separation is done, continue with next sample + sample_separation_eos = !separate_with_eos; + sample_separation_bos = !separate_with_bos; + sample_offs = 0; + sample_idx = (example_id + used_samples) % samples_count; + sample_begin = samples_begin[sample_idx]; + sample_size = samples_size[sample_idx]; + ++used_samples; + } + } + // note: no else-if here + if (sample_offs < sample_size) { + token = clamp(train_data[sample_begin+sample_offs], 0, n_vocab-1); + ++sample_offs; + } + ggml_set_f32_nd(target_probs, token, i, k, 0, +1.0f); + if (i+1& out) { - FILE * fp = std::fopen(filename, "rb"); - if (fp == NULL) { - return 0; - } + return used_samples; +} -#ifdef _WIN32 - GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_END) == 0); +#ifdef __GNUC__ +#ifdef __MINGW32__ +__attribute__((format(gnu_printf, 1, 2))) #else - GGML_ASSERT(std::fseek(fp, (long) 0, SEEK_END) == 0); +__attribute__((format(printf, 1, 2))) #endif +#endif +static std::string format(const char * fmt, ...) 
{
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
 
-    size_t size = 0;
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            size = 0;
+        } else {
+            seek(0, SEEK_END);
+            size = tell();
+            seek(0, SEEK_SET);
+        }
+    }
+
+    size_t tell() const {
 #ifdef _WIN32
-    __int64 ret = _ftelli64(fp);
-    size = ret;
+        __int64 ret = _ftelli64(fp);
 #else
-    long ret = std::ftell(fp);
-    size = ret;
+        long ret = std::ftell(fp);
 #endif
+        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
 
+    void seek(size_t offset, int whence) {
 #ifdef _WIN32
-    GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_SET) == 0);
+        int ret = _fseeki64(fp, (__int64) offset, whence);
 #else
-    GGML_ASSERT(std::fseek(fp, (long) 0, SEEK_SET) == 0);
+        int ret = std::fseek(fp, (long) offset, whence);
 #endif
+        GGML_ASSERT(ret == 0); // same
+    }
 
-    std::vector buf;
-    buf.resize(size+1);
-    out.resize(size+1);
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
+        }
+        if (ret != 1) {
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+        }
+    }
 
-    if (std::fread(buf.data(), size, 1, fp) != 1) {
-        die("unexpectedly reached end of file");
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
     }
-    if (ferror(fp)) {
-        die_fmt("fread failed: %s", strerror(errno));
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
     }
-    buf[size] = '\0';
 
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
 
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
-    if (n_tokens < 0) {
-        out.resize(-n_tokens);
-        n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+static size_t utf8_len(char src) {
+    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+    return lookup[highbits];
+}
+
+// mark each byte with its utf8 unit number.
+// returns the number of utf8 characters.
+// e.g. when bytes == '\x61\xD0\xB0\x62',
+// then utf8_units will become [0,0,1,0]
+// utf8_nunits will become [1,2,2,1] and 3 is returned.
+// bytes where utf8_units is zero are the beginning of an utf8 character.
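+// a small usage sketch (illustration only, mirroring the example above):
+//
+//   const char bytes[] = "\x61\xD0\xB0\x62"; // 'a', U+0430 (2 bytes), 'b'
+//   int units[4];
+//   int nunits[4];
+//   size_t n_chars = mark_utf8_units(bytes, units, nunits, 4);
+//   // n_chars == 3, units == {0,0,1,0}, nunits == {1,2,2,1}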
+static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) { + size_t offs = 0; + size_t count_utf8 = 0; + while(offs < count) { + size_t len = utf8_len(bytes[offs]); + for (size_t i=0; i= 0); - out.resize(n_tokens); - - bool verify = false; - if (verify) { - const char * in = buf.data(); - const char * end = buf.data() + buf.size(); - for (int i = 0; i < (int) out.size(); ++i) { - std::string s = llama_token_to_piece(lctx, out[i]); - int len = s.length(); - if (in >= end) { - printf("%s: unexpected end of original text.\n", __func__); - break; + return count_utf8; +} + +static size_t tokenize_file( + struct llama_context * lctx, + const char * filename, + const std::string & sample_start, + bool include_sample_start, + bool overlapping_samples, + unsigned context_length, + std::vector & out_tokens, + std::vector & out_samples_begin, + std::vector & out_samples_size) { + struct llama_file f(filename, "rb"); + + if (f.size == 0) { + out_tokens.clear(); + out_samples_begin.clear(); + out_samples_size.clear(); + printf("%s: warning: empty or not existing training data file '%s'\n", + __func__, filename); + return out_tokens.size(); + } + + // account for possible leading whitespace that will be added by tokenizer + // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12] + const int n_max_tokens_overhead = 1; + + std::vector buf; + buf.resize(f.size+1); + + f.read_raw(buf.data(), f.size); + buf[f.size] = '\0'; + + std::vector utf8_units; + std::vector utf8_nunits; + utf8_units.resize(buf.size()); + utf8_nunits.resize(buf.size()); + size_t n_utf8_chars = mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size()); + + if (sample_start.size() == 0) { + // tokenize all data at once + out_tokens.resize(buf.size() + n_max_tokens_overhead); + + int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false); + if (n_tokens < 0) { + out_tokens.resize(-n_tokens); + n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false); + } + if (n_tokens >= 0) { + out_tokens.resize(n_tokens); + } + + // generate sample starts at all token positions + out_samples_begin.clear(); + out_samples_begin.push_back(0); + out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size())); + size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0; + for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) { + out_samples_begin.push_back(sample_begin); + out_samples_size.push_back(context_length); + } + } else { + // split data into samples and tokenize each sample + std::string data_str(buf.data(), buf.size()-1); + out_samples_begin.clear(); + out_samples_size.clear(); + out_tokens.clear(); + + // find all positions of pattern sample_start + size_t sample_begin = data_str.find(sample_start, 0); + while (sample_begin != std::string::npos) { + out_samples_begin.push_back(sample_begin); + const size_t search_start = sample_begin + sample_start.size(); + sample_begin = data_str.find(sample_start, search_start); + } + if (out_samples_begin.size() == 0) { + printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n", + __func__, sample_start.c_str()); + out_samples_begin.push_back(0); + } + + out_samples_size.resize(out_samples_begin.size(), 0); + + std::vector buf_sample; + std::vector tok_sample; + + const size_t sample_begin_offset = (include_sample_start ? 
+ + std::vector<char> buf_sample; + std::vector<llama_token> tok_sample; + + const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size()); + size_t found_too_big_sample = 0; + size_t found_too_small_sample = 0; + size_t found_empty_sample = 0; + size_t found_min_sample_size = SIZE_MAX; + size_t found_max_sample_size = 0; + + size_t max_token_text_size = 0; + int n_vocab = llama_n_vocab(lctx); + for (llama_token token=0; token < n_vocab; ++token) { + max_token_text_size = std::max( + max_token_text_size, + strlen(llama_token_get_text(lctx, token))); + } + + // upper bound of context byte length. + // strings with this byte length should always tokenize to at least context_length tokens. + size_t context_byte_len = max_token_text_size*context_length; + + for (unsigned i=0; i<out_samples_begin.size(); ++i) { + // calculate sample begin and end from pattern positions + size_t sample_begin = out_samples_begin[i] + sample_begin_offset; + size_t sample_end = overlapping_samples + ? std::min( + data_str.size(), + sample_begin + context_byte_len) + : (i+1 < out_samples_begin.size()) + ? out_samples_begin[i+1] + : data_str.size(); + if (sample_end < utf8_units.size() && utf8_units[sample_end] > 0) { + // sample end is in the middle of an utf8 character. + // advance sample_end to the beginning of the next utf8 character. + sample_end += utf8_nunits[sample_end] - utf8_units[sample_end]; } - const bool matches = (strncmp(in, s.c_str(), len) == 0); - if (matches) { - in += len; + size_t sample_size = sample_end - sample_begin; + if (sample_size == 0) { + ++found_empty_sample; + } + + if (sample_size > 0) { + // llama_tokenize expects zero terminated string, + // copy sample into buffer and zero terminate it. + buf_sample.resize(sample_size+1); + memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size); + buf_sample[sample_size] = '\0'; + + // printf("sample: '%s'\n", buf_sample.data()); + + // tokenize the sample + tok_sample.resize(buf_sample.size() + n_max_tokens_overhead); + int n_tokens = llama_tokenize(lctx, + buf_sample.data(), + tok_sample.data(), + tok_sample.size(), false); + if (n_tokens < 0) { + tok_sample.resize(-n_tokens); + n_tokens = llama_tokenize(lctx, + buf_sample.data(), + tok_sample.data(), + tok_sample.size(), false); + GGML_ASSERT(n_tokens >= 0); + } + GGML_ASSERT(n_tokens <= tok_sample.size()); + + if ((size_t) n_tokens > context_length) { + ++found_too_big_sample; + } else if ((size_t) n_tokens < context_length) { + ++found_too_small_sample; + } + found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens); + found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens); + + // write out tokens, start and size of sample + // overwrite the string start position with the token start position + out_samples_begin[i] = out_tokens.size(); + out_samples_size[i] = (size_t) n_tokens; + out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens); } else { - printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str()); + out_samples_begin[i] = out_tokens.size(); + out_samples_size[i] = 0; } + + } + if (found_too_big_sample > 0) { + printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. 
samples will be cut off.\n", + __func__, found_too_big_sample, found_max_sample_size, context_length); + } + + if (found_too_small_sample > 0) { + printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n", + __func__, found_too_small_sample, found_min_sample_size, context_length); + } + + if (found_empty_sample) { + printf("%s: warning: found %zu empty samples.\n", + __func__, found_empty_sample); } } + printf("%s: total number of samples: %zu\n", + __func__, out_samples_begin.size()); - return n_tokens; + GGML_ASSERT(out_samples_begin.size() == out_samples_size.size()); + + return out_tokens.size(); } -static void shuffle_ints(int * begin, int * end) { - if (end <= begin) return; - int max=begin[0]; - for (int i=1; i max) { - max = begin[i]; +static void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) { + std::stringstream s_rng_state; + s_rng_state.imbue(std::locale::classic()); + s_rng_state.exceptions(std::stringstream::failbit); + s_rng_state.str(rng_state); + s_rng_state >> rng; +} + +static std::string mt19937_get_state(const std::mt19937& rng) { + std::stringstream s_rng_state; + s_rng_state.imbue(std::locale::classic()); + s_rng_state << rng; + return s_rng_state.str(); +} + +static std::string mt19937_seed_to_state(unsigned seed) { + std::mt19937 rng(seed); + return mt19937_get_state(rng); +} + +static std::string shuffle_samples( + const std::string & rng_state, + const size_t * begins, + const size_t * sizes, + size_t * shuffled_begins, + size_t * shuffled_sizes, + size_t count) { + if (count == 0) return rng_state; + + std::mt19937 rng; + mt19937_set_state(rng, rng_state); + + // sort indices by random value for each index + std::vector idcs; + { + std::vector rnd; + idcs.resize(count); + rnd.resize(count); + for (unsigned i=0; i vals; - vals.resize(max+1); - for (int i=0; i= 0) { uint32_t file_version = 0xFFFFFFFFu; GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); - GGML_ASSERT(file_version == 0); + GGML_ASSERT(file_version <= 1); std::string train_type = LLM_KV_TRAINING_TYPE_TRAIN_MODEL; GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_TRAIN_MODEL); - GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); - GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); - GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); + if (file_version == 0) { + + GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); + + } else if (file_version == 1) { + + GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_TOKEN_COUNT); + GGUF_GET_KEY(fctx, model->train_epochs, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_EPOCH_COUNT); + + 
GGUF_GET_KEY(fctx, model->shuffle_samples_hash, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH); + GGUF_GET_KEY(fctx, model->shuffle_rng_state_current, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_SHUFFLE_RNG_STATE); + GGUF_GET_KEY(fctx, model->shuffle_sample_count, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, model->shuffle_next_sample, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE); + } load_opt_context_gguf(fctx, f_ggml_ctx, opt); } else { @@ -1196,11 +1527,17 @@ static void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { save_llama_model_gguf(fctx, fn_vocab_model, model); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 1); gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_ITERATION_COUNT, model->train_its); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, model->train_samples); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_TOKEN_COUNT, model->train_tokens); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, model->train_its); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, model->train_samples); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT, model->train_tokens); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_EPOCH_COUNT, model->train_epochs); + + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH, (uint64_t) model->shuffle_samples_hash); + gguf_set_val_str(fctx, LLM_KV_TRAINING_SHUFFLE_RNG_STATE, model->shuffle_rng_state_current.c_str()); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT, (uint64_t) model->shuffle_sample_count); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE, (uint64_t) model->shuffle_next_sample); save_opt_context_gguf(fctx, opt); } @@ -1283,12 +1620,21 @@ struct train_params { int print_info_interval; - bool samples_start_after_nl; bool use_adam; bool use_flash; bool use_checkpointing; bool use_alloc; + std::string sample_start; + bool include_sample_start; + bool escape; + bool overlapping_samples; + bool fill_with_next_samples; + bool separate_with_eos; + bool separate_with_bos; + + bool force_reshuffle; + // only adam int warmup; int cos_decay_steps; @@ -1347,12 +1693,20 @@ struct train_params get_default_train_params() { params.print_info_interval = 1; - params.samples_start_after_nl = false; params.use_adam = true; params.use_flash = true; params.use_checkpointing = true; params.use_alloc = true; + params.sample_start = ""; + params.include_sample_start = false; + params.escape = false; + params.overlapping_samples = false; + params.fill_with_next_samples = false; + params.separate_with_eos = false; + params.separate_with_bos = true; + params.force_reshuffle = false; + params.opt_past = 0; params.opt_delta = 1e-5f; params.opt_max_no_improvement = 0; @@ -1408,7 +1762,16 @@ static void train_print_usage(int /*argc*/, char ** argv, const struct train_par fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); fprintf(stderr, " --grad-acc N Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation); fprintf(stderr, " --print-info-interval N Print infos during 
training each N examples (default %d)\n", params->print_info_interval); - fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off"); + fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str()); + fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n"); + fprintf(stderr, " --escape process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); + fprintf(stderr, " --overlapping-samples Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n"); + fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n"); + fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : ""); + fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : ""); + fprintf(stderr, " --no-separate-with-eos When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : ""); + fprintf(stderr, " --no-separate-with-bos When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : ""); + fprintf(stderr, " --force-reshuffle Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n"); fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); fprintf(stderr, " --no-flash Don't use flash attention \n"); @@ -1586,8 +1949,30 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par break; } params->print_info_interval = std::stoi(argv[i]); - } else if (arg == "--samples-after-nl") { - params->samples_start_after_nl = true; + } else if (arg == "--sample-start") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->sample_start = std::string(argv[i]); + } else if (arg == "--escape") { + params->escape = true; + } else if (arg == "--include-sample-start") { + params->include_sample_start = true; + } else if (arg == "--overlapping-samples") { + params->overlapping_samples = true; + } else if (arg == "--fill-with-next-samples") { + params->fill_with_next_samples = true; + } else if (arg == "--separate-with-eos") { + params->separate_with_eos = true; + } else if (arg == "--separate-with-bos") { + params->separate_with_bos = true; + } else if (arg == "--no-separate-with-eos") { + params->separate_with_eos = false; + } else if (arg == "--no-separate-with-bos") { + params->separate_with_bos = false; + } else if (arg == "--force-reshuffle") { + params->force_reshuffle = true; } else if (arg == "--use-lbfgs") { params->use_adam = false; } else if (arg == "--use-adam") { @@ -1742,6 +2127,9 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par train_print_usage(argc, argv, &default_params); exit(1); } + if (params->escape) { + process_escapes(params->sample_start); + } return true; } @@ -1754,14 +2142,42 @@ struct opt_callback_data { int last_save_iter; 
llama_token * tokens_data; size_t tokens_size; - int * samples_data; - size_t samples_size; - int shuffle_countdown; + size_t * samples_begin; + size_t * samples_size; + size_t * shuffled_samples_begin; + size_t * shuffled_samples_size; + size_t samples_count; struct ggml_tensor * tokens_input; struct ggml_tensor * target_logits; struct ggml_tensor * target_probs; + int first_iter; + int64_t last_time; + double millis_per_iter; }; +static void print_duration(double fmillis) { + if (fmillis < 1000.0f) { + printf("%.1fms", (float) fmillis); + return; + } + const int64_t one_sec = 1000; + const int64_t one_min = one_sec * 60; + const int64_t one_hour = one_min * 60; + const int64_t one_day = one_hour * 24; + + int64_t millis = (int64_t) fmillis; + int64_t days = millis/one_day; + int64_t hours = (millis - days*one_day)/one_hour; + int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min; + int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec; + + // to print int64_t either cast to (long long int) or use macro PRId64 from + if (days > 0) { + printf("%lldd ", (long long int) days); + } + printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds); +} + static void opt_callback(void * vdata, int accum_step, float * sched) { struct opt_callback_data * data = (struct opt_callback_data *) vdata; struct train_params * params = data->params; @@ -1770,6 +2186,27 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { int n_ctx = params->n_ctx; if (accum_step == 0) { + // time measurement + int64_t now = ggml_time_ms(); + if (now > data->last_time && opt->iter > data->first_iter) { + double dt = now - data->last_time; + if (data->millis_per_iter == 0.0) { + data->millis_per_iter = dt; + } else { + const double gain = 0.7; + data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain; + } + } + + double remaining_millis = 0.0; + if (data->millis_per_iter > 0.0) { + const int n_iter = params->use_adam ? params->adam_n_iter : params->lbfgs_n_iter; + const int done_iter = opt->iter - data->first_iter; + const int remaining_iter = n_iter - done_iter; + remaining_millis = remaining_iter * data->millis_per_iter; + } + + // file saving const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); if (save_now) { int new_iters = opt->iter - data->last_save_iter; @@ -1789,6 +2226,9 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { data->last_save_iter = opt->iter; } + // exclude file saving from time measurement, by measuring last_time after saving + data->last_time = ggml_time_ms(); + *sched = (opt->iter < params->warmup) ? (float) opt->iter / (float) params->warmup : cosine_decay_restart( @@ -1800,32 +2240,79 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { float min_sched = params->adam_min_alpha / params->adam_alpha; *sched = min_sched + *sched * (1.0f - min_sched); - int impr_plot = std::isnan(opt->loss_after) ? 
0 : -std::lround(1 + (opt->loss_before - opt->loss_after) * 10.0f); - printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); + int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); + if (impr_plot > 0) impr_plot = 0; + if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0; + printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f", + __func__, opt->iter, std::min(1+data->model->shuffle_next_sample, data->model->shuffle_sample_count), data->model->shuffle_sample_count, + *sched, opt->loss_after); - } - if (data->shuffle_countdown < n_batch) { - printf("%s: reshuffle samples\n", __func__); - shuffle_ints(data->samples_data, data->samples_data + data->samples_size); - for (int i = 0; i < (int) data->samples_size; ++i) { - GGML_ASSERT(data->samples_data[i]+params->n_ctx-1 < (int) data->tokens_size); + if (data->millis_per_iter > 0) { + printf(" dt="); + print_duration(data->millis_per_iter); + printf(" eta="); + print_duration(remaining_millis); + } + + float improvement = opt->loss_before - opt->loss_after; + const float plot_scale = 10.0f; + int bar_len = (int)(1 + improvement*plot_scale + 0.5); + printf(" |"); + for (int i=0; i<bar_len; ++i) { + printf("-"); + } - } - data->shuffle_countdown = data->samples_size; + printf(">"); + printf("\n"); } - get_example_targets_batch( + int used_samples = get_example_targets_batch( data->lctx, - data->samples_data, - data->samples_size, + data->shuffled_samples_begin, + data->shuffled_samples_size, + data->samples_count, data->tokens_data, data->tokens_size, - opt->iter*params->n_gradient_accumulation + accum_step, + data->model->shuffle_next_sample, data->tokens_input, - data->target_logits, - data->target_probs); + data->target_probs, + params->separate_with_eos, + params->separate_with_bos, + params->fill_with_next_samples); + + data->model->shuffle_next_sample += used_samples; + + if (data->model->shuffle_next_sample >= data->model->shuffle_sample_count) { + ++data->model->train_epochs; + printf("%s: reshuffle samples. 
completed epochs: %llu\n", __func__, (long long unsigned) data->model->train_epochs); + // note: we may have used some samples from the current shuffling more than once + data->model->shuffle_rng_state_current = data->model->shuffle_rng_state_next; + data->model->shuffle_rng_state_next = shuffle_samples( + data->model->shuffle_rng_state_current, + data->samples_begin, + data->samples_size, + data->shuffled_samples_begin, + data->shuffled_samples_size, + data->samples_count); + data->model->shuffle_next_sample = 0; + } + +} + +static size_t hash_combine(size_t h1, size_t h2) { + return h1 ^ (h2 << 1); +} - data->shuffle_countdown -= n_batch; +static size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) { + std::hash<std::string> h_string; + std::hash<unsigned long long> h_ull; + size_t h = h_string(std::string(fn)); + h = hash_combine(h, h_ull((unsigned long long) sample_count)); + for (size_t i=0; i< sample_count; ++i) { + h = hash_combine(h, h_ull((unsigned long long) samples_begin[i])); + h = hash_combine(h, h_ull((unsigned long long) samples_size[i])); + } + return h; } int main(int argc, char ** argv) { @@ -1847,13 +2334,6 @@ int main(int argc, char ** argv) { struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); - printf("%s: tokenize training data\n", __func__); - std::vector<llama_token> train_tokens; - if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { - fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); - } - printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); - struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lctx); model.hparams.n_ctx = params.n_ctx; @@ -1869,24 +2349,6 @@ int main(int argc, char ** argv) { print_params(&model.hparams); - std::vector<size_t> token_noccurs; - std::vector<bool> token_notavail; - token_noccurs.resize(model.hparams.n_vocab, 0); - token_notavail.resize(model.hparams.n_vocab, true); - for (int i = 0; i < (int) train_tokens.size(); ++i) { - ++token_noccurs[train_tokens[i]]; - token_notavail[train_tokens[i]] = false; - } - - std::vector<float> token_freq; - token_freq.resize(model.hparams.n_vocab, 0); - int n_unique_tokens = 0; - for (int i = 0; i < (int) token_noccurs.size(); ++i) { - token_freq[i] = (float) token_noccurs[i] / (float) train_tokens.size(); - n_unique_tokens += (token_noccurs[i] > 0) ? 
1 : 0; - } - printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); - struct ggml_init_params lcparams; lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); lcparams.mem_buffer = NULL; @@ -1965,19 +2427,48 @@ int main(int argc, char ** argv) { alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); } - GGML_ASSERT(n_tokens < (int) train_tokens.size()); - std::vector<int> train_samples; - train_samples.push_back(0); - for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) { - if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) { - train_samples.push_back(i); - } + std::vector<llama_token> train_tokens; + std::vector<size_t> train_samples_begin; + std::vector<size_t> train_samples_size; + printf("%s: tokenize training data\n", __func__); + tokenize_file(lctx, + params.fn_train_data, + params.sample_start, + params.include_sample_start, + params.overlapping_samples, + n_tokens, + train_tokens, + train_samples_begin, + train_samples_size); + GGML_ASSERT(train_samples_begin.size() == train_samples_size.size()); + + printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size()); + + size_t shuffle_samples_hash = compute_samples_hash(params.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size()); + const bool changed_train_data = (shuffle_samples_hash != model.shuffle_samples_hash) || (model.shuffle_sample_count != train_samples_size.size()); + if (changed_train_data) { + printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__); } - shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); - for (int i = 0; i < (int) train_samples.size(); ++i) { - GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); + if (params.force_reshuffle) { + printf("%s: forced reshuffling of data. 
restarting with newly shuffled epoch.\n", __func__); } - + if ((model.shuffle_rng_state_current == "") || changed_train_data || params.force_reshuffle) { + model.shuffle_rng_state_current = mt19937_seed_to_state(params.seed); + model.shuffle_sample_count = train_samples_size.size(); + model.shuffle_next_sample = 0; + model.shuffle_samples_hash = shuffle_samples_hash; + } + std::vector train_shuffled_samples_begin; + std::vector train_shuffled_samples_size; + train_shuffled_samples_begin.resize(train_samples_begin.size()); + train_shuffled_samples_size.resize(train_samples_size.size()); + model.shuffle_rng_state_next = shuffle_samples( + model.shuffle_rng_state_current, + train_samples_begin.data(), + train_samples_size.data(), + train_shuffled_samples_begin.data(), + train_shuffled_samples_size.data(), + train_samples_size.size()); printf("%s: begin training\n", __func__); struct opt_callback_data opt_cb_data; @@ -1988,22 +2479,21 @@ int main(int argc, char ** argv) { opt_cb_data.last_save_iter = opt->iter; opt_cb_data.tokens_data = train_tokens.data(); opt_cb_data.tokens_size = train_tokens.size(); - opt_cb_data.samples_data = train_samples.data(); - opt_cb_data.samples_size = train_samples.size(); - opt_cb_data.shuffle_countdown = train_samples.size(); + opt_cb_data.samples_begin = train_samples_begin.data(); + opt_cb_data.samples_size = train_samples_size.data(); + opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data(); + opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); + opt_cb_data.samples_count = train_samples_size.size(); opt_cb_data.tokens_input = NULL; opt_cb_data.target_logits = NULL; opt_cb_data.target_probs = NULL; + opt_cb_data.first_iter = opt->iter; + opt_cb_data.last_time = ggml_time_ms(); + opt_cb_data.millis_per_iter = 0.0; int64_t t0 = ggml_time_ms(); for (int ex = 0; ex < params.n_examples; ++ex) { - if (ex*n_batch >= (int) train_samples.size()) { - shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); - for (int i = 0; i < (int) train_samples.size(); ++i) { - GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); - } - } struct ggml_init_params cparams = { compute_size, // mem_size From 00b656f6db43c2ede7c0a548b5a313f5dd27884d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 16 Sep 2023 15:59:38 +0200 Subject: [PATCH 192/235] remove lbfgs related train parameters --- examples/finetune/finetune.cpp | 66 +++++-------------- .../train-text-from-scratch.cpp | 37 ++--------- 2 files changed, 22 insertions(+), 81 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 6e09de776595a..623e1bcad041f 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -2081,7 +2081,6 @@ struct train_params { bool custom_n_rank_norm; bool custom_n_rank_output; - bool use_adam; bool use_flash; bool use_checkpointing; @@ -2095,7 +2094,6 @@ struct train_params { bool force_reshuffle; - // only adam int warmup; int cos_decay_steps; float cos_decay_restart; @@ -2106,7 +2104,6 @@ struct train_params { float opt_delta; int opt_max_no_improvement; - int lbfgs_n_iter; int adam_n_iter; float adam_alpha; float adam_min_alpha; @@ -2179,7 +2176,6 @@ static struct train_params get_default_train_params() { params.custom_n_rank_norm = false; params.custom_n_rank_output = false; - params.use_adam = true; params.use_flash = true; params.use_checkpointing = true; @@ -2196,14 +2192,12 @@ static struct train_params get_default_train_params() { params.opt_delta = 
1e-5f; params.opt_max_no_improvement = 0; - // only adam params.warmup = 100; params.cos_decay_steps = 1000; params.cos_decay_restart = 1.1f; params.cos_decay_min = 0.1f; params.enable_restart = false; - params.lbfgs_n_iter = 256; params.adam_n_iter = 256; params.adam_alpha = 1e-3f; params.adam_min_alpha = 0; @@ -2262,7 +2256,6 @@ static void train_print_usage(int /*argc*/, char ** argv, const struct train_par fprintf(stderr, " --no-separate-with-eos When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : ""); fprintf(stderr, " --no-separate-with-bos When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : ""); fprintf(stderr, " --force-reshuffle Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n"); - fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); fprintf(stderr, " --no-flash Don't use flash attention \n"); fprintf(stderr, " --use-flash Use flash attention (default)\n"); fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); @@ -2285,7 +2278,6 @@ static void train_print_usage(int /*argc*/, char ** argv, const struct train_par fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); - fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); fprintf(stderr, "\n"); } @@ -2524,10 +2516,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par params->separate_with_bos = false; } else if (arg == "--force-reshuffle") { params->force_reshuffle = true; - } else if (arg == "--use-lbfgs") { - params->use_adam = false; - } else if (arg == "--use-adam") { - params->use_adam = true; } else if (arg == "--no-flash") { params->use_flash = false; } else if (arg == "--use-flash") { @@ -2636,12 +2624,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par break; } params->adam_gclip = std::stof(argv[i]); - } else if (arg == "--lbfgs-iter") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->lbfgs_n_iter = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -2728,7 +2710,7 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { double remaining_millis = 0.0; if (data->millis_per_iter > 0.0) { - const int n_iter = params->use_adam ? 
params->adam_n_iter : params->lbfgs_n_iter; + const int n_iter = params->adam_n_iter; const int done_iter = opt->iter - data->first_iter; const int remaining_iter = n_iter - done_iter; remaining_millis = remaining_iter * data->millis_per_iter; @@ -2943,35 +2925,23 @@ int main(int argc, char ** argv) { lora.hparams.n_rank_output = n_rank_output; // set opt params from command line - if (params.use_adam) { - opt->params = ggml_opt_default_params(GGML_OPT_ADAM); - opt->params.print_forward_graph = false; - opt->params.print_backward_graph = false; - opt->params.n_threads = params.n_threads; - opt->params.past = params.opt_past; - opt->params.delta = params.opt_delta; - opt->params.max_no_improvement = params.opt_max_no_improvement; - opt->params.n_gradient_accumulation = params.n_gradient_accumulation; - opt->params.adam.n_iter = params.adam_n_iter; - opt->params.adam.sched = 1.0f; - opt->params.adam.alpha = params.adam_alpha; - opt->params.adam.decay = params.adam_decay; - opt->params.adam.decay_min_ndim = params.adam_decay_min_ndim; - opt->params.adam.beta1 = params.adam_beta1; - opt->params.adam.beta2 = params.adam_beta2; - opt->params.adam.gclip = params.adam_gclip; - opt->params.adam.eps_f = params.adam_eps_f; - } else { - opt->params = ggml_opt_default_params(GGML_OPT_LBFGS); - opt->params.print_forward_graph = false; - opt->params.print_backward_graph = false; - opt->params.n_threads = params.n_threads; - opt->params.past = params.opt_past; - opt->params.delta = params.opt_delta; - opt->params.max_no_improvement = params.opt_max_no_improvement; - opt->params.n_gradient_accumulation = params.n_gradient_accumulation; - opt->params.lbfgs.n_iter = params.lbfgs_n_iter; - } + opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params.print_forward_graph = false; + opt->params.print_backward_graph = false; + opt->params.n_threads = params.n_threads; + opt->params.past = params.opt_past; + opt->params.delta = params.opt_delta; + opt->params.max_no_improvement = params.opt_max_no_improvement; + opt->params.n_gradient_accumulation = params.n_gradient_accumulation; + opt->params.adam.n_iter = params.adam_n_iter; + opt->params.adam.sched = 1.0f; + opt->params.adam.alpha = params.adam_alpha; + opt->params.adam.decay = params.adam_decay; + opt->params.adam.decay_min_ndim = params.adam_decay_min_ndim; + opt->params.adam.beta1 = params.adam_beta1; + opt->params.adam.beta2 = params.adam_beta2; + opt->params.adam.gclip = params.adam_gclip; + opt->params.adam.eps_f = params.adam_eps_f; ggml_allocr * alloc = NULL; diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 8e06838ad3922..39a85967a50cc 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1620,7 +1620,6 @@ struct train_params { int print_info_interval; - bool use_adam; bool use_flash; bool use_checkpointing; bool use_alloc; @@ -1635,7 +1634,6 @@ struct train_params { bool force_reshuffle; - // only adam int warmup; int cos_decay_steps; float cos_decay_restart; @@ -1646,7 +1644,6 @@ struct train_params { float opt_delta; int opt_max_no_improvement; - int lbfgs_n_iter; int adam_n_iter; float adam_alpha; float adam_min_alpha; @@ -1693,7 +1690,6 @@ struct train_params get_default_train_params() { params.print_info_interval = 1; - params.use_adam = true; params.use_flash = true; params.use_checkpointing = true; params.use_alloc = true; @@ -1711,14 +1707,12 @@ 
struct train_params get_default_train_params() { params.opt_delta = 1e-5f; params.opt_max_no_improvement = 0; - // only adam params.warmup = 100; params.cos_decay_steps = 1000; params.cos_decay_restart = 1.1f; params.cos_decay_min = 0.1f; params.enable_restart = false; - params.lbfgs_n_iter = 256; params.adam_n_iter = 256; params.adam_alpha = 1e-3f; params.adam_min_alpha = 0; @@ -1772,8 +1766,6 @@ static void train_print_usage(int /*argc*/, char ** argv, const struct train_par fprintf(stderr, " --no-separate-with-eos When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : ""); fprintf(stderr, " --no-separate-with-bos When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : ""); fprintf(stderr, " --force-reshuffle Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n"); - fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); - fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); fprintf(stderr, " --no-flash Don't use flash attention \n"); fprintf(stderr, " --use-flash Use flash attention (default)\n"); fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); @@ -1798,7 +1790,6 @@ static void train_print_usage(int /*argc*/, char ** argv, const struct train_par fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); - fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb); @@ -1973,10 +1964,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par params->separate_with_bos = false; } else if (arg == "--force-reshuffle") { params->force_reshuffle = true; - } else if (arg == "--use-lbfgs") { - params->use_adam = false; - } else if (arg == "--use-adam") { - params->use_adam = true; } else if (arg == "--no-flash") { params->use_flash = false; } else if (arg == "--use-flash") { @@ -2089,12 +2076,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par break; } params->adam_gclip = std::stof(argv[i]); - } else if (arg == "--lbfgs-iter") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->lbfgs_n_iter = std::stoi(argv[i]); } else if (arg == "--mem-model") { if (++i >= argc) { invalid_param = true; @@ -2200,7 +2181,7 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { double remaining_millis = 0.0; if (data->millis_per_iter > 0.0) { - const int n_iter = params->use_adam ? 
params->adam_n_iter : params->lbfgs_n_iter; + const int n_iter = params->adam_n_iter; const int done_iter = opt->iter - data->first_iter; const int remaining_iter = n_iter - done_iter; remaining_millis = remaining_iter * data->millis_per_iter; @@ -2364,7 +2345,6 @@ int main(int argc, char ** argv) { memset(opt, 0, sizeof(struct ggml_opt_context)); struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); - struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); opt_params_adam.print_forward_graph = false; opt_params_adam.print_backward_graph = false; opt_params_adam.n_threads = params.n_threads; @@ -2382,17 +2362,8 @@ int main(int argc, char ** argv) { opt_params_adam.adam.gclip = params.adam_gclip; opt_params_adam.adam.eps_f = params.adam_eps_f; - opt_params_lbfgs.print_forward_graph = false; - opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = params.n_threads; - opt_params_lbfgs.past = params.opt_past; - opt_params_lbfgs.delta = params.opt_delta; - opt_params_lbfgs.max_no_improvement = params.opt_max_no_improvement; - opt_params_lbfgs.n_gradient_accumulation = params.n_gradient_accumulation; - opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; - opt->ctx = model.ctx; - opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; + opt->params = opt_params_adam; printf("%s: init model\n", __func__); bool existed = load_checkpoint_file(params.fn_checkpoint_in, &model, opt); @@ -2401,7 +2372,7 @@ int main(int argc, char ** argv) { } set_param_model(&model); - opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; + opt->params = opt_params_adam; opt->iter = model.train_its; printf("%s: opt iter %d\n", __func__, opt->iter); @@ -2563,7 +2534,7 @@ int main(int argc, char ** argv) { size_t used_mem_after_opt = ggml_used_mem(ctx0); - int n_iter = params.use_adam ? 
params.adam_n_iter : params.lbfgs_n_iter; + int n_iter = params.adam_n_iter; model.train_its = opt->iter; model.train_samples += n_batch * n_iter; model.train_tokens += n_batch * n_tokens * n_iter; From 9f4b1bf88d3e077441d8e249f2bc04f88db917b3 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 16 Sep 2023 14:58:34 +0200 Subject: [PATCH 193/235] move common train functions into common/train.[h|cpp] --- Makefile | 9 +- common/CMakeLists.txt | 2 + common/train.cpp | 914 ++++++++++++++ common/train.h | 113 ++ examples/baby-llama/baby-llama.cpp | 176 +-- examples/finetune/finetune.cpp | 1105 +++-------------- .../train-text-from-scratch.cpp | 969 +-------------- 7 files changed, 1287 insertions(+), 2001 deletions(-) create mode 100644 common/train.cpp create mode 100644 common/train.h diff --git a/Makefile b/Makefile index 63557d93e7cc4..f41ebdc6910bb 100644 --- a/Makefile +++ b/Makefile @@ -485,6 +485,9 @@ console.o: common/console.cpp common/console.h grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h $(CXX) $(CXXFLAGS) -c $< -o $@ +train.o: common/train.cpp common/train.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) @@ -532,7 +535,7 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS) +train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) @@ -541,13 +544,13 @@ convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggm llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS) +baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o $(OBJS) +finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index dead56118bac8..951aa8340c7e4 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -9,6 +9,8 @@ add_library(${TARGET} OBJECT console.cpp grammar-parser.h grammar-parser.cpp + train.h + train.cpp ) if (BUILD_SHARED_LIBS) diff --git a/common/train.cpp b/common/train.cpp new file mode 100644 index 0000000000000..a1e35e5a3d91d --- /dev/null +++ b/common/train.cpp @@ -0,0 +1,914 @@ +#include "train.h" +#include "common.h" + +#include +#include +#include + +struct random_normal_distribution { + std::mt19937 gen; + std::normal_distribution rd; + float min; + float max; +}; + +struct random_uniform_distribution { + std::mt19937 gen; + 
std::uniform_real_distribution rd; +}; + +struct random_normal_distribution * init_random_normal_distribution(int seed, float mean, float std, float min, float max) { + struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution)); + rnd->gen = std::mt19937(seed); + rnd->rd = std::normal_distribution{mean, std}; + rnd->min = min; + rnd->max = max; + return rnd; +} + +struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max) { + struct random_uniform_distribution * rnd = (struct random_uniform_distribution *) malloc(sizeof(struct random_uniform_distribution)); + rnd->gen = std::mt19937(seed); + rnd->rd = std::uniform_real_distribution{min, max}; + return rnd; +} + +void free_random_normal_distribution (struct random_normal_distribution * rnd) { + free(rnd); +} + +void free_random_uniform_distribution(struct random_uniform_distribution * rnd) { + free(rnd); +} + +struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { + float scale = 1.0f; // xavier + switch (tensor->n_dims) { + case 1: + scale /= sqrtf((float) tensor->ne[0]); + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = scale * frand_normal(rnd); + } + break; + case 2: + scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]); + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = scale * frand_normal(rnd); + } + } + break; + case 3: + scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]); + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = scale * frand_normal(rnd); + } + } + } + break; + case 4: + scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]); + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = scale * frand_normal(rnd); + } + } + } + } + break; + default: + GGML_ASSERT(!"Unsupported tensor->n_dims"); + }; + return tensor; +} + +struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { + switch (tensor->n_dims) { + case 1: + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = frand_uniform(rnd); + } + break; + case 2: + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = frand_uniform(rnd); + } + } + break; + case 3: + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = frand_uniform(rnd); + } + } + } + break; + case 4: + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for 
(int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = frand_uniform(rnd); + } + } + } + } + break; + default: + GGML_ASSERT(!"Unsupported tensor->n_dims"); + }; + return tensor; +} + +float frand() { + return (float)rand()/(float)RAND_MAX; +} + +float frand_normal(struct random_normal_distribution * rnd) { + return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); +} + +float frand_uniform(struct random_uniform_distribution * rnd) { + return rnd->rd(rnd->gen); +} + +int clamp(const int v, const int min, const int max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +float fclamp(const float v, const float min, const float max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { + GGML_ASSERT(tensor->n_dims == 1); + GGML_ASSERT(tensor->ne[0] == ne0); +} + +void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { + GGML_ASSERT(tensor->n_dims == 2); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); +} + +void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { + GGML_ASSERT(tensor->n_dims == 3); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); +} + +void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { + GGML_ASSERT(tensor->n_dims == 4); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == ne3); +} + +int64_t get_example_targets_batch( + struct llama_context * lctx, + struct ggml_tensor * tokens_input, + struct ggml_tensor * target_probs, + int64_t example_id, + const size_t * samples_begin, + const size_t * samples_size, + size_t samples_count, + const llama_token * train_data, + size_t n_train_data, + bool separate_with_eos, + bool separate_with_bos, + bool fill_with_next_samples) { + + GGML_ASSERT(tokens_input->n_dims == 2); + GGML_ASSERT(target_probs->n_dims == 3); + int64_t n_vocab = target_probs->ne[0]; + int64_t n_tokens = tokens_input->ne[0]; + int64_t n_batch = tokens_input->ne[1]; + GGML_ASSERT(n_vocab == target_probs->ne[0]); + GGML_ASSERT(n_tokens == target_probs->ne[1]); + GGML_ASSERT(n_batch == target_probs->ne[2]); + + int64_t used_samples = 0; + + ggml_set_f32(target_probs, 0.0f); + llama_token bos = llama_token_bos(lctx); + llama_token eos = llama_token_eos(lctx); + // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); + for (int k=0; k= sample_size && fill_with_next_samples) { + if (!sample_separation_eos) { + // insert eos token to separate samples + sample_separation_eos = true; + } else if (!sample_separation_bos) { + // insert bos token to separate samples + sample_separation_bos = true; + token = bos; + } else { + // sample separation is done, continue with next sample + sample_separation_eos = !separate_with_eos; + sample_separation_bos = !separate_with_bos; + sample_offs = 0; + sample_idx = (example_id + used_samples) % samples_count; + sample_begin = samples_begin[sample_idx]; + sample_size = samples_size[sample_idx]; + ++used_samples; + } + } + // note: no else-if here + if (sample_offs < sample_size) { + token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 
1)); + ++sample_offs; + } + ggml_set_f32_nd(target_probs, token, (int) i, (int) k, 0, +1.0f); + if (i+1> rng; +} + +std::string mt19937_get_state(const std::mt19937& rng) { + std::stringstream s_rng_state; + s_rng_state.imbue(std::locale::classic()); + s_rng_state << rng; + return s_rng_state.str(); +} + +std::string mt19937_seed_to_state(unsigned seed) { + std::mt19937 rng(seed); + return mt19937_get_state(rng); +} + +std::string shuffle_samples( + const std::string & rng_state, + size_t * shuffled_begins, + size_t * shuffled_sizes, + const size_t * begins, + const size_t * sizes, + size_t count) { + if (count == 0) return rng_state; + + std::mt19937 rng; + mt19937_set_state(rng, rng_state); + + // sort indices by random value for each index + std::vector idcs; + { + std::vector rnd; + idcs.resize(count); + rnd.resize(count); + for (unsigned i=0; i h_string; + std::hash h_ull; + size_t h = h_string(std::string(fn)); + h = hash_combine(h, h_ull((unsigned long long) sample_count)); + for (size_t i=0; i< sample_count; ++i) { + h = hash_combine(h, h_ull((unsigned long long) samples_begin[i])); + h = hash_combine(h, h_ull((unsigned long long) samples_size[i])); + } + return h; +} + +std::string replace_str(const char * s, const char * needle, const char * replacement) { + std::string str = s; + size_t pos = str.find(needle); + if (pos != std::string::npos) { + str.replace(pos, strlen(needle), replacement); + } + return str; +} + +void print_duration(double fmillis) { + if (fmillis < 1000.0f) { + printf("%.1fms", (float) fmillis); + return; + } + const int64_t one_sec = 1000; + const int64_t one_min = one_sec * 60; + const int64_t one_hour = one_min * 60; + const int64_t one_day = one_hour * 24; + + int64_t millis = (int64_t) fmillis; + int64_t days = millis/one_day; + int64_t hours = (millis - days*one_day)/one_hour; + int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min; + int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec; + + // to print int64_t either cast to (long long int) or use macro PRId64 from + if (days > 0) { + printf("%lldd ", (long long int) days); + } + printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds); +} + +float cosine_decay(int64_t step, int64_t decay_steps, float minimum) { + if (step > decay_steps) { + step = decay_steps; + } + const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); + const float decay = (1 - minimum)*cosine_decay + minimum; + return decay; +} + +float cosine_decay_restart(int64_t step, int64_t decay_steps, float minimum, float restart_step_mult) { + while (step > decay_steps) { + step -= decay_steps; + decay_steps = (int64_t) (restart_step_mult * decay_steps); + } + return cosine_decay(step, decay_steps, minimum); +} + +float learning_schedule( + int64_t step, + int64_t warmup_steps, + int64_t cos_decay_steps, + float learning_rate, + float overall_minimum, + float cos_decay_minimum, + float cos_decay_restart_step_mult, + bool enable_restart) { + + float result = + (step < warmup_steps) + ? (float) step / (float) warmup_steps + : enable_restart + ? 
cosine_decay_restart( + step - warmup_steps, + cos_decay_steps, + cos_decay_minimum, + cos_decay_restart_step_mult) + : cosine_decay( + step, + cos_decay_steps, + cos_decay_minimum); + + float min = overall_minimum / learning_rate; + result = min + result * (1.0f - min); + return result; +} + +static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { + GGML_ASSERT(a != NULL); + GGML_ASSERT(b != NULL); + GGML_ASSERT(a->type == b->type); + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); + + return true; +} + +void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { + if (dst == NULL) { + return; + } + struct ggml_tensor * t = ggml_get_tensor(ctx, name); + GGML_ASSERT(are_same_layout(dst, t)); + memcpy(dst->data, t->data, ggml_nbytes(t)); + + if (strlen(ggml_get_name(dst)) == 0) { + ggml_set_name(dst, name); + } +} + +// gguf constants +static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; +static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; +static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; +static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; +static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; +static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; +static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; +static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; +static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; +static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; +static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; +static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; +static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; +static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; +static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; + +static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; +static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; +static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; + +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = 
"optimizer.lbfgs.memory_alpha"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; +static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; + +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + die_fmt("key not found in model: %s", skey.c_str()); \ + } \ +} + +void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { + // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read + + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); + GGML_ASSERT(file_version == 0); + + GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); + GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); + GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); + + uint64_t nx; + GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); + opt->nx = (size_t) nx; + + // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know + + std::string opt_type; + GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); + if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { + opt->params.type = GGML_OPT_ADAM; + + GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); + GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); + + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + copy_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { + opt->params.type = GGML_OPT_LBFGS; + + GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); + GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); + GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); + GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); + GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); + GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, 
GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); + + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + copy_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + copy_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + copy_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + copy_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + copy_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + copy_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + copy_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); + copy_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); + copy_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); + copy_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); + } else { + throw std::runtime_error("unknown optimizer type\n"); + } +} + +void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); + gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); + gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); + + ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + if (opt->adam.pf) { + ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } + + gguf_add_tensor(fctx, opt->adam.m); + gguf_add_tensor(fctx, opt->adam.v); + if (opt->adam.pf) { + gguf_add_tensor(fctx, opt->adam.pf); + } + } break; + case GGML_OPT_LBFGS: + { + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k); + gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement); + + ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + if (opt->lbfgs.pf) { + ggml_set_name(opt->lbfgs.pf, 
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
+ }
+ ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
+ ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
+ ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
+ ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
+
+ gguf_add_tensor(fctx, opt->lbfgs.x);
+ gguf_add_tensor(fctx, opt->lbfgs.xp);
+ gguf_add_tensor(fctx, opt->lbfgs.g);
+ gguf_add_tensor(fctx, opt->lbfgs.gp);
+ gguf_add_tensor(fctx, opt->lbfgs.d);
+ if (opt->lbfgs.pf) {
+ gguf_add_tensor(fctx, opt->lbfgs.pf);
+ }
+ gguf_add_tensor(fctx, opt->lbfgs.lmal);
+ gguf_add_tensor(fctx, opt->lbfgs.lmys);
+ gguf_add_tensor(fctx, opt->lbfgs.lms);
+ gguf_add_tensor(fctx, opt->lbfgs.lmy);
+ } break;
+ }
+}
+
+struct llama_file {
+ // use FILE * so we don't have to re-open the file to mmap
+ FILE * fp;
+ size_t size;
+
+ llama_file(const char * fname, const char * mode) {
+ fp = std::fopen(fname, mode);
+ if (fp == NULL) {
+ size = 0;
+ } else {
+ seek(0, SEEK_END);
+ size = tell();
+ seek(0, SEEK_SET);
+ }
+ }
+
+ size_t tell() const {
+#ifdef _WIN32
+ __int64 ret = _ftelli64(fp);
+#else
+ long ret = std::ftell(fp);
+#endif
+ GGML_ASSERT(ret != -1); // this really shouldn't fail
+ return (size_t) ret;
+ }
+
+ void seek(size_t offset, int whence) {
+#ifdef _WIN32
+ int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+ int ret = std::fseek(fp, (long) offset, whence);
+#endif
+ GGML_ASSERT(ret == 0); // same
+ }
+
+ void read_raw(void * ptr, size_t size) {
+ if (size == 0) {
+ return;
+ }
+ errno = 0;
+ std::size_t ret = std::fread(ptr, size, 1, fp);
+ if (ferror(fp)) {
+ die_fmt("read error: %s", strerror(errno));
+ }
+ if (ret != 1) {
+ die_fmt("unexpectedly reached end of file");
+ }
+ }
+
+ std::uint32_t read_u32() {
+ std::uint32_t ret;
+ read_raw(&ret, sizeof(ret));
+ return ret;
+ }
+
+ std::string read_string(std::uint32_t len) {
+ std::vector<char> chars(len);
+ read_raw(chars.data(), len);
+ return std::string(chars.data(), len);
+ }
+
+ void write_raw(const void * ptr, size_t size) {
+ if (size == 0) {
+ return;
+ }
+ errno = 0;
+ size_t ret = std::fwrite(ptr, size, 1, fp);
+ if (ret != 1) {
+ die_fmt("write error: %s", strerror(errno));
+ }
+ }
+
+ void write_u32(std::uint32_t val) {
+ write_raw(&val, sizeof(val));
+ }
+
+ ~llama_file() {
+ if (fp) {
+ std::fclose(fp);
+ }
+ }
+};
+
+static size_t utf8_len(char src) {
+ const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+ uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+ return lookup[highbits];
+}
+
+// mark each byte with its utf8 unit number.
+// returns the number of utf8 characters.
+// e.g. when bytes == '\x61\xD0\xB0\x62',
+// then utf8_units will become [0,0,1,0]
+// utf8_nunits will become [1,2,2,1] and 3 is returned.
+// bytes where utf8_units is zero are the begin of an utf8 character.
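+// a sketch of the typical use (as done in tokenize_file below): snap a byte
+// offset 'end' forward to the next character boundary via
+//   if (utf8_units[end] > 0) { end += utf8_nunits[end] - utf8_units[end]; }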
+static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) {
+ size_t offs = 0;
+ size_t count_utf8 = 0;
+ while(offs < count) {
+ int len = (int) utf8_len(bytes[offs]);
+ for (int i=0; i<len; ++i) {
+ utf8_units [offs+i] = i;
+ utf8_nunits[offs+i] = len;
+ }
+ offs += len;
+ ++count_utf8;
+ }
+ return count_utf8;
+}
+
+size_t tokenize_file(
+ struct llama_context * lctx,
+ const char * filename,
+ const std::string & sample_start,
+ bool include_sample_start,
+ bool overlapping_samples,
+ unsigned context_length,
+ std::vector<llama_token> & out_tokens,
+ std::vector<size_t> & out_samples_begin,
+ std::vector<size_t> & out_samples_size) {
+ struct llama_file f(filename, "rb");
+
+ if (f.size == 0) {
+ out_tokens.clear();
+ out_samples_begin.clear();
+ out_samples_size.clear();
+ printf("%s: warning: empty or not existing training data file '%s'\n",
+ __func__, filename);
+ return out_tokens.size();
+ }
+
+ // account for possible leading whitespace that will be added by tokenizer
+ // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
+ const int n_max_tokens_overhead = 1;
+
+ std::vector<char> buf;
+ buf.resize(f.size+1);
+
+ f.read_raw(buf.data(), f.size);
+ buf[f.size] = '\0';
+
+ std::vector<int> utf8_units;
+ std::vector<int> utf8_nunits;
+ utf8_units.resize(buf.size());
+ utf8_nunits.resize(buf.size());
+ size_t n_utf8_chars = mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size());
+
+ if (sample_start.size() == 0) {
+ // tokenize all data at once
+ out_tokens.resize(buf.size() + n_max_tokens_overhead);
+
+ int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), (int) out_tokens.size(), false);
+ if (n_tokens < 0) {
+ out_tokens.resize(-n_tokens);
+ n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), (int) out_tokens.size(), false);
+ }
+ if (n_tokens >= 0) {
+ out_tokens.resize(n_tokens);
+ }
+
+ // generate sample starts at all token positions
+ out_samples_begin.clear();
+ out_samples_begin.push_back(0);
+ out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
+ size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0;
+ for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) {
+ out_samples_begin.push_back(sample_begin);
+ out_samples_size.push_back(context_length);
+ }
+ } else {
+ // split data into samples and tokenize each sample
+ std::string data_str(buf.data(), buf.size()-1);
+ out_samples_begin.clear();
+ out_samples_size.clear();
+ out_tokens.clear();
+
+ // find all positions of pattern sample_start
+ size_t sample_begin = data_str.find(sample_start, 0);
+ while (sample_begin != std::string::npos) {
+ out_samples_begin.push_back(sample_begin);
+ const size_t search_start = sample_begin + sample_start.size();
+ sample_begin = data_str.find(sample_start, search_start);
+ }
+ if (out_samples_begin.size() == 0) {
+ printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n",
+ __func__, sample_start.c_str());
+ out_samples_begin.push_back(0);
+ }
+
+ out_samples_size.resize(out_samples_begin.size(), 0);
+
+ std::vector<char> buf_sample;
+ std::vector<llama_token> tok_sample;
+
+ const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size());
+ size_t found_too_big_sample = 0;
+ size_t found_too_small_sample = 0;
+ size_t found_empty_sample = 0;
+ size_t found_min_sample_size = SIZE_MAX;
+ size_t found_max_sample_size = 0;
+
+ size_t max_token_text_size = 0;
+ int n_vocab = llama_n_vocab(lctx);
+ for (llama_token token=0; token < n_vocab; ++token) {
+ max_token_text_size = std::max(
+ max_token_text_size,
+ strlen(llama_token_get_text(lctx, token)));
+ }
+
+ // upper bound of context byte length.
+ // strings with this byte length should always tokenize to at least context_length tokens.
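+ // (each token text covers at most max_token_text_size bytes, so, for
+ // example, with a longest token text of 16 bytes and context_length 512,
+ // any 8192 byte window must tokenize to at least 512 tokens)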
+ size_t context_byte_len = max_token_text_size*context_length;
+
+ for (unsigned i=0; i<out_samples_begin.size(); ++i) {
+ // determine sample begin and end from pattern positions
+ size_t sample_begin = out_samples_begin[i] + sample_begin_offset;
+ size_t sample_end = overlapping_samples
+ ? sample_begin + context_byte_len
+ : (i+1 < out_samples_begin.size())
+ ? out_samples_begin[i+1]
+ : data_str.size();
+ if (sample_end < utf8_units.size() && utf8_units[sample_end] > 0) {
+ // sample end is in the middle of an utf8 character.
+ // advance sample_end to the begin of the next utf8 character.
+ sample_end += utf8_nunits[sample_end] - utf8_units[sample_end];
+ }
+ size_t sample_size = sample_end - sample_begin;
+ if (sample_size == 0) {
+ ++found_empty_sample;
+ }
+
+ if (sample_size > 0) {
+ // llama_tokenize expects zero terminated string,
+ // copy sample into buffer and zero terminate it.
+ buf_sample.resize(sample_size+1);
+ memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
+ buf_sample[sample_size] = '\0';
+
+ // printf("sample: '%s'\n", buf_sample.data());
+
+ // tokenize the sample
+ tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
+ int n_tokens = llama_tokenize(lctx,
+ buf_sample.data(),
+ tok_sample.data(),
+ (int) tok_sample.size(), false);
+ if (n_tokens < 0) {
+ tok_sample.resize(-n_tokens);
+ n_tokens = llama_tokenize(lctx,
+ buf_sample.data(),
+ tok_sample.data(),
+ (int) tok_sample.size(), false);
+ GGML_ASSERT(n_tokens >= 0);
+ }
+ GGML_ASSERT(n_tokens <= (int) tok_sample.size());
+
+ if ((size_t) n_tokens > context_length) {
+ ++found_too_big_sample;
+ } else if ((size_t) n_tokens < context_length) {
+ ++found_too_small_sample;
+ }
+ found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens);
+ found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens);
+
+ // write out tokens, start and size of sample
+ // overwrite the string start position with the token start position
+ out_samples_begin[i] = out_tokens.size();
+ out_samples_size[i] = (size_t) n_tokens;
+ out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens);
+ } else {
+ out_samples_begin[i] = out_tokens.size();
+ out_samples_size[i] = 0;
+ }
+
+ }
+ if (found_too_big_sample > 0) {
+ printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. 
samples will be cut off.\n",
+ __func__, found_too_big_sample, found_max_sample_size, context_length);
+ }
+
+ if (found_too_small_sample > 0) {
+ printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n",
+ __func__, found_too_small_sample, found_min_sample_size, context_length);
+ }
+
+ if (found_empty_sample) {
+ printf("%s: warning: found %zu empty samples.\n",
+ __func__, found_empty_sample);
+ }
+ }
+ printf("%s: total number of samples: %zu\n",
+ __func__, out_samples_begin.size());
+
+ GGML_ASSERT(out_samples_begin.size() == out_samples_size.size());
+
+ return out_tokens.size();
+}
diff --git a/common/train.h b/common/train.h
new file mode 100644
index 0000000000000..9d629beb7095b
--- /dev/null
+++ b/common/train.h
@@ -0,0 +1,113 @@
+// Various helper functions and utilities for training
+
+#pragma once
+
+#include <random>
+#include <string>
+#include <vector>
+
+#include "ggml.h"
+#include "llama.h"
+
+struct random_normal_distribution;
+struct random_uniform_distribution;
+
+struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
+struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
+
+void free_random_normal_distribution (struct random_normal_distribution * rnd);
+void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
+
+struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
+struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
+
+float frand();
+float frand_normal (struct random_normal_distribution * rnd);
+float frand_uniform(struct random_uniform_distribution * rnd);
+
+int clamp (const int v, const int min, const int max);
+float fclamp(const float v, const float min, const float max);
+
+void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
+void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
+void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
+void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
+
+size_t tokenize_file(
+ struct llama_context * lctx,
+ const char * filename,
+ const std::string & sample_start,
+ bool include_sample_start,
+ bool overlapping_samples,
+ unsigned context_length,
+ std::vector<llama_token> & out_tokens,
+ std::vector<size_t> & out_samples_begin,
+ std::vector<size_t> & out_samples_size);
+
+int64_t get_example_targets_batch(
+ struct llama_context * lctx,
+ struct ggml_tensor * tokens_input,
+ struct ggml_tensor * target_probs,
+ int64_t example_id,
+ const size_t * samples_begin,
+ const size_t * samples_size,
+ size_t samples_count,
+ const llama_token * train_data,
+ size_t n_train_data,
+ bool separate_with_eos,
+ bool separate_with_bos,
+ bool fill_with_next_samples);
+
+typedef std::string mt19937_state;
+
+void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
+mt19937_state mt19937_get_state(const std::mt19937& rng);
+mt19937_state mt19937_seed_to_state(unsigned seed);
+
+mt19937_state shuffle_samples(
+ const mt19937_state & rng_state,
+ size_t * shuffled_begins,
+ size_t * shuffled_sizes,
+ const size_t * begins,
+ const size_t * sizes,
+ size_t count);
+
+size_t hash_combine(size_t h1, size_t h2);
+
+size_t compute_samples_hash(
+ const char* fn,
+ const size_t* samples_begin,
+ const size_t* samples_size,
+ size_t sample_count);
+
+
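+// usage sketch (hypothetical, not part of this header): because the rng state
+// round-trips through a plain string, a shuffle can be made reproducible
+// across checkpoint save/load, e.g.:
+//   mt19937_state state = mt19937_seed_to_state(seed);
+//   state = shuffle_samples(state, shuf_begins.data(), shuf_sizes.data(),
+//                           begins.data(), sizes.data(), count);
+// persisting 'state' and calling shuffle_samples again after reload continues
+// the same deterministic sequence.
+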
+std::string replace_str(const char * s, const char * needle, const char * replacement);
+
+void print_duration(double milliseconds);
+
+float cosine_decay(
+ int64_t step,
+ int64_t decay_steps,
+ float minimum);
+
+float cosine_decay_restart(
+ int64_t step,
+ int64_t decay_steps,
+ float minimum,
+ float restart_step_mult);
+
+float learning_schedule(
+ int64_t step,
+ int64_t warmup_steps,
+ int64_t decay_steps,
+ float learning_rate,
+ float overall_minimum,
+ float cos_decay_minimum,
+ float cos_decay_restart_step_mult,
+ bool enable_restart);
+
+void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
+
+void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
+void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
+
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index a99ece9a66fd1..e7a8e457788be 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "train.h"
 #include 
 #include 
 #include 
@@ -14,29 +15,6 @@
 static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 static const float rms_norm_eps = 5e-6f;
 #endif
-float frand() {
- return (float)rand()/(float)RAND_MAX;
-}
-
-struct random_normal_distribution {
- std::mt19937 gen;
- std::normal_distribution<float> nd;
- float min;
- float max;
-};
-
-void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
- rnd->gen = std::mt19937(seed);
- rnd->nd = std::normal_distribution<float>{mean, std};
- rnd->min = min;
- rnd->max = max;
-}
-
-float frand_normal(struct random_normal_distribution * rnd) {
- const float r = rnd->nd(rnd->gen);
- return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
-}
-
 void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
 struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 
@@ -95,56 +73,6 @@ struct ggml_tensor * randomize_tensor(
 return tensor;
 }
-struct ggml_tensor * randomize_tensor_normal(
- struct ggml_tensor * tensor,
- int ndims,
- const int64_t ne[],
- struct random_normal_distribution * rnd) {
- float scale = 1.0; // xavier
- switch (ndims) {
- case 1:
- scale /= sqrtf(ne[0]);
- for (int i0 = 0; i0 < ne[0]; i0++) {
- ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
- }
- break;
- case 2:
- scale /= sqrtf(ne[0]+ne[1]);
- for (int i1 = 0; i1 < ne[1]; i1++) {
- for (int i0 = 0; i0 < ne[0]; i0++) {
- ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
- }
- }
- break;
- case 3:
- scale /= sqrtf(ne[0]+ne[1]);
- for (int i2 = 0; i2 < ne[2]; i2++) {
- for (int i1 = 0; i1 < ne[1]; i1++) {
- for (int i0 = 0; i0 < ne[0]; i0++) {
- ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
- }
- }
- }
- break;
- case 4:
- scale /= sqrtf(ne[0]+ne[1]);
- for (int i3 = 0; i3 < ne[3]; i3++) {
- for (int i2 = 0; i2 < ne[2]; i2++) {
- for (int i1 = 0; i1 < ne[1]; i1++) {
- for (int i0 = 0; i0 < ne[0]; i0++) {
- ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
- }
- }
- }
- }
- break;
- default:
- assert(false);
- };
-
- return tensor;
-}
-
 struct llama_hparams {
 uint32_t n_vocab = 32000;
 uint32_t n_ctx = 512; // this is provided as user input? 
@@ -402,27 +330,29 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std const uint32_t n_layer = hparams.n_layer; - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); - randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd); - randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd); - randomize_tensor_normal(model->output, model->output->n_dims, model->output->ne, &rnd); + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings , rnd); + randomize_tensor_normal(model->norm , rnd); + randomize_tensor_normal(model->output , rnd); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd); + randomize_tensor_normal(layer.attention_norm, rnd); - randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd); - randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd); - randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd); - randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd); + randomize_tensor_normal(layer.wq, rnd); + randomize_tensor_normal(layer.wk, rnd); + randomize_tensor_normal(layer.wv, rnd); + randomize_tensor_normal(layer.wo, rnd); - randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd); + randomize_tensor_normal(layer.ffn_norm, rnd); - randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd); - randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd); - randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd); + randomize_tensor_normal(layer.w1, rnd); + randomize_tensor_normal(layer.w2, rnd); + randomize_tensor_normal(layer.w3, rnd); } + + free_random_normal_distribution(rnd); } @@ -431,32 +361,34 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, const uint32_t n_layer = hparams.n_layer; - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); - randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd); - randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd); - randomize_tensor_normal(model->outputa, model->outputa->n_dims, model->outputa->ne, &rnd); - randomize_tensor_normal(model->outputb, model->outputb->n_dims, model->outputb->ne, &rnd); + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings, rnd); + randomize_tensor_normal(model->norm , rnd); + randomize_tensor_normal(model->outputa , rnd); + randomize_tensor_normal(model->outputb , rnd); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd); - - randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd); - randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd); - randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd); - randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd); - randomize_tensor_normal(layer.wva, layer.wva->n_dims, 
layer.wva->ne, &rnd);
- randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
- randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
- randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);
-
- randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
-
- randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
- randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
- randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+ randomize_tensor_normal(layer.attention_norm, rnd);
+
+ randomize_tensor_normal(layer.wqa, rnd);
+ randomize_tensor_normal(layer.wqb, rnd);
+ randomize_tensor_normal(layer.wka, rnd);
+ randomize_tensor_normal(layer.wkb, rnd);
+ randomize_tensor_normal(layer.wva, rnd);
+ randomize_tensor_normal(layer.wvb, rnd);
+ randomize_tensor_normal(layer.woa, rnd);
+ randomize_tensor_normal(layer.wob, rnd);
+
+ randomize_tensor_normal(layer.ffn_norm, rnd);
+
+ randomize_tensor_normal(layer.w1, rnd);
+ randomize_tensor_normal(layer.w2, rnd);
+ randomize_tensor_normal(layer.w3, rnd);
 }
+
+ free_random_normal_distribution(rnd);
 }
 
 bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
@@ -756,32 +688,6 @@ struct ggml_tensor * forward(
 return inpL;
 }
-void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
- GGML_ASSERT(tensor->n_dims == 1);
- GGML_ASSERT(tensor->ne[0] == ne0);
-}
-
-void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
- GGML_ASSERT(tensor->n_dims == 2);
- GGML_ASSERT(tensor->ne[0] == ne0);
- GGML_ASSERT(tensor->ne[1] == ne1);
-}
-
-void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
- GGML_ASSERT(tensor->n_dims == 3);
- GGML_ASSERT(tensor->ne[0] == ne0);
- GGML_ASSERT(tensor->ne[1] == ne1);
- GGML_ASSERT(tensor->ne[2] == ne2);
-}
-
-void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
- GGML_ASSERT(tensor->n_dims == 4);
- GGML_ASSERT(tensor->ne[0] == ne0);
- GGML_ASSERT(tensor->ne[1] == ne1);
- GGML_ASSERT(tensor->ne[2] == ne2);
- GGML_ASSERT(tensor->ne[3] == ne3);
-}
-
 struct ggml_tensor * forward_batch(
 struct llama_model * model,
 struct llama_kv_cache * cache,
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 623e1bcad041f..ce6f28bad3e0e 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -2,6 +2,7 @@
 #include "ggml-alloc.h"
 #include "llama.h"
 #include "common.h"
+#include "train.h"
 #include 
 #include 
 #include 
@@ -13,7 +14,6 @@
 #include 
 #include 
 #include 
-#include 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -21,143 +21,6 @@
 static const size_t tensor_alignment = 32;
-struct random_normal_distribution {
- std::mt19937 gen;
- std::normal_distribution<float> rd;
- float min;
- float max;
-};
-
-struct random_uniform_distribution {
- std::mt19937 gen;
- std::uniform_real_distribution<float> rd;
-};
-
-static void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
- rnd->gen = std::mt19937(seed);
- rnd->rd = std::normal_distribution<float>{mean, std};
- rnd->min = min;
- rnd->max = max;
-}
-
-static void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) {
- rnd->gen = std::mt19937(seed);
- rnd->rd = std::uniform_real_distribution<float>{min, max};
-}
- 
-static int clamp(const int v, const int min, const int max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -static float fclamp(const float v, const float min, const float max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -static float frand() { - return (float)rand()/(float)RAND_MAX; -} - -static float frand_normal(struct random_normal_distribution * rnd) { - return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); -} - -static float frand_uniform(struct random_uniform_distribution * rnd) { - return rnd->rd(rnd->gen); -} - -static struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { - float scale = 1.0f; // xavier - switch (tensor->n_dims) { - case 1: - scale /= sqrtf(tensor->ne[0]); - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = scale * frand_normal(rnd); - } - break; - case 2: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = scale * frand_normal(rnd); - } - } - break; - case 3: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = scale * frand_normal(rnd); - } - } - } - break; - case 4: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = scale * frand_normal(rnd); - } - } - } - } - break; - default: - assert(false); - }; - return tensor; -} - -static struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { - switch (tensor->n_dims) { - case 1: - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = frand_uniform(rnd); - } - break; - case 2: - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = frand_uniform(rnd); - } - } - break; - case 3: - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = frand_uniform(rnd); - } - } - } - break; - case 4: - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = frand_uniform(rnd); - } - } - } - } - break; - default: - assert(false); - }; - return tensor; -} - struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; @@ -299,40 +162,6 @@ struct my_llama_lora { }; // gguf constants -static const 
char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; -static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; -static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; -static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; -static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; -static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; -static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; -static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; -static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; -static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; -static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; -static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; -static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; -static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; - -static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; -static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; -static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; - -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; - static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; static const char * LLM_KV_TRAINING_TYPE = "training.type"; @@ -719,66 +548,41 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { const uint32_t n_layer = lora->layers.size(); - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); - 
randomize_tensor_normal(lora->tok_embeddings_a, &rnd); - randomize_tensor_normal(lora->tok_embeddings_b, &rnd); - randomize_tensor_normal(lora->norm_a, &rnd); - randomize_tensor_normal(lora->norm_b, &rnd); - randomize_tensor_normal(lora->output_a, &rnd); - randomize_tensor_normal(lora->output_b, &rnd); + randomize_tensor_normal(lora->tok_embeddings_a, rnd); + randomize_tensor_normal(lora->tok_embeddings_b, rnd); + randomize_tensor_normal(lora->norm_a, rnd); + randomize_tensor_normal(lora->norm_b, rnd); + randomize_tensor_normal(lora->output_a, rnd); + randomize_tensor_normal(lora->output_b, rnd); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = lora->layers[i]; - randomize_tensor_normal(layer.attention_norm_a, &rnd); - randomize_tensor_normal(layer.attention_norm_b, &rnd); - - randomize_tensor_normal(layer.wq_a, &rnd); - randomize_tensor_normal(layer.wq_b, &rnd); - randomize_tensor_normal(layer.wk_a, &rnd); - randomize_tensor_normal(layer.wk_b, &rnd); - randomize_tensor_normal(layer.wv_a, &rnd); - randomize_tensor_normal(layer.wv_b, &rnd); - randomize_tensor_normal(layer.wo_a, &rnd); - randomize_tensor_normal(layer.wo_b, &rnd); - - randomize_tensor_normal(layer.ffn_norm_a, &rnd); - randomize_tensor_normal(layer.ffn_norm_b, &rnd); - - randomize_tensor_normal(layer.w1_a, &rnd); - randomize_tensor_normal(layer.w1_b, &rnd); - randomize_tensor_normal(layer.w2_a, &rnd); - randomize_tensor_normal(layer.w2_b, &rnd); - randomize_tensor_normal(layer.w3_a, &rnd); - randomize_tensor_normal(layer.w3_b, &rnd); - } -} - -static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { - GGML_ASSERT(tensor->n_dims == 1); - GGML_ASSERT(tensor->ne[0] == ne0); -} - -static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { - GGML_ASSERT(tensor->n_dims == 2); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); -} - -static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(tensor->n_dims == 3); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); -} - -static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(tensor->n_dims == 4); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); - GGML_ASSERT(tensor->ne[3] == ne3); + randomize_tensor_normal(layer.attention_norm_a, rnd); + randomize_tensor_normal(layer.attention_norm_b, rnd); + + randomize_tensor_normal(layer.wq_a, rnd); + randomize_tensor_normal(layer.wq_b, rnd); + randomize_tensor_normal(layer.wk_a, rnd); + randomize_tensor_normal(layer.wk_b, rnd); + randomize_tensor_normal(layer.wv_a, rnd); + randomize_tensor_normal(layer.wv_b, rnd); + randomize_tensor_normal(layer.wo_a, rnd); + randomize_tensor_normal(layer.wo_b, rnd); + + randomize_tensor_normal(layer.ffn_norm_a, rnd); + randomize_tensor_normal(layer.ffn_norm_b, rnd); + + randomize_tensor_normal(layer.w1_a, rnd); + randomize_tensor_normal(layer.w1_b, rnd); + randomize_tensor_normal(layer.w2_a, rnd); + randomize_tensor_normal(layer.w2_b, rnd); + randomize_tensor_normal(layer.w3_a, rnd); + randomize_tensor_normal(layer.w3_b, rnd); + } + + free_random_normal_distribution(rnd); } static struct ggml_tensor * llama_build_lora_finetune_graphs( @@ -1019,476 +823,6 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( return t36; } -static int get_example_targets_batch( - struct llama_context * lctx, - 
const size_t * samples_begin,
- const size_t * samples_size,
- size_t samples_count,
- const llama_token * train_data,
- size_t n_train_data,
- int example_id,
- struct ggml_tensor * tokens_input,
- struct ggml_tensor * target_probs,
- bool separate_with_eos,
- bool separate_with_bos,
- bool fill_with_next_samples) {
-
- GGML_ASSERT(tokens_input->n_dims == 2);
- GGML_ASSERT(target_probs->n_dims == 3);
- int n_vocab = target_probs->ne[0];
- int n_tokens = tokens_input->ne[0];
- int n_batch = tokens_input->ne[1];
- GGML_ASSERT(n_vocab == target_probs->ne[0]);
- GGML_ASSERT(n_tokens == target_probs->ne[1]);
- GGML_ASSERT(n_batch == target_probs->ne[2]);
-
- int used_samples = 0;
-
- ggml_set_f32(target_probs, 0.0f);
- int bos = llama_token_bos(lctx);
- int eos = llama_token_eos(lctx);
- // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
- for (int k=0; k<n_batch; ++k) {
- // printf("%s: batch %d\n", __func__, k);
- size_t sample_idx = (example_id + used_samples) % samples_count;
- size_t sample_offs = 0;
- size_t sample_begin = samples_begin[sample_idx];
- size_t sample_size = samples_size[sample_idx];
- ++used_samples;
-
- ggml_set_i32_nd(tokens_input, 0, k, 0, 0, bos);
-
- bool sample_separation_eos = !separate_with_eos;
- bool sample_separation_bos = !separate_with_bos;
- for (int i=0; i<n_tokens; ++i) {
- llama_token token = eos;
- if (sample_offs >= sample_size && fill_with_next_samples) {
- if (!sample_separation_eos) {
- // insert eos token to separate samples
- sample_separation_eos = true;
- } else if (!sample_separation_bos) {
- // insert bos token to separate samples
- sample_separation_bos = true;
- token = bos;
- } else {
- // sample separation is done, continue with next sample
- sample_separation_eos = !separate_with_eos;
- sample_separation_bos = !separate_with_bos;
- sample_offs = 0;
- sample_idx = (example_id + used_samples) % samples_count;
- sample_begin = samples_begin[sample_idx];
- sample_size = samples_size[sample_idx];
- ++used_samples;
- }
- }
- // note: no else-if here
- if (sample_offs < sample_size) {
- token = clamp(train_data[sample_begin+sample_offs], 0, n_vocab-1);
- ++sample_offs;
- }
- ggml_set_f32_nd(target_probs, token, i, k, 0, +1.0f);
- if (i+1<n_tokens) {
- ggml_set_i32_nd(tokens_input, i+1, k, 0, 0, token);
- }
- }
- }
-
- return used_samples;
-}
-
-static std::string format(const char * fmt, ...) {
- va_list ap, ap2;
- va_start(ap, fmt);
- va_copy(ap2, ap);
- int size = vsnprintf(NULL, 0, fmt, ap);
- GGML_ASSERT(size >= 0 && size < INT_MAX);
- std::vector<char> buf(size + 1);
- int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
- GGML_ASSERT(size2 == size);
- va_end(ap2);
- va_end(ap);
- return std::string(buf.data(), size);
-}
-
-struct llama_file {
- // use FILE * so we don't have to re-open the file to mmap
- FILE * fp;
- size_t size;
-
- llama_file(const char * fname, const char * mode) {
- fp = std::fopen(fname, mode);
- if (fp == NULL) {
- size = 0;
- } else {
- seek(0, SEEK_END);
- size = tell();
- seek(0, SEEK_SET);
- }
- }
-
- size_t tell() const {
-#ifdef _WIN32
- __int64 ret = _ftelli64(fp);
-#else
- long ret = std::ftell(fp);
-#endif
- GGML_ASSERT(ret != -1); // this really shouldn't fail
- return (size_t) ret;
- }
-
- void seek(size_t offset, int whence) {
-#ifdef _WIN32
- int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
- int ret = std::fseek(fp, (long) offset, whence);
-#endif
- GGML_ASSERT(ret == 0); // same
- }
-
- void read_raw(void * ptr, size_t size) {
- if (size == 0) {
- return;
- }
- errno = 0;
- std::size_t ret = std::fread(ptr, size, 1, fp);
- if (ferror(fp)) {
- throw std::runtime_error(format("read error: %s", strerror(errno)));
- }
- if (ret != 1) {
- throw std::runtime_error(std::string("unexpectedly reached end of file"));
- }
- }
-
- std::uint32_t read_u32() {
- std::uint32_t ret;
- read_raw(&ret, sizeof(ret));
- return ret;
- }
-
- std::string read_string(std::uint32_t len) {
- std::vector<char> chars(len);
- read_raw(chars.data(), len);
- return std::string(chars.data(), len);
- }
-
- void write_raw(const void * ptr, size_t size) {
- if (size == 0) {
- return;
- }
- errno = 0;
- size_t ret = std::fwrite(ptr, size, 1, fp);
- if (ret != 1) {
- throw std::runtime_error(format("write error: %s", strerror(errno)));
- }
- }
-
- void 
write_u32(std::uint32_t val) {
- write_raw(&val, sizeof(val));
- }
-
- ~llama_file() {
- if (fp) {
- std::fclose(fp);
- }
- }
-};
-
-static size_t utf8_len(char src) {
- const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
- uint8_t highbits = static_cast<uint8_t>(src) >> 4;
- return lookup[highbits];
-}
-
-// mark each byte with its utf8 unit number.
-// returns the number of utf8 characters.
-// e.g. when bytes == '\x61\xD0\xB0\x62',
-// then utf8_units will become [0,0,1,0]
-// utf8_nunits will become [1,2,2,1] and 3 is returned.
-// bytes where utf8_units is zero, are the begin of an utf8 character.
-static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) {
- size_t offs = 0;
- size_t count_utf8 = 0;
- while(offs < count) {
- size_t len = utf8_len(bytes[offs]);
- for (size_t i=0; i<len; ++i) {
- utf8_units [offs+i] = i;
- utf8_nunits[offs+i] = len;
- }
- offs += len;
- ++count_utf8;
- }
- return count_utf8;
-}
-
-static size_t tokenize_file(
- struct llama_context * lctx,
- const char * filename,
- const std::string & sample_start,
- bool include_sample_start,
- bool overlapping_samples,
- unsigned context_length,
- std::vector<llama_token> & out_tokens,
- std::vector<size_t> & out_samples_begin,
- std::vector<size_t> & out_samples_size) {
- struct llama_file f(filename, "rb");
-
- if (f.size == 0) {
- out_tokens.clear();
- out_samples_begin.clear();
- out_samples_size.clear();
- printf("%s: warning: empty or not existing training data file '%s'\n",
- __func__, filename);
- return out_tokens.size();
- }
-
- // account for possible leading whitespace that will be added by tokenizer
- // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
- const int n_max_tokens_overhead = 1;
-
- std::vector<char> buf;
- buf.resize(f.size+1);
-
- f.read_raw(buf.data(), f.size);
- buf[f.size] = '\0';
-
- std::vector<int> utf8_units;
- std::vector<int> utf8_nunits;
- utf8_units.resize(buf.size());
- utf8_nunits.resize(buf.size());
- size_t n_utf8_chars = mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size());
-
- if (sample_start.size() == 0) {
- // tokenize all data at once
- out_tokens.resize(buf.size() + n_max_tokens_overhead);
-
- int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false);
- if (n_tokens < 0) {
- out_tokens.resize(-n_tokens);
- n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false);
- }
- if (n_tokens >= 0) {
- out_tokens.resize(n_tokens);
- }
-
- // generate sample starts at all token positions
- out_samples_begin.clear();
- out_samples_begin.push_back(0);
- out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
- size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0;
- for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) {
- out_samples_begin.push_back(sample_begin);
- out_samples_size.push_back(context_length);
- }
- } else {
- // split data into samples and tokenize each sample
- std::string data_str(buf.data(), buf.size()-1);
- out_samples_begin.clear();
- out_samples_size.clear();
- out_tokens.clear();
-
- // find all positions of pattern sample_start
- size_t sample_begin = data_str.find(sample_start, 0);
- while (sample_begin != std::string::npos) {
- out_samples_begin.push_back(sample_begin);
- const size_t search_start = sample_begin + sample_start.size();
- sample_begin = data_str.find(sample_start, search_start);
- }
- if (out_samples_begin.size() == 0) {
- printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n",
- __func__, sample_start.c_str());
- out_samples_begin.push_back(0);
- }
-
- out_samples_size.resize(out_samples_begin.size(), 0);
-
- std::vector<char> buf_sample;
- std::vector<llama_token> tok_sample;
-
- const size_t sample_begin_offset = (include_sample_start ? 
0 : sample_start.size());
- size_t found_too_big_sample = 0;
- size_t found_too_small_sample = 0;
- size_t found_empty_sample = 0;
- size_t found_min_sample_size = SIZE_MAX;
- size_t found_max_sample_size = 0;
-
- size_t max_token_text_size = 0;
- int n_vocab = llama_n_vocab(lctx);
- for (llama_token token=0; token < n_vocab; ++token) {
- max_token_text_size = std::max(
- max_token_text_size,
- strlen(llama_token_get_text(lctx, token)));
- }
-
- // upper bound of context byte length.
- // strings with this byte length should always tokenize to at least context_length tokens.
- size_t context_byte_len = max_token_text_size*context_length;
-
- for (unsigned i=0; i<out_samples_begin.size(); ++i) {
- // determine sample begin and end from pattern positions
- size_t sample_begin = out_samples_begin[i] + sample_begin_offset;
- size_t sample_end = overlapping_samples
- ? sample_begin + context_byte_len
- : (i+1 < out_samples_begin.size())
- ? out_samples_begin[i+1]
- : data_str.size();
- if (sample_end < utf8_units.size() && utf8_units[sample_end] > 0) {
- // sample end is in the middle of an utf8 character.
- // advance sample_end to the begin of the next utf8 character.
- sample_end += utf8_nunits[sample_end] - utf8_units[sample_end];
- }
- size_t sample_size = sample_end - sample_begin;
- if (sample_size == 0) {
- ++found_empty_sample;
- }
-
- if (sample_size > 0) {
- // llama_tokenize expects zero terminated string,
- // copy sample into buffer and zero terminate it.
- buf_sample.resize(sample_size+1);
- memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
- buf_sample[sample_size] = '\0';
-
- // printf("sample: '%s'\n", buf_sample.data());
-
- // tokenize the sample
- tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
- int n_tokens = llama_tokenize(lctx,
- buf_sample.data(),
- tok_sample.data(),
- tok_sample.size(), false);
- if (n_tokens < 0) {
- tok_sample.resize(-n_tokens);
- n_tokens = llama_tokenize(lctx,
- buf_sample.data(),
- tok_sample.data(),
- tok_sample.size(), false);
- GGML_ASSERT(n_tokens >= 0);
- }
- GGML_ASSERT(n_tokens <= tok_sample.size());
-
- if ((size_t) n_tokens > context_length) {
- ++found_too_big_sample;
- } else if ((size_t) n_tokens < context_length) {
- ++found_too_small_sample;
- }
- found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens);
- found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens);
-
- // write out tokens, start and size of sample
- // overwrite the string start position with the token start position
- out_samples_begin[i] = out_tokens.size();
- out_samples_size[i] = (size_t) n_tokens;
- out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens);
- } else {
- out_samples_begin[i] = out_tokens.size();
- out_samples_size[i] = 0;
- }
-
- }
- if (found_too_big_sample > 0) {
- printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. 
samples will be cut off.\n",
- __func__, found_too_big_sample, found_max_sample_size, context_length);
- }
-
- if (found_too_small_sample > 0) {
- printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n",
- __func__, found_too_small_sample, found_min_sample_size, context_length);
- }
-
- if (found_empty_sample) {
- printf("%s: warning: found %zu empty samples.\n",
- __func__, found_empty_sample);
- }
- }
- printf("%s: total number of samples: %zu\n",
- __func__, out_samples_begin.size());
-
- GGML_ASSERT(out_samples_begin.size() == out_samples_size.size());
-
- return out_tokens.size();
-}
-
-static void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) {
- std::stringstream s_rng_state;
- s_rng_state.imbue(std::locale::classic());
- s_rng_state.exceptions(std::stringstream::failbit);
- s_rng_state.str(rng_state);
- s_rng_state >> rng;
-}
-
-static std::string mt19937_get_state(const std::mt19937& rng) {
- std::stringstream s_rng_state;
- s_rng_state.imbue(std::locale::classic());
- s_rng_state << rng;
- return s_rng_state.str();
-}
-
-static std::string mt19937_seed_to_state(unsigned seed) {
- std::mt19937 rng(seed);
- return mt19937_get_state(rng);
-}
-
-static std::string shuffle_samples(
- const std::string & rng_state,
- const size_t * begins,
- const size_t * sizes,
- size_t * shuffled_begins,
- size_t * shuffled_sizes,
- size_t count) {
- if (count == 0) return rng_state;
-
- std::mt19937 rng;
- mt19937_set_state(rng, rng_state);
-
- // sort indices by random value for each index
- std::vector<size_t> idcs;
- {
- std::vector<unsigned> rnd;
- idcs.resize(count);
- rnd.resize(count);
- for (unsigned i=0; i<count; ++i) {
- idcs[i] = i;
- rnd[i] = rng();
- }
-
- std::sort(idcs.begin(), idcs.end(), [&rnd](size_t a, size_t b){
- return (rnd[a] == rnd[b]) ? (a < b) : (rnd[a] < rnd[b]);
- });
- }
-
- for (unsigned i=0; i<count; ++i) {
- shuffled_begins[i] = begins[idcs[i]];
- shuffled_sizes[i] = sizes[idcs[i]];
- }
-
- return mt19937_get_state(rng);
-}
-
-static size_t hash_combine(size_t h1, size_t h2) {
- return h1 ^ (h2 << 1);
-}
-
-static size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) {
- std::hash<std::string> h_string;
- std::hash<unsigned long long> h_ull;
- size_t h = h_string(std::string(fn));
- h = hash_combine(h, h_ull((unsigned long long) sample_count));
- for (size_t i=0; i<sample_count; ++i) {
- h = hash_combine(h, h_ull((unsigned long long) samples_begin[i]));
- h = hash_combine(h, h_ull((unsigned long long) samples_size[i]));
- }
- return h;
-}
-
-static std::string replace_str(const char * s, const char * needle, const char * replacement) {
- std::string str = s;
- size_t pos = str.find(needle);
- if (pos != std::string::npos) {
- str.replace(pos, strlen(needle), replacement);
- }
- return str;
-}
-
@@ ... @@
 #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
 { \
 const std::string skey(key); \
 const int kid = gguf_find_key(ctx, skey.c_str()); \
 if (kid >= 0) { \
 enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
 if (ktype != (type)) { \
- throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
+ die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
 } \
 (dst) = func(ctx, kid); \
 } else if (req) { \
- throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
+ die_fmt("key not found in model: %s", skey.c_str()); \
 } \
 }
-static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) {
- GGML_ASSERT(a != NULL);
- GGML_ASSERT(b != NULL);
- GGML_ASSERT(a->type == b->type);
- GGML_ASSERT(ggml_are_same_shape(a, b));
- GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b));
-
- return true;
-}
-
-static void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) {
- if (dst == NULL) {
- return;
- }
- struct ggml_tensor * t = ggml_get_tensor(ctx, name);
- GGML_ASSERT(are_same_layout(dst, t));
- memcpy(dst->data, t->data, ggml_nbytes(t));
-
- if (strlen(ggml_get_name(dst)) == 0) {
- ggml_set_name(dst, name);
- }
-}
-
 static void load_default_lora_params_from_base_model(const char * fn_base_model, struct my_llama_lora_hparams * lora_params) {
 if (strlen(fn_base_model) == 0) {
 return;
@@ -1558,131 +869,7 @@ static void load_default_lora_params_from_base_model(const char * fn_base_model,
 gguf_free(fctx);
 }
-static void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) {
- // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
- uint32_t file_version;
- GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION);
- GGML_ASSERT(file_version == 0);
-
- GGUF_GET_KEY(fctx, opt->params.past, 
gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); - GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); - GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); - - uint64_t nx; - GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); - opt->nx = (size_t) nx; - - // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know - - std::string opt_type; - GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); - if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { - opt->params.type = GGML_OPT_ADAM; - - GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); - GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); - GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); - - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { - opt->params.type = GGML_OPT_LBFGS; - - GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); - GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); - GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); - GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); - GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); - GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); - GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); - - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - read_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - read_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - read_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - read_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - read_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - read_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - read_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - read_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - } else { - throw std::runtime_error("unknown optimizer type\n"); - } -} - -static void save_opt_context_gguf(struct gguf_context * fctx, 
struct ggml_opt_context * opt) { - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); - gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); - gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); - - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); - - ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - if (opt->adam.pf) { - ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } - - gguf_add_tensor(fctx, opt->adam.m); - gguf_add_tensor(fctx, opt->adam.v); - if (opt->adam.pf) { - gguf_add_tensor(fctx, opt->adam.pf); - } - } break; - case GGML_OPT_LBFGS: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement); - - ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - if (opt->lbfgs.pf) { - ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - } - ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - - gguf_add_tensor(fctx, opt->lbfgs.x); - gguf_add_tensor(fctx, opt->lbfgs.xp); - gguf_add_tensor(fctx, opt->lbfgs.g); - gguf_add_tensor(fctx, opt->lbfgs.gp); - gguf_add_tensor(fctx, opt->lbfgs.d); - if (opt->lbfgs.pf) { - gguf_add_tensor(fctx, opt->lbfgs.pf); - } - gguf_add_tensor(fctx, opt->lbfgs.lmal); - gguf_add_tensor(fctx, opt->lbfgs.lmys); - gguf_add_tensor(fctx, opt->lbfgs.lms); - gguf_add_tensor(fctx, opt->lbfgs.lmy); - } break; - } -} static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read @@ -1737,33 +924,33 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context init_lora(model, lora); - 
read_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a));
-    read_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b));
-    read_tensor_by_name(lora->norm_a, f_ggml_ctx, ggml_get_name(lora->norm_a));
-    read_tensor_by_name(lora->norm_b, f_ggml_ctx, ggml_get_name(lora->norm_b));
-    read_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a));
-    read_tensor_by_name(lora->output_b, f_ggml_ctx, ggml_get_name(lora->output_b));
+    copy_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a));
+    copy_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b));
+    copy_tensor_by_name(lora->norm_a, f_ggml_ctx, ggml_get_name(lora->norm_a));
+    copy_tensor_by_name(lora->norm_b, f_ggml_ctx, ggml_get_name(lora->norm_b));
+    copy_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a));
+    copy_tensor_by_name(lora->output_b, f_ggml_ctx, ggml_get_name(lora->output_b));
 
     for (uint32_t i = 0; i < lora->layers.size(); ++i) {
         auto & layer = lora->layers[i];
-        read_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a));
-        read_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b));
-        read_tensor_by_name(layer.wq_a, f_ggml_ctx, ggml_get_name(layer.wq_a));
-        read_tensor_by_name(layer.wq_b, f_ggml_ctx, ggml_get_name(layer.wq_b));
-        read_tensor_by_name(layer.wk_a, f_ggml_ctx, ggml_get_name(layer.wk_a));
-        read_tensor_by_name(layer.wk_b, f_ggml_ctx, ggml_get_name(layer.wk_b));
-        read_tensor_by_name(layer.wv_a, f_ggml_ctx, ggml_get_name(layer.wv_a));
-        read_tensor_by_name(layer.wv_b, f_ggml_ctx, ggml_get_name(layer.wv_b));
-        read_tensor_by_name(layer.wo_a, f_ggml_ctx, ggml_get_name(layer.wo_a));
-        read_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b));
-        read_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
-        read_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
-        read_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a));
-        read_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b));
-        read_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a));
-        read_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b));
-        read_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a));
-        read_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b));
+        copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a));
+        copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b));
+        copy_tensor_by_name(layer.wq_a, f_ggml_ctx, ggml_get_name(layer.wq_a));
+        copy_tensor_by_name(layer.wq_b, f_ggml_ctx, ggml_get_name(layer.wq_b));
+        copy_tensor_by_name(layer.wk_a, f_ggml_ctx, ggml_get_name(layer.wk_a));
+        copy_tensor_by_name(layer.wk_b, f_ggml_ctx, ggml_get_name(layer.wk_b));
+        copy_tensor_by_name(layer.wv_a, f_ggml_ctx, ggml_get_name(layer.wv_a));
+        copy_tensor_by_name(layer.wv_b, f_ggml_ctx, ggml_get_name(layer.wv_b));
+        copy_tensor_by_name(layer.wo_a, f_ggml_ctx, ggml_get_name(layer.wo_a));
+        copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b));
+        copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
+        copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
+        copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a));
+
copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b)); + copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a)); + copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b)); + copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a)); + copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b)); } } @@ -1915,6 +1102,89 @@ static void save_checkpoint_lora_file(const char * filename, struct my_llama_mod gguf_free(fctx); } +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + die_fmt("read error: %s", strerror(errno)); + } + if (ret != 1) { + die_fmt("unexpectedly reached end of file"); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + die_fmt("write error: %s", strerror(errno)); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) { if (tensor == NULL) { file->write_u32(0); @@ -2002,25 +1272,6 @@ static void save_as_llama_lora(struct my_llama_lora * lora, const char * filenam } } -static float cosine_decay(const int decay_steps, const float minimum, int step) { - if (step > decay_steps) { - step = decay_steps; - } - const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); - const float decay = (1 - minimum)*cosine_decay + minimum; - return decay; -} - -static float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) { - if (enable_restart) { - while (step > decay_steps) { - step -= decay_steps; - decay_steps = (int) restart_step_mult * decay_steps; - } - } - return cosine_decay(decay_steps, minimum, step); -} - struct train_params { const char * fn_model_base; const char * fn_train_data; @@ -2665,29 +1916,6 @@ struct opt_callback_data { double millis_per_iter; }; -static void print_duration(double fmillis) { - if (fmillis < 1000.0f) { - printf("%.1fms", (float) fmillis); - return; - } - const int64_t one_sec = 1000; - const int64_t one_min = one_sec * 60; - const int64_t one_hour = one_min * 60; - const int64_t one_day = one_hour * 24; - - int64_t millis = (int64_t) fmillis; - int64_t days = millis/one_day; - int64_t hours = (millis - days*one_day)/one_hour; - 
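// [Editor's note, not part of the patch] print_duration decomposes the
// millisecond count by successive integer division with the unit constants
// above; e.g. for fmillis = 90061000 ms (one day, hour, minute and second):
//   days    = 90061000 / 86400000                 = 1
//   hours   = (90061000 - 1*86400000) / 3600000   = 1
//   minutes and seconds follow the same pattern   -> printed as "1d 01:01:01"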
int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min; - int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec; - - // to print int64_t either cast to (long long int) or use macro PRId64 from - if (days > 0) { - printf("%lldd ", (long long int) days); - } - printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds); -} - static void opt_callback(void * vdata, int accum_step, float * sched) { struct opt_callback_data * data = (struct opt_callback_data *) vdata; struct train_params * params = data->params; @@ -2738,16 +1966,15 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { // exclude file saving from time measurement, by measuring last_time after saving data->last_time = ggml_time_ms(); - *sched = (opt->iter < params->warmup) - ? (float) opt->iter / (float) params->warmup - : cosine_decay_restart( - params->cos_decay_steps, - params->cos_decay_min, - opt->iter - params->warmup, - params->cos_decay_restart, - params->enable_restart); - float min_sched = params->adam_min_alpha / params->adam_alpha; - *sched = min_sched + *sched * (1.0f - min_sched); + *sched = learning_schedule( + opt->iter, + params->warmup, + params->cos_decay_steps, + params->adam_alpha, + params->adam_min_alpha, + params->cos_decay_min, + params->cos_decay_restart, + params->enable_restart); int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); if (impr_plot > 0) impr_plot = 0; @@ -2775,16 +2002,16 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { printf("\n"); } - int used_samples = get_example_targets_batch( + int64_t used_samples = get_example_targets_batch( data->lctx, + data->tokens_input, + data->target_probs, + data->lora->shuffle_next_sample, data->shuffled_samples_begin, data->shuffled_samples_size, data->samples_count, data->tokens_data, data->tokens_size, - data->lora->shuffle_next_sample, - data->tokens_input, - data->target_probs, params->separate_with_eos, params->separate_with_bos, params->fill_with_next_samples); @@ -2798,10 +2025,10 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { data->lora->shuffle_rng_state_current = data->lora->shuffle_rng_state_next; data->lora->shuffle_rng_state_next = shuffle_samples( data->lora->shuffle_rng_state_current, - data->samples_begin, - data->samples_size, data->shuffled_samples_begin, data->shuffled_samples_size, + data->samples_begin, + data->samples_size, data->samples_count); data->lora->shuffle_next_sample = 0; } @@ -2840,22 +2067,6 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) { return nx; } -static size_t hash_combine(size_t h1, size_t h2) { - return h1 ^ (h2 << 1); -} - -static size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) { - std::hash h_string; - std::hash h_ull; - size_t h = h_string(std::string(fn)); - h = hash_combine(h, h_ull((unsigned long long) sample_count)); - for (size_t i=0; i< sample_count; ++i) { - h = hash_combine(h, h_ull((unsigned long long) samples_begin[i])); - h = hash_combine(h, h_ull((unsigned long long) samples_size[i])); - } - return h; -} - int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); @@ -3176,10 +2387,10 @@ int main(int argc, char ** argv) { train_shuffled_samples_size.resize(train_samples_size.size()); lora.shuffle_rng_state_next = shuffle_samples( lora.shuffle_rng_state_current, - 
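// ---- editor's sketch (not part of the patch): how shuffle_samples orders ----
// shuffle_samples (common/train.cpp) draws one random value per index from the
// deserialized std::mt19937 and sorts the indices by those values, so a given
// rng_state string always reproduces the same permutation. The core idea,
// self-contained (demo_permutation is a hypothetical name):
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <random>
#include <vector>
static std::vector<size_t> demo_permutation(std::mt19937 & rng, size_t count) {
    std::vector<size_t> idcs(count);
    std::vector<double> rnd(count);
    std::iota(idcs.begin(), idcs.end(), (size_t) 0);  // 0, 1, ..., count-1
    for (double & r : rnd) {
        r = std::uniform_real_distribution<double>(0.0, 1.0)(rng);
    }
    // sort indices by their random keys: a uniform random permutation
    std::sort(idcs.begin(), idcs.end(),
              [&rnd](size_t a, size_t b) { return rnd[a] < rnd[b]; });
    return idcs;
}
// ---- end sketch ----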
train_samples_begin.data(), - train_samples_size.data(), train_shuffled_samples_begin.data(), train_shuffled_samples_size.data(), + train_samples_begin.data(), + train_samples_size.data(), train_samples_size.size()); printf("%s: begin training\n", __func__); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 39a85967a50cc..63edcf9ef397b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1,6 +1,7 @@ #include "ggml.h" #include "ggml-alloc.h" #include "common.h" +#include "train.h" #include "llama.h" #include #include @@ -18,143 +19,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -struct random_normal_distribution { - std::mt19937 gen; - std::normal_distribution rd; - float min; - float max; -}; - -struct random_uniform_distribution { - std::mt19937 gen; - std::uniform_real_distribution rd; -}; - -static void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { - rnd->gen = std::mt19937(seed); - rnd->rd = std::normal_distribution{mean, std}; - rnd->min = min; - rnd->max = max; -} - -static void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { - rnd->gen = std::mt19937(seed); - rnd->rd = std::uniform_real_distribution{min, max}; -} - -static int clamp(const int v, const int min, const int max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -static float fclamp(const float v, const float min, const float max) { - return ((v < min) ? (min) : (v > max) ? (max) : v); -} - -static float frand() { - return (float)rand()/(float)RAND_MAX; -} - -static float frand_normal(struct random_normal_distribution * rnd) { - return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); -} - -static float frand_uniform(struct random_uniform_distribution * rnd) { - return rnd->rd(rnd->gen); -} - -static struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { - float scale = 1.0f; // xavier - switch (tensor->n_dims) { - case 1: - scale /= sqrtf(tensor->ne[0]); - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = scale * frand_normal(rnd); - } - break; - case 2: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = scale * frand_normal(rnd); - } - } - break; - case 3: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = scale * frand_normal(rnd); - } - } - } - break; - case 4: - scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = scale * frand_normal(rnd); - } - } - } - } - break; - default: - 
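// [Editor's note, not part of the patch] The "xavier" scale above is Glorot
// initialization: samples from N(mean, std^2) are multiplied by
//   scale = 1 / sqrt(fan_in + fan_out)   (1 / sqrt(ne[0]) for 1-D tensors)
// which keeps the variance of activations roughly constant across layers;
// e.g. for a 4096 x 4096 weight matrix, scale = 1/sqrt(8192) ~= 0.011.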
assert(false); - }; - return tensor; -} - -static struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { - switch (tensor->n_dims) { - case 1: - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); - *dst = frand_uniform(rnd); - } - break; - case 2: - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); - *dst = frand_uniform(rnd); - } - } - break; - case 3: - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); - *dst = frand_uniform(rnd); - } - } - } - break; - case 4: - for (int i3 = 0; i3 < tensor->ne[3]; i3++) { - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); - *dst = frand_uniform(rnd); - } - } - } - } - break; - default: - assert(false); - }; - return tensor; -} - struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; @@ -215,40 +79,6 @@ struct my_llama_model { }; // gguf constants -static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; -static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; -static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; -static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; -static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; -static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; -static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; -static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; -static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; -static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; -static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; -static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; -static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; -static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; -static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; - -static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; -static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; -static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; - -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = 
"optimizer.lbfgs.previous_parameters"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; -static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; - static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; static const char * LLM_KV_TRAINING_TYPE = "training.type"; @@ -411,54 +241,29 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean, const uint32_t n_layer = hparams.n_layer; - struct random_normal_distribution rnd; - init_random_normal_distribution(&rnd, seed, mean, std, min, max); + struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); - randomize_tensor_normal(model->tok_embeddings, &rnd); - randomize_tensor_normal(model->norm, &rnd); - randomize_tensor_normal(model->output, &rnd); + randomize_tensor_normal(model->tok_embeddings, rnd); + randomize_tensor_normal(model->norm, rnd); + randomize_tensor_normal(model->output, rnd); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - randomize_tensor_normal(layer.attention_norm, &rnd); + randomize_tensor_normal(layer.attention_norm, rnd); - randomize_tensor_normal(layer.wq, &rnd); - randomize_tensor_normal(layer.wk, &rnd); - randomize_tensor_normal(layer.wv, &rnd); - randomize_tensor_normal(layer.wo, &rnd); + randomize_tensor_normal(layer.wq, rnd); + randomize_tensor_normal(layer.wk, rnd); + randomize_tensor_normal(layer.wv, rnd); + randomize_tensor_normal(layer.wo, rnd); - randomize_tensor_normal(layer.ffn_norm, &rnd); + randomize_tensor_normal(layer.ffn_norm, rnd); - randomize_tensor_normal(layer.w1, &rnd); - randomize_tensor_normal(layer.w2, &rnd); - randomize_tensor_normal(layer.w3, &rnd); + randomize_tensor_normal(layer.w1, rnd); + randomize_tensor_normal(layer.w2, rnd); + randomize_tensor_normal(layer.w3, rnd); } -} - -static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { - GGML_ASSERT(tensor->n_dims == 1); - GGML_ASSERT(tensor->ne[0] == ne0); -} -static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { - GGML_ASSERT(tensor->n_dims == 2); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); -} - -static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(tensor->n_dims == 3); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); -} - -static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(tensor->n_dims == 4); - GGML_ASSERT(tensor->ne[0] == ne0); - GGML_ASSERT(tensor->ne[1] == ne1); - GGML_ASSERT(tensor->ne[2] == ne2); - GGML_ASSERT(tensor->ne[3] == ne3); + free_random_normal_distribution(rnd); } static struct ggml_tensor * llama_build_train_graphs( @@ 
-637,461 +442,6 @@ static struct ggml_tensor * llama_build_train_graphs( return t36; } -static int get_example_targets_batch( - struct llama_context * lctx, - const size_t * samples_begin, - const size_t * samples_size, - size_t samples_count, - const llama_token * train_data, - size_t n_train_data, - int example_id, - struct ggml_tensor * tokens_input, - struct ggml_tensor * target_probs, - bool separate_with_eos, - bool separate_with_bos, - bool fill_with_next_samples) { - - GGML_ASSERT(tokens_input->n_dims == 2); - GGML_ASSERT(target_probs->n_dims == 3); - int n_vocab = target_probs->ne[0]; - int n_tokens = tokens_input->ne[0]; - int n_batch = tokens_input->ne[1]; - GGML_ASSERT(n_vocab == target_probs->ne[0]); - GGML_ASSERT(n_tokens == target_probs->ne[1]); - GGML_ASSERT(n_batch == target_probs->ne[2]); - - int used_samples = 0; - - ggml_set_f32(target_probs, 0.0f); - int bos = llama_token_bos(lctx); - int eos = llama_token_eos(lctx); - // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); - for (int k=0; k= sample_size && fill_with_next_samples) { - if (!sample_separation_eos) { - // insert eos token to separate samples - sample_separation_eos = true; - } else if (!sample_separation_bos) { - // insert bos token to separate samples - sample_separation_bos = true; - token = bos; - } else { - // sample separation is done, continue with next sample - sample_separation_eos = !separate_with_eos; - sample_separation_bos = !separate_with_bos; - sample_offs = 0; - sample_idx = (example_id + used_samples) % samples_count; - sample_begin = samples_begin[sample_idx]; - sample_size = samples_size[sample_idx]; - ++used_samples; - } - } - // note: no else-if here - if (sample_offs < sample_size) { - token = clamp(train_data[sample_begin+sample_offs], 0, n_vocab-1); - ++sample_offs; - } - ggml_set_f32_nd(target_probs, token, i, k, 0, +1.0f); - if (i+1= 0 && size < INT_MAX); - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; - } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); - } - } - - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -static size_t utf8_len(char src) 
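// [Editor's note, not part of the patch] utf8_len below reads the sequence
// length of a UTF-8 lead byte from its top four bits (src >> 4):
//   0xxx....                -> 1 byte   (lookup indices 0..7)
//   10xx.... (continuation) -> 1        (indices 8..11, so scanning advances)
//   110x....                -> 2 bytes  (indices 12..13)
//   1110....                -> 3 bytes  (index 14)
//   1111....                -> 4 bytes  (index 15)
// e.g. 0xD0 >> 4 == 13, so lookup[13] == 2 for the two-byte pair 0xD0 0xB0.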
{ - const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - uint8_t highbits = static_cast(src) >> 4; - return lookup[highbits]; -} - -// mark each byte with its utf8 unit number. -// returns the number of utf8 characters. -// e.g. when bytes == '\x61\xD0\xB0\x62', -// then utf8_units will become [0,0,1,0] -// utf8_nunits will become [1,2,2,1] and 3 is returned. -// bytes where utf8_units is zero, are the begin of an utf8 character. -static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) { - size_t offs = 0; - size_t count_utf8 = 0; - while(offs < count) { - size_t len = utf8_len(bytes[offs]); - for (size_t i=0; i & out_tokens, - std::vector & out_samples_begin, - std::vector & out_samples_size) { - struct llama_file f(filename, "rb"); - - if (f.size == 0) { - out_tokens.clear(); - out_samples_begin.clear(); - out_samples_size.clear(); - printf("%s: warning: empty or not existing training data file '%s'\n", - __func__, filename); - return out_tokens.size(); - } - - // account for possible leading whitespace that will be added by tokenizer - // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12] - const int n_max_tokens_overhead = 1; - - std::vector buf; - buf.resize(f.size+1); - - f.read_raw(buf.data(), f.size); - buf[f.size] = '\0'; - - std::vector utf8_units; - std::vector utf8_nunits; - utf8_units.resize(buf.size()); - utf8_nunits.resize(buf.size()); - size_t n_utf8_chars = mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size()); - - if (sample_start.size() == 0) { - // tokenize all data at once - out_tokens.resize(buf.size() + n_max_tokens_overhead); - - int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false); - if (n_tokens < 0) { - out_tokens.resize(-n_tokens); - n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), out_tokens.size(), false); - } - if (n_tokens >= 0) { - out_tokens.resize(n_tokens); - } - - // generate sample starts at all token positions - out_samples_begin.clear(); - out_samples_begin.push_back(0); - out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size())); - size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0; - for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) { - out_samples_begin.push_back(sample_begin); - out_samples_size.push_back(context_length); - } - } else { - // split data into samples and tokenize each sample - std::string data_str(buf.data(), buf.size()-1); - out_samples_begin.clear(); - out_samples_size.clear(); - out_tokens.clear(); - - // find all positions of pattern sample_start - size_t sample_begin = data_str.find(sample_start, 0); - while (sample_begin != std::string::npos) { - out_samples_begin.push_back(sample_begin); - const size_t search_start = sample_begin + sample_start.size(); - sample_begin = data_str.find(sample_start, search_start); - } - if (out_samples_begin.size() == 0) { - printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n", - __func__, sample_start.c_str()); - out_samples_begin.push_back(0); - } - - out_samples_size.resize(out_samples_begin.size(), 0); - - std::vector buf_sample; - std::vector tok_sample; - - const size_t sample_begin_offset = (include_sample_start ? 
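// [Editor's note, not part of the patch] The per-sample byte bound used below,
// context_byte_len = max_token_text_size * context_length, is safe because no
// token's text exceeds max_token_text_size bytes: any string of
// context_byte_len bytes therefore tokenizes to at least context_length
// tokens, so truncating a raw sample there cannot drop tokens that would
// still fit in the context.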
0 : sample_start.size()); - size_t found_too_big_sample = 0; - size_t found_too_small_sample = 0; - size_t found_empty_sample = 0; - size_t found_min_sample_size = SIZE_MAX; - size_t found_max_sample_size = 0; - - size_t max_token_text_size = 0; - int n_vocab = llama_n_vocab(lctx); - for (llama_token token=0; token < n_vocab; ++token) { - max_token_text_size = std::max( - max_token_text_size, - strlen(llama_token_get_text(lctx, token))); - } - - // upper bound of context byte length. - // strings with this byte length should always tokenize to at least context_length tokens. - size_t context_byte_len = max_token_text_size*context_length; - - for (unsigned i=0; i 0) { - // sample end is in the middle of an utf8 character. - // advance sample_end to the begin of the next utf8 character. - sample_end += utf8_nunits[sample_end] - utf8_units[sample_end]; - } - size_t sample_size = sample_end - sample_begin; - if (sample_size == 0) { - ++found_empty_sample; - } - - if (sample_size > 0) { - // llama_tokenize expects zero terminated string, - // copy sample into buffer and zero terminate it. - buf_sample.resize(sample_size+1); - memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size); - buf_sample[sample_size] = '\0'; - - // printf("sample: '%s'\n", buf_sample.data()); - - // tokenize the sample - tok_sample.resize(buf_sample.size() + n_max_tokens_overhead); - int n_tokens = llama_tokenize(lctx, - buf_sample.data(), - tok_sample.data(), - tok_sample.size(), false); - if (n_tokens < 0) { - tok_sample.resize(-n_tokens); - n_tokens = llama_tokenize(lctx, - buf_sample.data(), - tok_sample.data(), - tok_sample.size(), false); - GGML_ASSERT(n_tokens >= 0); - } - GGML_ASSERT(n_tokens <= tok_sample.size()); - - if ((size_t) n_tokens > context_length) { - ++found_too_big_sample; - } else if ((size_t) n_tokens < context_length) { - ++found_too_small_sample; - } - found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens); - found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens); - - // write out tokens, start and size of sample - // overwrite the string start position with the token start position - out_samples_begin[i] = out_tokens.size(); - out_samples_size[i] = (size_t) n_tokens; - out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens); - } else { - out_samples_begin[i] = out_tokens.size(); - out_samples_size[i] = 0; - } - - } - if (found_too_big_sample > 0) { - printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. 
samples will be cut off.\n", - __func__, found_too_big_sample, found_max_sample_size, context_length); - } - - if (found_too_small_sample > 0) { - printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n", - __func__, found_too_small_sample, found_min_sample_size, context_length); - } - - if (found_empty_sample) { - printf("%s: warning: found %zu empty samples.\n", - __func__, found_empty_sample); - } - } - printf("%s: total number of samples: %zu\n", - __func__, out_samples_begin.size()); - - GGML_ASSERT(out_samples_begin.size() == out_samples_size.size()); - - return out_tokens.size(); -} - -static void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) { - std::stringstream s_rng_state; - s_rng_state.imbue(std::locale::classic()); - s_rng_state.exceptions(std::stringstream::failbit); - s_rng_state.str(rng_state); - s_rng_state >> rng; -} - -static std::string mt19937_get_state(const std::mt19937& rng) { - std::stringstream s_rng_state; - s_rng_state.imbue(std::locale::classic()); - s_rng_state << rng; - return s_rng_state.str(); -} - -static std::string mt19937_seed_to_state(unsigned seed) { - std::mt19937 rng(seed); - return mt19937_get_state(rng); -} - -static std::string shuffle_samples( - const std::string & rng_state, - const size_t * begins, - const size_t * sizes, - size_t * shuffled_begins, - size_t * shuffled_sizes, - size_t count) { - if (count == 0) return rng_state; - - std::mt19937 rng; - mt19937_set_state(rng, rng_state); - - // sort indices by random value for each index - std::vector idcs; - { - std::vector rnd; - idcs.resize(count); - rnd.resize(count); - for (unsigned i=0; itype == b->type); - GGML_ASSERT(ggml_are_same_shape(a, b)); - GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); - - return true; -} - -static void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { - if (dst == NULL) { - return; - } - struct ggml_tensor * t = ggml_get_tensor(ctx, name); - GGML_ASSERT(are_same_layout(dst, t)); - memcpy(dst->data, t->data, ggml_nbytes(t)); - - if (strlen(ggml_get_name(dst)) == 0) { - ggml_set_name(dst, name); - } -} - -static void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { - // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read - - uint32_t file_version; - GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); - GGML_ASSERT(file_version == 0); - - GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); - GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT); - GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); - - uint64_t nx; - GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); - opt->nx = (size_t) nx; - - // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know - - std::string opt_type; - GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); - if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { - opt->params.type = GGML_OPT_ADAM; - - GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); - GGUF_GET_KEY(fctx, 
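// ---- editor's sketch (not part of the patch): mt19937 state round-trip ----
// The mt19937_{set,get}_state helpers above use the standard stream operators
// of std::mt19937, which (de)serialize the whole engine state as text; the
// classic locale keeps that text stable across user locales. A self-contained
// round-trip (the demo function name is hypothetical):
#include <cassert>
#include <locale>
#include <random>
#include <sstream>
static void demo_mt19937_roundtrip() {
    std::mt19937 a(42);
    std::mt19937 b;
    std::stringstream ss;
    ss.imbue(std::locale::classic());
    ss << a;              // serialize engine state of a
    ss >> b;              // restore it into b
    assert(a() == b());   // both engines now generate the same sequence
}
// ---- end sketch ----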
opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); - GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); - - GGML_ASSERT(opt->ctx != NULL); - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { - opt->params.type = GGML_OPT_LBFGS; - - GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); - GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); - GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); - GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); - GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); - GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); - GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); - - GGML_ASSERT(opt->ctx != NULL); - ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); - - read_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - read_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - read_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - read_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - read_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - read_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - read_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - read_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - } else { - die("unknown optimizer type"); - } -} - -static void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); - gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); - gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); - - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); - - ggml_set_name(opt->adam.m, 
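// ---- editor's sketch (not part of the patch): reading tensors back ----
// read_tensor_by_name above is the consumer side of ggml_set_name +
// gguf_add_tensor: look the tensor up by name in the gguf-backed context,
// check the layout, then copy the raw bytes. A minimal version using only
// ggml calls that appear in this patch (demo_copy_named is hypothetical;
// memcpy assumes <cstring>):
static void demo_copy_named(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) {
    struct ggml_tensor * t = ggml_get_tensor(ctx, name); // lookup by name
    GGML_ASSERT(t != NULL);
    GGML_ASSERT(ggml_are_same_shape(dst, t));            // refuse silent reshapes
    memcpy(dst->data, t->data, ggml_nbytes(t));          // raw byte copy
}
// ---- end sketch ----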
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); - ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); - if (opt->adam.pf) { - ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); - } - - gguf_add_tensor(fctx, opt->adam.m); - gguf_add_tensor(fctx, opt->adam.v); - if (opt->adam.pf) { - gguf_add_tensor(fctx, opt->adam.pf); - } - } break; - case GGML_OPT_LBFGS: - { - gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best); - gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k); - gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end); - gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement); - - ggml_set_name(opt->lbfgs.x, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); - ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); - ggml_set_name(opt->lbfgs.g, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); - ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); - ggml_set_name(opt->lbfgs.d, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); - if (opt->lbfgs.pf) { - ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); - } - ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); - ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); - ggml_set_name(opt->lbfgs.lms, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); - ggml_set_name(opt->lbfgs.lmy, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); - - gguf_add_tensor(fctx, opt->lbfgs.x); - gguf_add_tensor(fctx, opt->lbfgs.xp); - gguf_add_tensor(fctx, opt->lbfgs.g); - gguf_add_tensor(fctx, opt->lbfgs.gp); - gguf_add_tensor(fctx, opt->lbfgs.d); - if (opt->lbfgs.pf) { - gguf_add_tensor(fctx, opt->lbfgs.pf); - } - gguf_add_tensor(fctx, opt->lbfgs.lmal); - gguf_add_tensor(fctx, opt->lbfgs.lmys); - gguf_add_tensor(fctx, opt->lbfgs.lms); - gguf_add_tensor(fctx, opt->lbfgs.lmy); - } break; - } -} - static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read std::string arch; @@ -1311,22 +509,22 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex init_model(model); - read_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD)); - read_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM)); - read_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT)); + copy_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD)); + copy_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM)); + copy_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT)); for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; - read_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i)); - read_tensor_by_name(layer.wq, f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i)); - read_tensor_by_name(layer.wk, f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i)); - read_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); - 
read_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); - read_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); - read_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); - read_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); - read_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); + copy_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i)); + copy_tensor_by_name(layer.wq, f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i)); + copy_tensor_by_name(layer.wk, f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i)); + copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); + copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); + copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); + copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); + copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); + copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); } } @@ -1571,25 +769,6 @@ static void save_checkpoint_file(const char * filename, const char * fn_vocab_mo gguf_free(fctx); } -static float cosine_decay(const int decay_steps, const float minimum, int step) { - if (step > decay_steps) { - step = decay_steps; - } - const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); - const float decay = (1 - minimum)*cosine_decay + minimum; - return decay; -} - -static float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) { - if (enable_restart) { - while (step > decay_steps) { - step -= decay_steps; - decay_steps = (int) restart_step_mult * decay_steps; - } - } - return cosine_decay(decay_steps, minimum, step); -} - struct train_params { const char * fn_vocab_model; const char * fn_train_data; @@ -2136,29 +1315,6 @@ struct opt_callback_data { double millis_per_iter; }; -static void print_duration(double fmillis) { - if (fmillis < 1000.0f) { - printf("%.1fms", (float) fmillis); - return; - } - const int64_t one_sec = 1000; - const int64_t one_min = one_sec * 60; - const int64_t one_hour = one_min * 60; - const int64_t one_day = one_hour * 24; - - int64_t millis = (int64_t) fmillis; - int64_t days = millis/one_day; - int64_t hours = (millis - days*one_day)/one_hour; - int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min; - int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec; - - // to print int64_t either cast to (long long int) or use macro PRId64 from - if (days > 0) { - printf("%lldd ", (long long int) days); - } - printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds); -} - static void opt_callback(void * vdata, int accum_step, float * sched) { struct opt_callback_data * data = (struct opt_callback_data *) vdata; struct train_params * params = data->params; @@ -2210,16 +1366,15 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { // exclude file saving from time measurement, by measuring last_time after saving data->last_time = ggml_time_ms(); - *sched = (opt->iter < params->warmup) - ? 
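// ---- editor's sketch (not part of the patch): learning_schedule ----
// This hunk replaces the inline schedule with learning_schedule() from
// common/train.cpp. Reconstructed from the code being replaced, the helper
// composes a linear warmup ramp, cosine decay with optional restarts, and a
// floor of adam_min_alpha/adam_alpha (the demo name is hypothetical):
static float demo_learning_schedule(
        int iter, int warmup, int cos_decay_steps,
        float alpha, float min_alpha,
        float cos_decay_min, float restart_step_mult, bool enable_restart) {
    float sched = (iter < warmup)
        ? (float) iter / (float) warmup                 // linear warmup
        : cosine_decay_restart(cos_decay_steps, cos_decay_min,
                               iter - warmup, restart_step_mult, enable_restart);
    const float min_sched = min_alpha / alpha;          // schedule floor
    return min_sched + sched * (1.0f - min_sched);
}
// ---- end sketch ----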
(float) opt->iter / (float) params->warmup - : cosine_decay_restart( - params->cos_decay_steps, - params->cos_decay_min, - opt->iter - params->warmup, - params->cos_decay_restart, - params->enable_restart); - float min_sched = params->adam_min_alpha / params->adam_alpha; - *sched = min_sched + *sched * (1.0f - min_sched); + *sched = learning_schedule( + opt->iter, + params->warmup, + params->cos_decay_steps, + params->adam_alpha, + params->adam_min_alpha, + params->cos_decay_min, + params->cos_decay_restart, + params->enable_restart); int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); if (impr_plot > 0) impr_plot = 0; @@ -2247,16 +1402,16 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { printf("\n"); } - int used_samples = get_example_targets_batch( + int64_t used_samples = get_example_targets_batch( data->lctx, + data->tokens_input, + data->target_probs, + data->model->shuffle_next_sample, data->shuffled_samples_begin, data->shuffled_samples_size, data->samples_count, data->tokens_data, data->tokens_size, - data->model->shuffle_next_sample, - data->tokens_input, - data->target_probs, params->separate_with_eos, params->separate_with_bos, params->fill_with_next_samples); @@ -2270,32 +1425,16 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { data->model->shuffle_rng_state_current = data->model->shuffle_rng_state_next; data->model->shuffle_rng_state_next = shuffle_samples( data->model->shuffle_rng_state_current, - data->samples_begin, - data->samples_size, data->shuffled_samples_begin, data->shuffled_samples_size, + data->samples_begin, + data->samples_size, data->samples_count); data->model->shuffle_next_sample = 0; } } -static size_t hash_combine(size_t h1, size_t h2) { - return h1 ^ (h2 << 1); -} - -static size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) { - std::hash h_string; - std::hash h_ull; - size_t h = h_string(std::string(fn)); - h = hash_combine(h, h_ull((unsigned long long) sample_count)); - for (size_t i=0; i< sample_count; ++i) { - h = hash_combine(h, h_ull((unsigned long long) samples_begin[i])); - h = hash_combine(h, h_ull((unsigned long long) samples_size[i])); - } - return h; -} - int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); @@ -2435,10 +1574,10 @@ int main(int argc, char ** argv) { train_shuffled_samples_size.resize(train_samples_size.size()); model.shuffle_rng_state_next = shuffle_samples( model.shuffle_rng_state_current, - train_samples_begin.data(), - train_samples_size.data(), train_shuffled_samples_begin.data(), train_shuffled_samples_size.data(), + train_samples_begin.data(), + train_samples_size.data(), train_samples_size.size()); printf("%s: begin training\n", __func__); @@ -2516,17 +1655,15 @@ int main(int argc, char ** argv) { size_t used_mem_before_opt = ggml_used_mem(ctx0); - opt->params.adam.sched = (opt->iter < params.warmup) - ? 
(float) opt->iter / (float) params.warmup - : cosine_decay_restart( - params.cos_decay_steps, - params.cos_decay_min, - opt->iter - params.warmup, - params.cos_decay_restart, - params.enable_restart); - - float min_sched = params.adam_min_alpha / params.adam_alpha; - opt->params.adam.sched = min_sched + opt->params.adam.sched * (1.0f - min_sched); + opt->params.adam.sched = learning_schedule( + opt->iter, + params.warmup, + params.cos_decay_steps, + params.adam_alpha, + params.adam_min_alpha, + params.cos_decay_min, + params.cos_decay_restart, + params.enable_restart); printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); From a8c8907c627da33ca002d8b51910ce815f753b6b Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 16 Sep 2023 17:08:18 +0200 Subject: [PATCH 194/235] move train state into struct train_state --- common/train.cpp | 89 ++++++++ common/train.h | 24 +- examples/finetune/finetune.cpp | 207 ++++++----------- .../train-text-from-scratch.cpp | 212 ++++++------------ 4 files changed, 250 insertions(+), 282 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index a1e35e5a3d91d..c2b3f036b958f 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -17,6 +17,28 @@ struct random_uniform_distribution { std::uniform_real_distribution rd; }; +struct train_state * init_train_state(int seed) { + struct train_state * state = (struct train_state *) malloc(sizeof(struct train_state)); + memset(state, 0, sizeof(struct train_state)); + state->shuffle_rng_state_current = ""; + state->shuffle_rng_state_next = ""; + + state->opt = (struct ggml_opt_context *) malloc(sizeof(struct ggml_opt_context)); + memset(state->opt, 0, sizeof(struct ggml_opt_context)); + state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + + return state; +} + +void free_train_state(struct train_state * state) { + free(state->opt); + free(state); +} + +struct ggml_opt_context * get_train_state_opt(struct train_state * state) { + return state->opt; +} + struct random_normal_distribution * init_random_normal_distribution(int seed, float mean, float std, float min, float max) { struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution)); rnd->gen = std::mt19937(seed); @@ -472,6 +494,20 @@ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer. 
lbfgs.memory_ys";
 static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S  = "optimizer.lbfgs.memory_s";
 static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y  = "optimizer.lbfgs.memory_y";
 
+static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL     = "train_model";
+static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA   = "finetune_lora";
+static const char * LLM_KV_TRAINING_TYPE                 = "training.type";
+static const char * LLM_KV_TRAINING_FILE_VERSION         = "training.file_version";
+static const char * LLM_KV_TRAINING_ITERATION_COUNT      = "training.iteration_count";
+static const char * LLM_KV_TRAINING_SAMPLE_COUNT         = "training.sample_count";
+static const char * LLM_KV_TRAINING_TOKEN_COUNT          = "training.token_count";
+static const char * LLM_KV_TRAINING_EPOCH_COUNT          = "training.epoch_count";
+static const char * LLM_KV_TRAINING_SAMPLES_HASH         = "training.samples_hash";
+static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash";
+static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE    = "training.shuffle.rng_state";
+static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count";
+static const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE  = "training.shuffle.next_sample";
+
 #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
 { \
     const std::string skey(key); \
@@ -613,6 +649,59 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
     }
 }
 
+bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train) {
+    if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) < 0) {
+        return false;
+    }
+
+    uint32_t file_version;
+    GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION);
+    GGML_ASSERT(file_version <= 1);
+
+    std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA;
+    GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE);
+    GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA);
+
+    if (file_version == 0) {
+
+        GGUF_GET_KEY(fctx, train->train_its,     gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT);
+        GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT);
+        GGUF_GET_KEY(fctx, train->train_tokens,  gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT);
+
+    } else if (file_version == 1) {
+
+        GGUF_GET_KEY(fctx, train->train_its,     gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_ITERATION_COUNT);
+        GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_SAMPLE_COUNT);
+        GGUF_GET_KEY(fctx, train->train_tokens,  gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_TOKEN_COUNT);
+        GGUF_GET_KEY(fctx, train->train_epochs,  gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_EPOCH_COUNT);
+
+        GGUF_GET_KEY(fctx, train->shuffle_samples_hash,      gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH);
+        GGUF_GET_KEY(fctx, train->shuffle_rng_state_current, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_SHUFFLE_RNG_STATE);
+        GGUF_GET_KEY(fctx, train->shuffle_sample_count,      gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT);
+        GGUF_GET_KEY(fctx, train->shuffle_next_sample,       gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE);
+    }
+
+    load_opt_context_gguf(fctx, f_ggml_ctx, train->opt);
+    return true;
+}
+
+void save_train_state_gguf(struct gguf_context * fctx, struct
train_state * train) { + gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 1); + gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, train->train_its); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, train->train_samples); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT, train->train_tokens); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_EPOCH_COUNT, train->train_epochs); + + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH, (uint64_t) train->shuffle_samples_hash); + gguf_set_val_str(fctx, LLM_KV_TRAINING_SHUFFLE_RNG_STATE, train->shuffle_rng_state_current.c_str()); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT, (uint64_t) train->shuffle_sample_count); + gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE, (uint64_t) train->shuffle_next_sample); + + save_opt_context_gguf(fctx, train->opt); +} + + struct llama_file { // use FILE * so we don't have to re-open the file to mmap FILE * fp; diff --git a/common/train.h b/common/train.h index 9d629beb7095b..54edd0f4a2a1d 100644 --- a/common/train.h +++ b/common/train.h @@ -9,6 +9,26 @@ #include "ggml.h" #include "llama.h" +typedef std::string mt19937_state; + +struct train_state { + struct ggml_opt_context * opt; + + uint64_t train_its; + uint64_t train_samples; + uint64_t train_tokens; + uint64_t train_epochs; + + size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes) + mt19937_state shuffle_rng_state_current; + mt19937_state shuffle_rng_state_next; + size_t shuffle_sample_count; + size_t shuffle_next_sample; +}; + +struct train_state * init_train_state(int seed); +void free_train_state(struct train_state * state); + struct random_normal_distribution; struct random_uniform_distribution; @@ -58,7 +78,6 @@ int64_t get_example_targets_batch( bool separate_with_bos, bool fill_with_next_samples); -typedef std::string mt19937_state; void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state); mt19937_state mt19937_get_state(const std::mt19937& rng); @@ -111,3 +130,6 @@ void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, co void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt); void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt); +bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train); +void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train); + diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index ce6f28bad3e0e..58e96f186ff0b 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -148,34 +148,9 @@ struct my_llama_lora { struct ggml_tensor * output_b; std::vector layers; - - uint64_t train_its = 0; - uint64_t train_samples = 0; - uint64_t train_tokens = 0; - uint64_t train_epochs = 0; - - size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes) - std::string shuffle_rng_state_current; - std::string shuffle_rng_state_next; - size_t shuffle_sample_count; - size_t shuffle_next_sample; }; // gguf constants -static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; -static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; -static const char * LLM_KV_TRAINING_TYPE = "training.type"; -static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; -static 
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index ce6f28bad3e0e..58e96f186ff0b 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -148,34 +148,9 @@ struct my_llama_lora {
     struct ggml_tensor * output_b;

    std::vector<my_llama_lora_layer> layers;
-
-    uint64_t train_its = 0;
-    uint64_t train_samples = 0;
-    uint64_t train_tokens = 0;
-    uint64_t train_epochs = 0;
-
-    size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
-    std::string shuffle_rng_state_current;
-    std::string shuffle_rng_state_next;
-    size_t shuffle_sample_count;
-    size_t shuffle_next_sample;
 };

 // gguf constants
-static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model";
-static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora";
-static const char * LLM_KV_TRAINING_TYPE = "training.type";
-static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version";
-static const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count";
-static const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count";
-static const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count";
-static const char * LLM_KV_TRAINING_EPOCH_COUNT = "training.epoch_count";
-static const char * LLM_KV_TRAINING_SAMPLES_HASH = "training.samples_hash";
-static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash";
-static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE = "training.shuffle.rng_state";
-static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count";
-static const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE = "training.shuffle.next_sample";
-
 static const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd";
 static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm";
 static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output";
@@ -336,10 +311,6 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
     const uint32_t n_vocab = model->hparams.n_vocab;
     const uint32_t n_ff = model->hparams.n_ff;

-    lora->train_its = 0;
-    lora->train_samples = 0;
-    lora->train_tokens = 0;
-
     std::vector<char> tn_buf;
     tn_buf.resize(GGML_MAX_NAME);
     auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * {
@@ -869,8 +840,6 @@ static void load_default_lora_params_from_base_model(const char * fn_base_model,
     gguf_free(fctx);
 }

-
-
 static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) {
     // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
@@ -1021,58 +990,17 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
     }
 }

-static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) {
+static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
     load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora);
-
-    uint32_t file_version;
-    GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION);
-    GGML_ASSERT(file_version <= 1);
-
-    std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA;
-    GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE);
-    GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA);
-
-    if (file_version == 0) {
-
-        GGUF_GET_KEY(fctx, lora->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT);
-        GGUF_GET_KEY(fctx, lora->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT);
-        GGUF_GET_KEY(fctx, lora->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT);
-
-    } else if (file_version == 1) {
-
-        GGUF_GET_KEY(fctx, lora->train_its, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_ITERATION_COUNT);
-        GGUF_GET_KEY(fctx, lora->train_samples, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_SAMPLE_COUNT);
-        GGUF_GET_KEY(fctx, lora->train_tokens, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_TOKEN_COUNT);
-        GGUF_GET_KEY(fctx, lora->train_epochs,
gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_EPOCH_COUNT); - - GGUF_GET_KEY(fctx, lora->shuffle_samples_hash, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH); - GGUF_GET_KEY(fctx, lora->shuffle_rng_state_current, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_SHUFFLE_RNG_STATE); - GGUF_GET_KEY(fctx, lora->shuffle_sample_count, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT); - GGUF_GET_KEY(fctx, lora->shuffle_next_sample, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE); - } - - load_opt_context_gguf(fctx, f_ggml_ctx, opt); + load_train_state_gguf(fctx, f_ggml_ctx, train); } -static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { +static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { save_llama_lora_gguf(fctx, model, lora); - - gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 1); - gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA); - gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, lora->train_its); - gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, lora->train_samples); - gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT, lora->train_tokens); - gguf_set_val_u64(fctx, LLM_KV_TRAINING_EPOCH_COUNT, lora->train_epochs); - - gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH, (uint64_t) lora->shuffle_samples_hash); - gguf_set_val_str(fctx, LLM_KV_TRAINING_SHUFFLE_RNG_STATE, lora->shuffle_rng_state_current.c_str()); - gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT, (uint64_t) lora->shuffle_sample_count); - gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE, (uint64_t) lora->shuffle_next_sample); - - save_opt_context_gguf(fctx, opt); + save_train_state_gguf(fctx, train); } -static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt) { +static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { struct ggml_context * f_ggml_ctx; struct gguf_init_params params; params.no_alloc = false; @@ -1082,19 +1010,19 @@ static bool load_checkpoint_lora_file(const char * filename, struct my_llama_mod return false; } - load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, opt); + load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, train); gguf_free(fctx); return true; } -static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct ggml_opt_context * opt, const char * pattern_it, int iteration, const char * latest) { +static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train, const char * pattern_it, int iteration, const char * latest) { std::string sit = (iteration >= 0) ? 
std::to_string(iteration) : std::string(latest); std::string fn = replace_str(filename, pattern_it, sit.c_str()); printf("%s: saving to %s\n", __func__, fn.c_str()); struct gguf_context * fctx = gguf_init_empty(); - save_checkpoint_lora_gguf(fctx, model, lora, opt); + save_checkpoint_lora_gguf(fctx, model, lora, train); // write file const bool only_meta = false; @@ -1896,30 +1824,31 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par } struct opt_callback_data { - struct train_params * params; - struct ggml_opt_context * opt; - struct my_llama_model * model; - struct my_llama_lora * lora; - struct llama_context * lctx; - int last_save_iter; - llama_token * tokens_data; - size_t tokens_size; - size_t * samples_begin; - size_t * samples_size; - size_t * shuffled_samples_begin; - size_t * shuffled_samples_size; - size_t samples_count; - struct ggml_tensor * tokens_input; - struct ggml_tensor * target_probs; - int first_iter; - int64_t last_time; - double millis_per_iter; + struct train_params * params; + struct train_state * train; + struct my_llama_model * model; + struct my_llama_lora * lora; + struct llama_context * lctx; + int last_save_iter; + llama_token * tokens_data; + size_t tokens_size; + size_t * samples_begin; + size_t * samples_size; + size_t * shuffled_samples_begin; + size_t * shuffled_samples_size; + size_t samples_count; + struct ggml_tensor * tokens_input; + struct ggml_tensor * target_probs; + int first_iter; + int64_t last_time; + double millis_per_iter; }; static void opt_callback(void * vdata, int accum_step, float * sched) { - struct opt_callback_data * data = (struct opt_callback_data *) vdata; - struct train_params * params = data->params; - struct ggml_opt_context * opt = data->opt; + struct opt_callback_data * data = (struct opt_callback_data *) vdata; + struct train_params * params = data->params; + struct train_state * train = data->train; + struct ggml_opt_context * opt = train->opt; int n_batch = params->n_batch; int n_ctx = params->n_ctx; @@ -1948,13 +1877,13 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); if (save_now) { int new_iters = opt->iter - data->last_save_iter; - data->lora->train_its += new_iters; - data->lora->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; - data->lora->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx; + train->train_its += new_iters; + train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; + train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx; if (strlen(params->fn_checkpoint_out) > 0) { - save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, opt, params->pattern_fn_it, opt->iter, params->fn_latest); - save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, opt, params->pattern_fn_it, -1, params->fn_latest); + save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, train, params->pattern_fn_it, opt->iter, params->fn_latest); + save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, train, params->pattern_fn_it, -1, params->fn_latest); } if (strlen(params->fn_lora_out) > 0) { save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, opt->iter, params->fn_latest); @@ -1980,7 +1909,7 @@ static void opt_callback(void * vdata, int accum_step, 
float * sched) {
     if (impr_plot > 0) impr_plot = 0;
     if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
     printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
-        __func__, opt->iter, std::min(1+data->lora->shuffle_next_sample, data->lora->shuffle_sample_count), data->lora->shuffle_sample_count,
+        __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
         *sched,
         opt->loss_after);
@@ -2006,7 +1935,7 @@ static void opt_callback(void * vdata, int accum_step, float * sched) {
         data->lctx,
         data->tokens_input,
         data->target_probs,
-        data->lora->shuffle_next_sample,
+        train->shuffle_next_sample,
         data->shuffled_samples_begin,
         data->shuffled_samples_size,
         data->samples_count,
@@ -2016,21 +1945,21 @@ static void opt_callback(void * vdata, int accum_step, float * sched) {
         params->separate_with_bos,
         params->fill_with_next_samples);

-    data->lora->shuffle_next_sample += used_samples;
+    train->shuffle_next_sample += used_samples;

-    if (data->lora->shuffle_next_sample >= data->lora->shuffle_sample_count) {
-        ++data->lora->train_epochs;
-        printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) data->lora->train_epochs);
+    if (train->shuffle_next_sample >= train->shuffle_sample_count) {
+        ++train->train_epochs;
+        printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs);
         // note: we may have used some samples from the current shuffling more than once
-        data->lora->shuffle_rng_state_current = data->lora->shuffle_rng_state_next;
-        data->lora->shuffle_rng_state_next = shuffle_samples(
-            data->lora->shuffle_rng_state_current,
+        train->shuffle_rng_state_current = train->shuffle_rng_state_next;
+        train->shuffle_rng_state_next = shuffle_samples(
+            train->shuffle_rng_state_current,
             data->shuffled_samples_begin,
             data->shuffled_samples_size,
             data->samples_begin,
             data->samples_size,
             data->samples_count);
-        data->lora->shuffle_next_sample = 0;
+        train->shuffle_next_sample = 0;
     }
 }
@@ -2091,10 +2020,9 @@ int main(int argc, char ** argv) {
     init_model(lmodel, &model, params.n_ctx);

     struct my_llama_lora lora;

-    struct ggml_opt_context* opt = (struct ggml_opt_context*)alloca(sizeof(struct ggml_opt_context));
-    memset(opt, 0, sizeof(struct ggml_opt_context));
-    opt->ctx = NULL;
+    struct train_state * train = init_train_state(params.seed);
+    struct ggml_opt_context * opt = train->opt;

     load_default_lora_params_from_base_model(params.fn_model_base, &lora.hparams);
@@ -2157,7 +2085,7 @@ int main(int argc, char ** argv) {
     ggml_allocr * alloc = NULL;

     printf("%s: init model\n", __func__);
-    bool existed = load_checkpoint_lora_file(params.fn_checkpoint_in, &model, &lora, opt);
+    bool existed = load_checkpoint_lora_file(params.fn_checkpoint_in, &model, &lora, train);

     if (existed) {
         // overwrite last n_ctx with user provided n_ctx
@@ -2203,13 +2131,13 @@ int main(int argc, char ** argv) {
     print_params(&model.hparams);
     print_lora_params(&lora.hparams);
-    printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) lora.train_its);
-    printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) lora.train_samples);
-    printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) lora.train_tokens);
-    printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) lora.train_epochs);
+    printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its);
+    printf("%s: seen train_samples %llu\n", __func__,
(long long unsigned) train->train_samples);
+    printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
+    printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
     printf("%s: max_lora_size = %zu bytes (%.1f MB)\n", __func__, lora.data.size(), (float) lora.data.size() / (1024.0f*1024.0f));
     printf("%s: max_opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
-    opt->iter = lora.train_its;
+    opt->iter = train->train_its;

     if (params.only_write_lora) {
         if (strlen(params.fn_lora_out) > 0) {
@@ -2368,25 +2296,25 @@ int main(int argc, char ** argv) {
     printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens);

     size_t shuffle_samples_hash = compute_samples_hash(params.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size());
-    const bool changed_train_data = (shuffle_samples_hash != lora.shuffle_samples_hash) || (lora.shuffle_sample_count != train_samples_size.size());
+    const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size());
     if (changed_train_data) {
         printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__);
     }
     if (params.force_reshuffle) {
         printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__);
     }
-    if ((lora.shuffle_rng_state_current == "") || changed_train_data || params.force_reshuffle) {
-        lora.shuffle_rng_state_current = mt19937_seed_to_state(params.seed);
-        lora.shuffle_sample_count = train_samples_size.size();
-        lora.shuffle_next_sample = 0;
-        lora.shuffle_samples_hash = shuffle_samples_hash;
+    if ((train->shuffle_rng_state_current == "") || changed_train_data || params.force_reshuffle) {
+        train->shuffle_rng_state_current = mt19937_seed_to_state(params.seed);
+        train->shuffle_sample_count = train_samples_size.size();
+        train->shuffle_next_sample = 0;
+        train->shuffle_samples_hash = shuffle_samples_hash;
     }
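// compute_samples_hash above guards the resume logic: if the hash of the training
// data changes, the shuffling state from the checkpoint is discarded and a fresh
// shuffled epoch is started. A hedged sketch of such a hash, following the comment
// "fn, sample_count, *zip(sample_begins, sample_sizes)" on
// train_state::shuffle_samples_hash (the combine step and names here are
// illustrative assumptions, not the verified implementation):
//
// #include <cstddef>
// #include <functional>
// #include <string>
//
// static size_t sketch_hash_combine(size_t seed, size_t v) {
//     // boost-style combine; any well-mixing function would do here
//     return seed ^ (v + 0x9e3779b9 + (seed << 6) + (seed >> 2));
// }
//
// static size_t sketch_compute_samples_hash(
//         const char * fn, const size_t * begins, const size_t * sizes, size_t count) {
//     size_t h = std::hash<std::string>()(std::string(fn));
//     h = sketch_hash_combine(h, count);
//     for (size_t i = 0; i < count; ++i) {
//         h = sketch_hash_combine(h, begins[i]);
//         h = sketch_hash_combine(h, sizes[i]);
//     }
//     return h;
// }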
     std::vector<size_t> train_shuffled_samples_begin;
     std::vector<size_t> train_shuffled_samples_size;
     train_shuffled_samples_begin.resize(train_samples_begin.size());
     train_shuffled_samples_size.resize(train_samples_size.size());
-    lora.shuffle_rng_state_next = shuffle_samples(
-        lora.shuffle_rng_state_current,
+    train->shuffle_rng_state_next = shuffle_samples(
+        train->shuffle_rng_state_current,
         train_shuffled_samples_begin.data(),
         train_shuffled_samples_size.data(),
         train_samples_begin.data(),
@@ -2397,7 +2325,7 @@ int main(int argc, char ** argv) {

     struct opt_callback_data opt_cb_data;
     opt_cb_data.params = &params;
-    opt_cb_data.opt = opt;
+    opt_cb_data.train = train;
     opt_cb_data.model = &model;
     opt_cb_data.lora = &lora;
     opt_cb_data.lctx = lctx;
@@ -2442,13 +2370,13 @@ int main(int argc, char ** argv) {
     int new_iters = opt->iter - opt_cb_data.last_save_iter;
     if (new_iters > 0) {
-        lora.train_its += new_iters;
-        lora.train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
-        lora.train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;
+        train->train_its += new_iters;
+        train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
+        train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;

         if (strlen(params.fn_checkpoint_out) > 0) {
-            save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, opt, params.pattern_fn_it, opt->iter, params.fn_latest);
-            save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, opt, params.pattern_fn_it, -1, params.fn_latest);
+            save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, train, params.pattern_fn_it, opt->iter, params.fn_latest);
+            save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, train, params.pattern_fn_it, -1, params.fn_latest);
         }
         if (strlen(params.fn_lora_out) > 0) {
             save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, opt->iter, params.fn_latest);
@@ -2458,6 +2386,7 @@ int main(int argc, char ** argv) {
     }
     ggml_free(opt->ctx);
+    free_train_state(train);
     ggml_free(lora.ctx);
     llama_free(lctx);
     llama_free_model(lmodel);
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 63edcf9ef397b..bead80843e890 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -65,34 +65,8 @@ struct my_llama_model {
     struct ggml_tensor * output;

     std::vector<my_llama_layer> layers;
-
-    uint64_t train_its = 0;
-    uint64_t train_samples = 0;
-    uint64_t train_tokens = 0;
-    uint64_t train_epochs = 0;
-
-    size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
-    std::string shuffle_rng_state_current;
-    std::string shuffle_rng_state_next;
-    size_t shuffle_sample_count;
-    size_t shuffle_next_sample;
 };

-// gguf constants
-static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model";
-static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora";
-static const char * LLM_KV_TRAINING_TYPE = "training.type";
-static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version";
-static const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count";
-static const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count";
-static const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count";
-static const char * LLM_KV_TRAINING_EPOCH_COUNT = "training.epoch_count";
-static const char * LLM_KV_TRAINING_SAMPLES_HASH = "training.samples_hash";
-static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash";
-static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE = "training.shuffle.rng_state";
-static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count";
-static const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE = "training.shuffle.next_sample";
-
 // gguf constants (sync with gguf.py)

 static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture";
@@ -152,11 +126,6 @@ static void init_model(struct my_llama_model * model) {

     struct ggml_context * ctx = model->ctx;

-    model->train_its = 0;
-    model->train_samples = 0;
-    model->train_tokens = 0;
-    model->train_epochs = 0;
-
     std::vector<char> tn_buf;
     tn_buf.resize(GGML_MAX_NAME);
     auto tn = [&tn_buf](const char * key) -> const char * {
@@ -685,62 +654,19 @@ static void save_llama_model_file(const char * filename, const char * fn_vocab_m
     gguf_free(fctx);
 }

-static void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct ggml_opt_context * opt) {
+static void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct train_state * train) {
     load_llama_model_gguf(fctx, f_ggml_ctx, model);
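// The version handling removed below moves into common/train.cpp, but the pattern
// survives unchanged: file_version 0 stored the counters as u32, version 1 stores
// them as u64, so old checkpoints remain readable. A reduced sketch of that
// pattern using the public gguf getters (helper name and error handling are
// illustrative assumptions, not code from this patch):
//
// #include <stdint.h>
// #include "ggml.h"
//
// static uint64_t sketch_read_iteration_count(struct gguf_context * fctx, uint32_t file_version) {
//     const int k = gguf_find_key(fctx, "training.iteration_count");
//     if (k < 0) {
//         return 0; // key missing: treat as a fresh checkpoint
//     }
//     return (file_version == 0)
//         ? (uint64_t) gguf_get_val_u32(fctx, k) // old 32-bit counter
//         : gguf_get_val_u64(fctx, k);           // new 64-bit counter
// }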
-
-    if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) >= 0) {
-        uint32_t file_version = 0xFFFFFFFFu;
-        GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION);
-        GGML_ASSERT(file_version <= 1);
-
-        std::string train_type = LLM_KV_TRAINING_TYPE_TRAIN_MODEL;
-        GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE);
-        GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_TRAIN_MODEL);
-
-        if (file_version == 0) {
-
-            GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT);
-            GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT);
-            GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT);
-
-        } else if (file_version == 1) {
-
-            GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_ITERATION_COUNT);
-            GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_SAMPLE_COUNT);
-            GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_TOKEN_COUNT);
-            GGUF_GET_KEY(fctx, model->train_epochs, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_EPOCH_COUNT);
-
-            GGUF_GET_KEY(fctx, model->shuffle_samples_hash, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH);
-            GGUF_GET_KEY(fctx, model->shuffle_rng_state_current, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_SHUFFLE_RNG_STATE);
-            GGUF_GET_KEY(fctx, model->shuffle_sample_count, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT);
-            GGUF_GET_KEY(fctx, model->shuffle_next_sample, gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE);
-        }
-
-        load_opt_context_gguf(fctx, f_ggml_ctx, opt);
-    } else {
+    if (!load_train_state_gguf(fctx, f_ggml_ctx, train)) {
         printf("%s: loaded llama model as checkpoint\n", __func__);
     }
 }

-static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) {
+static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) {
     save_llama_model_gguf(fctx, fn_vocab_model, model);
-
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 1);
-    gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, model->train_its);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, model->train_samples);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT, model->train_tokens);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_EPOCH_COUNT, model->train_epochs);
-
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH, (uint64_t) model->shuffle_samples_hash);
-    gguf_set_val_str(fctx, LLM_KV_TRAINING_SHUFFLE_RNG_STATE, model->shuffle_rng_state_current.c_str());
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT, (uint64_t) model->shuffle_sample_count);
-    gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE, (uint64_t) model->shuffle_next_sample);
-
-    save_opt_context_gguf(fctx, opt);
+    save_train_state_gguf(fctx, train);
 }

-static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct ggml_opt_context * opt) {
+static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct train_state * train) {
     struct ggml_context * f_ggml_ctx;
     struct gguf_init_params
params; params.no_alloc = false; @@ -750,18 +676,18 @@ static bool load_checkpoint_file(const char * filename, struct my_llama_model * return false; } - load_checkpoint_gguf(fctx, f_ggml_ctx, model, opt); + load_checkpoint_gguf(fctx, f_ggml_ctx, model, train); return true; } -static void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt, const char * pattern_it, int iteration, const char * latest) { +static void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train, const char * pattern_it, int iteration, const char * latest) { std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest); std::string fn = replace_str(filename, pattern_it, sit.c_str()); printf("%s: saving to %s\n", __func__, fn.c_str()); struct gguf_context * fctx = gguf_init_empty(); - save_checkpoint_gguf(fctx, fn_vocab_model, model, opt); + save_checkpoint_gguf(fctx, fn_vocab_model, model, train); // write file const bool only_meta = false; @@ -1295,30 +1221,31 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par } struct opt_callback_data { - struct train_params * params; - struct ggml_opt_context * opt; - struct my_llama_model * model; - struct llama_context * lctx; - int last_save_iter; - llama_token * tokens_data; - size_t tokens_size; - size_t * samples_begin; - size_t * samples_size; - size_t * shuffled_samples_begin; - size_t * shuffled_samples_size; - size_t samples_count; - struct ggml_tensor * tokens_input; - struct ggml_tensor * target_logits; - struct ggml_tensor * target_probs; - int first_iter; - int64_t last_time; - double millis_per_iter; + struct train_params * params; + struct train_state * train; + struct my_llama_model * model; + struct llama_context * lctx; + int last_save_iter; + llama_token * tokens_data; + size_t tokens_size; + size_t * samples_begin; + size_t * samples_size; + size_t * shuffled_samples_begin; + size_t * shuffled_samples_size; + size_t samples_count; + struct ggml_tensor * tokens_input; + struct ggml_tensor * target_logits; + struct ggml_tensor * target_probs; + int first_iter; + int64_t last_time; + double millis_per_iter; }; static void opt_callback(void * vdata, int accum_step, float * sched) { - struct opt_callback_data * data = (struct opt_callback_data *) vdata; - struct train_params * params = data->params; - struct ggml_opt_context * opt = data->opt; + struct opt_callback_data * data = (struct opt_callback_data *) vdata; + struct train_params * params = data->params; + struct train_state * train = data->train; + struct ggml_opt_context * opt = train->opt; int n_batch = params->n_batch; int n_ctx = params->n_ctx; @@ -1347,13 +1274,13 @@ static void opt_callback(void * vdata, int accum_step, float * sched) { const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); if (save_now) { int new_iters = opt->iter - data->last_save_iter; - data->model->train_its += new_iters; - data->model->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; - data->model->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx; + train->train_its += new_iters; + train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; + train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx; if (strlen(params->fn_checkpoint_out) > 0) { - 
save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, opt, params->pattern_fn_it, opt->iter, params->fn_latest);
-            save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, opt, params->pattern_fn_it, -1, params->fn_latest);
+            save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, train, params->pattern_fn_it, opt->iter, params->fn_latest);
+            save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, train, params->pattern_fn_it, -1, params->fn_latest);
         }

         if (strlen(params->fn_model_out) > 0) {
@@ -1380,7 +1307,7 @@ static void opt_callback(void * vdata, int accum_step, float * sched) {
     if (impr_plot > 0) impr_plot = 0;
     if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
     printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
-        __func__, opt->iter, std::min(1+data->model->shuffle_next_sample, data->model->shuffle_sample_count), data->model->shuffle_sample_count,
+        __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
         *sched,
         opt->loss_after);
@@ -1406,7 +1333,7 @@ static void opt_callback(void * vdata, int accum_step, float * sched) {
         data->lctx,
         data->tokens_input,
         data->target_probs,
-        data->model->shuffle_next_sample,
+        train->shuffle_next_sample,
         data->shuffled_samples_begin,
         data->shuffled_samples_size,
         data->samples_count,
@@ -1416,21 +1343,21 @@ static void opt_callback(void * vdata, int accum_step, float * sched) {
         params->separate_with_bos,
         params->fill_with_next_samples);

-    data->model->shuffle_next_sample += used_samples;
+    train->shuffle_next_sample += used_samples;

-    if (data->model->shuffle_next_sample >= data->model->shuffle_sample_count) {
-        ++data->model->train_epochs;
-        printf("%s: reshuffle samples.
completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs); // note: we may have used some samples from the current shuffling more than once - data->model->shuffle_rng_state_current = data->model->shuffle_rng_state_next; - data->model->shuffle_rng_state_next = shuffle_samples( - data->model->shuffle_rng_state_current, + train->shuffle_rng_state_current = train->shuffle_rng_state_next; + train->shuffle_rng_state_next = shuffle_samples( + train->shuffle_rng_state_current, data->shuffled_samples_begin, data->shuffled_samples_size, data->samples_begin, data->samples_size, data->samples_count); - data->model->shuffle_next_sample = 0; + train->shuffle_next_sample = 0; } } @@ -1480,8 +1407,8 @@ int main(int argc, char ** argv) { int n_vocab = model.hparams.n_vocab; int n_batch = params.n_batch; - struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); - memset(opt, 0, sizeof(struct ggml_opt_context)); + struct train_state * train = init_train_state(params.seed); + struct ggml_opt_context * opt = train->opt; struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); opt_params_adam.print_forward_graph = false; @@ -1505,7 +1432,7 @@ int main(int argc, char ** argv) { opt->params = opt_params_adam; printf("%s: init model\n", __func__); - bool existed = load_checkpoint_file(params.fn_checkpoint_in, &model, opt); + bool existed = load_checkpoint_file(params.fn_checkpoint_in, &model, train); if (!existed) { init_model(&model); } @@ -1513,7 +1440,7 @@ int main(int argc, char ** argv) { opt->params = opt_params_adam; - opt->iter = model.train_its; + opt->iter = train->train_its; printf("%s: opt iter %d\n", __func__, opt->iter); bool from_scratch = !existed; @@ -1555,25 +1482,25 @@ int main(int argc, char ** argv) { printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size()); size_t shuffle_samples_hash = compute_samples_hash(params.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size()); - const bool changed_train_data = (shuffle_samples_hash != model.shuffle_samples_hash) || (model.shuffle_sample_count != train_samples_size.size()); + const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size()); if (changed_train_data) { printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__); } if (params.force_reshuffle) { printf("%s: forced reshuffling of data. 
restarting with newly shuffled epoch.\n", __func__);
     }
-    if ((model.shuffle_rng_state_current == "") || changed_train_data || params.force_reshuffle) {
-        model.shuffle_rng_state_current = mt19937_seed_to_state(params.seed);
-        model.shuffle_sample_count = train_samples_size.size();
-        model.shuffle_next_sample = 0;
-        model.shuffle_samples_hash = shuffle_samples_hash;
+    if ((train->shuffle_rng_state_current == "") || changed_train_data || params.force_reshuffle) {
+        train->shuffle_rng_state_current = mt19937_seed_to_state(params.seed);
+        train->shuffle_sample_count = train_samples_size.size();
+        train->shuffle_next_sample = 0;
+        train->shuffle_samples_hash = shuffle_samples_hash;
     }
     std::vector<size_t> train_shuffled_samples_begin;
     std::vector<size_t> train_shuffled_samples_size;
     train_shuffled_samples_begin.resize(train_samples_begin.size());
     train_shuffled_samples_size.resize(train_samples_size.size());
-    model.shuffle_rng_state_next = shuffle_samples(
-        model.shuffle_rng_state_current,
+    train->shuffle_rng_state_next = shuffle_samples(
+        train->shuffle_rng_state_current,
         train_shuffled_samples_begin.data(),
         train_shuffled_samples_size.data(),
         train_samples_begin.data(),
@@ -1583,7 +1510,7 @@ int main(int argc, char ** argv) {

     struct opt_callback_data opt_cb_data;
     opt_cb_data.params = &params;
-    opt_cb_data.opt = opt;
+    opt_cb_data.train = train;
     opt_cb_data.model = &model;
     opt_cb_data.lctx = lctx;
     opt_cb_data.last_save_iter = opt->iter;
@@ -1594,9 +1521,9 @@ int main(int argc, char ** argv) {
     opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data();
     opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data();
     opt_cb_data.samples_count = train_samples_size.size();
-    opt_cb_data.tokens_input = NULL;
-    opt_cb_data.target_logits = NULL;
-    opt_cb_data.target_probs = NULL;
+    opt_cb_data.tokens_input  = NULL;
+    opt_cb_data.target_logits = NULL;
+    opt_cb_data.target_probs  = NULL;
     opt_cb_data.first_iter = opt->iter;
     opt_cb_data.last_time = ggml_time_ms();
     opt_cb_data.millis_per_iter = 0.0;
@@ -1672,9 +1599,9 @@ int main(int argc, char ** argv) {
     size_t used_mem_after_opt = ggml_used_mem(ctx0);

     int n_iter = params.adam_n_iter;
-    model.train_its = opt->iter;
-    model.train_samples += n_batch * n_iter;
-    model.train_tokens += n_batch * n_tokens * n_iter;
+    train->train_its = opt->iter;
+    train->train_samples += n_batch * n_iter;
+    train->train_tokens += n_batch * n_tokens * n_iter;

     if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) {
         printf("Example %d, opt iter %d\n", ex, opt->iter);
@@ -1693,13 +1620,13 @@ int main(int argc, char ** argv) {
     printf("%s: total training time=%f seconds\n", __func__, dd);

     int new_iters = opt->iter - opt_cb_data.last_save_iter;
-    model.train_its += new_iters;
-    model.train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
-    model.train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;
+    train->train_its += new_iters;
+    train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
+    train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;

     if (params.n_examples > 0) {
-        save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt, params.pattern_fn_it, opt->iter, params.fn_latest);
-        save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt, params.pattern_fn_it, -1, params.fn_latest);
+        save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, train, params.pattern_fn_it,
opt->iter, params.fn_latest); + save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, train, params.pattern_fn_it, -1, params.fn_latest); } if (strlen(params.fn_model_out) > 0) { @@ -1715,6 +1642,7 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; + free_train_state(train); ggml_free(model.ctx); llama_free(lctx); llama_free_model(lmodel); From ee27333b161cce87eed3eeb1279206a2a5f2af64 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 16 Sep 2023 17:50:16 +0200 Subject: [PATCH 195/235] move train data saving code into callback to unify code of opt_callback train_params are still different in finetune and train-text-from-scratch, so it can't yet be moved to train.h|cpp --- common/train.cpp | 5 + common/train.h | 3 + examples/finetune/finetune.cpp | 126 ++++++++++------- .../train-text-from-scratch.cpp | 130 ++++++++++-------- 4 files changed, 157 insertions(+), 107 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index c2b3f036b958f..81039e5eb948a 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1001,3 +1001,8 @@ size_t tokenize_file( return out_tokens.size(); } + +std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration) { + std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest); + return replace_str(filename, pattern_it, sit.c_str()); +} diff --git a/common/train.h b/common/train.h index 54edd0f4a2a1d..59004a87cdec5 100644 --- a/common/train.h +++ b/common/train.h @@ -133,3 +133,6 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train); void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train); +std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration); + +typedef void (*save_train_files_callback)(void * data, struct train_state * train); diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 58e96f186ff0b..5c787e94e4849 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1016,17 +1016,15 @@ static bool load_checkpoint_lora_file(const char * filename, struct my_llama_mod return true; } -static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train, const char * pattern_it, int iteration, const char * latest) { - std::string sit = (iteration >= 0) ? 
std::to_string(iteration) : std::string(latest); - std::string fn = replace_str(filename, pattern_it, sit.c_str()); - printf("%s: saving to %s\n", __func__, fn.c_str()); +static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { + printf("%s: saving to %s\n", __func__, filename); struct gguf_context * fctx = gguf_init_empty(); save_checkpoint_lora_gguf(fctx, model, lora, train); // write file const bool only_meta = false; - gguf_write_to_file(fctx, fn.c_str(), only_meta); + gguf_write_to_file(fctx, filename, only_meta); gguf_free(fctx); } @@ -1139,11 +1137,9 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, file->write_raw(tensor->data, ggml_nbytes(tensor)); } -static void save_as_llama_lora(struct my_llama_lora * lora, const char * filename, const char * pattern_it, int iteration, const char * latest) { - std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest); - std::string fn = replace_str(filename, pattern_it, sit.c_str()); - printf("%s: saving to %s\n", __func__, fn.c_str()); - struct llama_file file(fn.c_str(), "wb"); +static void save_as_llama_lora(const char * filename, struct my_llama_lora * lora) { + printf("%s: saving to %s\n", __func__, filename); + struct llama_file file(filename, "wb"); if (file.fp == NULL) { return; } @@ -1823,25 +1819,49 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par return true; } -struct opt_callback_data { - struct train_params * params; - struct train_state * train; +struct save_train_files_data { + const char * fn_checkpoint_out; + const char * fn_lora_out; + const char * pattern_fn_it; + const char * fn_latest; struct my_llama_model * model; struct my_llama_lora * lora; - struct llama_context * lctx; - int last_save_iter; - llama_token * tokens_data; - size_t tokens_size; - size_t * samples_begin; - size_t * samples_size; - size_t * shuffled_samples_begin; - size_t * shuffled_samples_size; - size_t samples_count; - struct ggml_tensor * tokens_input; - struct ggml_tensor * target_probs; - int first_iter; - int64_t last_time; - double millis_per_iter; +}; + +static void save_train_files(void * vdata, struct train_state * train) { + struct save_train_files_data * data = (struct save_train_files_data *) vdata; + + int64_t iter = train->opt->iter; + + if (strlen(data->fn_checkpoint_out) > 0) { + save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->model, data->lora, train); + save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->model, data->lora, train); + } + if (strlen(data->fn_lora_out) > 0) { + save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->lora); + save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->lora); + } +} + +struct opt_callback_data { + struct train_params * params; + struct train_state * train; + save_train_files_callback save_cb; + void * save_data; + struct llama_context * lctx; + int last_save_iter; + llama_token * tokens_data; + size_t tokens_size; + size_t * samples_begin; + size_t * samples_size; + size_t * shuffled_samples_begin; + size_t * shuffled_samples_size; + size_t samples_count; + struct ggml_tensor * tokens_input; + struct ggml_tensor * target_probs; + int 
first_iter;
+    int64_t last_time;
+    double millis_per_iter;
 };

 static void opt_callback(void * vdata, int accum_step, float * sched) {
@@ -1881,14 +1901,10 @@ static void opt_callback(void * vdata, int accum_step, float * sched) {
         train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
         train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx;

-        if (strlen(params->fn_checkpoint_out) > 0) {
-            save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, train, params->pattern_fn_it, opt->iter, params->fn_latest);
-            save_checkpoint_lora_file(params->fn_checkpoint_out, data->model, data->lora, train, params->pattern_fn_it, -1, params->fn_latest);
-        }
-        if (strlen(params->fn_lora_out) > 0) {
-            save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, opt->iter, params->fn_latest);
-            save_as_llama_lora(data->lora, params->fn_lora_out, params->pattern_fn_it, -1, params->fn_latest);
+        if (data->save_cb) {
+            data->save_cb(data->save_data, train);
         }
+
         data->last_save_iter = opt->iter;
     }
@@ -2140,10 +2156,17 @@ int main(int argc, char ** argv) {
     opt->iter = train->train_its;

     if (params.only_write_lora) {
-        if (strlen(params.fn_lora_out) > 0) {
-            save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, opt->iter, params.fn_latest);
-            save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, -1, params.fn_latest);
-        }
+        save_train_files_data save_data;
+        save_data.fn_checkpoint_out = "";
+        save_data.fn_lora_out = params.fn_lora_out;
+        save_data.pattern_fn_it = params.pattern_fn_it;
+        save_data.fn_latest = params.fn_latest;
+        save_data.model = &model;
+        save_data.lora = &lora;
+
+        save_train_files(&save_data, train);
+
         free_train_state(train);
         ggml_free(lora.ctx);
         llama_free(lctx);
         llama_free_model(lmodel);
@@ -2323,12 +2346,20 @@ int main(int argc, char ** argv) {

     printf("%s: begin training\n", __func__);

+    save_train_files_data save_data;
+    save_data.fn_checkpoint_out = params.fn_checkpoint_out;
+    save_data.fn_lora_out = params.fn_lora_out;
+    save_data.pattern_fn_it = params.pattern_fn_it;
+    save_data.fn_latest = params.fn_latest;
+    save_data.model = &model;
+    save_data.lora = &lora;
+
     struct opt_callback_data opt_cb_data;
-    opt_cb_data.params = &params;
-    opt_cb_data.train = train;
-    opt_cb_data.model = &model;
-    opt_cb_data.lora = &lora;
-    opt_cb_data.lctx = lctx;
+    opt_cb_data.params    = &params;
+    opt_cb_data.train     = train;
+    opt_cb_data.save_cb   = &save_train_files;
+    opt_cb_data.save_data = &save_data;
+    opt_cb_data.lctx      = lctx;
     opt_cb_data.last_save_iter = opt->iter;
     opt_cb_data.tokens_data = train_tokens.data();
     opt_cb_data.tokens_size = train_tokens.size();
@@ -2374,14 +2405,7 @@ int main(int argc, char ** argv) {
         train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
         train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;

-        if (strlen(params.fn_checkpoint_out) > 0) {
-            save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, train, params.pattern_fn_it, opt->iter, params.fn_latest);
-            save_checkpoint_lora_file(params.fn_checkpoint_out, &model, &lora, train, params.pattern_fn_it, -1, params.fn_latest);
-        }
-        if (strlen(params.fn_lora_out) > 0) {
-            save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, opt->iter, params.fn_latest);
-            save_as_llama_lora(&lora, params.fn_lora_out, params.pattern_fn_it, -1, params.fn_latest);
-        }
+        save_train_files(&save_data, train);

         opt_cb_data.last_save_iter = opt->iter;
     }
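With save_cb and save_data in place, opt_callback no longer knows anything about checkpoint or LoRA files; it only bumps the counters and invokes whatever save routine was registered. A minimal self-contained sketch of this function-pointer pattern (the types are reduced stand-ins, not the real structs):

#include <cstdio>
#include <cstdint>

struct train_state_stub { int64_t iter; }; // stand-in for struct train_state
typedef void (*save_train_files_callback_stub)(void * data, struct train_state_stub * train);

struct save_files_data_stub { const char * fn_out; };

static void save_files_stub(void * vdata, struct train_state_stub * train) {
    struct save_files_data_stub * data = (struct save_files_data_stub *) vdata;
    // in the real code, get_train_filename would expand e.g.
    // "checkpoint-ITERATION.gguf" into "checkpoint-42.gguf" here
    printf("saving %s at iteration %lld\n", data->fn_out, (long long) train->iter);
}

int main() {
    struct train_state_stub train = { 42 };
    struct save_files_data_stub data = { "checkpoint-ITERATION.gguf" };
    save_train_files_callback_stub save_cb = &save_files_stub;
    if (save_cb) {              // same NULL-guard as in opt_callback above
        save_cb(&data, &train);
    }
    return 0;
}

The void * indirection keeps common/train.h free of any dependency on the example-specific model and LoRA structs.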
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index bead80843e890..7984dd7241cd3 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -640,17 +640,15 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo } } -static void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, const char * pattern_it, int iteration, const char * latest) { - std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest); - std::string fn = replace_str(filename, pattern_it, sit.c_str()); - printf("%s: saving to %s\n", __func__, fn.c_str()); +static void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) { + printf("%s: saving to %s\n", __func__, filename); struct gguf_context * fctx = gguf_init_empty(); save_llama_model_gguf(fctx, fn_vocab_model, model); // write file const bool only_meta = false; - gguf_write_to_file(fctx, fn.c_str(), only_meta); + gguf_write_to_file(fctx, filename, only_meta); gguf_free(fctx); } @@ -681,17 +679,15 @@ static bool load_checkpoint_file(const char * filename, struct my_llama_model * return true; } -static void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train, const char * pattern_it, int iteration, const char * latest) { - std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest); - std::string fn = replace_str(filename, pattern_it, sit.c_str()); - printf("%s: saving to %s\n", __func__, fn.c_str()); +static void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) { + printf("%s: saving to %s\n", __func__, filename); struct gguf_context * fctx = gguf_init_empty(); save_checkpoint_gguf(fctx, fn_vocab_model, model, train); // write file const bool only_meta = false; - gguf_write_to_file(fctx, fn.c_str(), only_meta); + gguf_write_to_file(fctx, filename, only_meta); gguf_free(fctx); } @@ -1220,25 +1216,50 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par return true; } -struct opt_callback_data { - struct train_params * params; - struct train_state * train; +struct save_train_files_data { + const char * fn_checkpoint_out; + const char * fn_model_out; + const char * fn_vocab_model; + const char * pattern_fn_it; + const char * fn_latest; struct my_llama_model * model; - struct llama_context * lctx; - int last_save_iter; - llama_token * tokens_data; - size_t tokens_size; - size_t * samples_begin; - size_t * samples_size; - size_t * shuffled_samples_begin; - size_t * shuffled_samples_size; - size_t samples_count; - struct ggml_tensor * tokens_input; - struct ggml_tensor * target_logits; - struct ggml_tensor * target_probs; - int first_iter; - int64_t last_time; - double millis_per_iter; +}; + +static void save_train_files(void * vdata, struct train_state * train) { + struct save_train_files_data * data = (struct save_train_files_data *) vdata; + int64_t iter = train->opt->iter; + + if (strlen(data->fn_checkpoint_out) > 0) { + save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model, train); + 
save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->fn_vocab_model, data->model, train);
+
+    }
+    if (strlen(data->fn_model_out) > 0) {
+        save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model);
+        save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->fn_vocab_model, data->model);
+    }
+}
+
+struct opt_callback_data {
+    struct train_params * params;
+    struct train_state * train;
+    save_train_files_callback save_cb;
+    void * save_data;
+    struct llama_context * lctx;
+    int last_save_iter;
+    llama_token * tokens_data;
+    size_t tokens_size;
+    size_t * samples_begin;
+    size_t * samples_size;
+    size_t * shuffled_samples_begin;
+    size_t * shuffled_samples_size;
+    size_t samples_count;
+    struct ggml_tensor * tokens_input;
+    struct ggml_tensor * target_logits;
+    struct ggml_tensor * target_probs;
+    int first_iter;
+    int64_t last_time;
+    double millis_per_iter;
 };

 static void opt_callback(void * vdata, int accum_step, float * sched) {
-    struct opt_callback_data * data = (struct opt_callback_data *) vdata;
-    struct train_params * params = data->params;
-    struct train_state * train = data->train;
-    struct ggml_opt_context * opt = train->opt;
+    struct opt_callback_data * data = (struct opt_callback_data *) vdata;
+    struct train_params * params = data->params;
+    struct train_state * train = data->train;
+    struct ggml_opt_context * opt = train->opt;

     int n_batch = params->n_batch;
     int n_ctx = params->n_ctx;
@@ -1278,15 +1299,10 @@ static void opt_callback(void * vdata, int accum_step, float * sched) {
         train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
         train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx;

-        if (strlen(params->fn_checkpoint_out) > 0) {
-            save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, train, params->pattern_fn_it, opt->iter, params->fn_latest);
-            save_checkpoint_file(params->fn_checkpoint_out, params->fn_vocab_model, data->model, train, params->pattern_fn_it, -1, params->fn_latest);
-
-        }
-        if (strlen(params->fn_model_out) > 0) {
+        if (data->save_cb) {
+            data->save_cb(data->save_data, train);
         }
+
         data->last_save_iter = opt->iter;
     }
@@ -1508,14 +1524,23 @@ int main(int argc, char ** argv) {
             train_samples_size.size());

     printf("%s: begin training\n", __func__);

+    save_train_files_data save_data;
+    save_data.fn_checkpoint_out = params.fn_checkpoint_out;
+    save_data.fn_model_out = params.fn_model_out;
+    save_data.fn_vocab_model = params.fn_vocab_model;
+    save_data.pattern_fn_it = params.pattern_fn_it;
+    save_data.fn_latest = params.fn_latest;
+    save_data.model = &model;
+
     struct opt_callback_data opt_cb_data;
-    opt_cb_data.params = &params;
-    opt_cb_data.train = train;
-    opt_cb_data.model = &model;
-    opt_cb_data.lctx = lctx;
-    opt_cb_data.last_save_iter = opt->iter;
-    opt_cb_data.tokens_data = train_tokens.data();
-    opt_cb_data.tokens_size = train_tokens.size();
+    opt_cb_data.params    = &params;
+    opt_cb_data.train     = train;
+    opt_cb_data.save_cb   = &save_train_files;
+    opt_cb_data.save_data = &save_data;
+    opt_cb_data.lctx      = lctx;
+    opt_cb_data.last_save_iter = opt->iter;
+    opt_cb_data.tokens_data = train_tokens.data();
+    opt_cb_data.tokens_size = train_tokens.size();
     opt_cb_data.samples_begin = train_samples_begin.data();
     opt_cb_data.samples_size = train_samples_size.data();
     opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data();
@@ -1645,22 +1670,15 @@ int main(int argc, char ** argv) {
     printf("%s: total training time=%f seconds\n", __func__, dd);

     int new_iters =
opt->iter - opt_cb_data.last_save_iter; - train->train_its += new_iters; - train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; - train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; - - if (params.n_examples > 0) { - save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, train, params.pattern_fn_it, opt->iter, params.fn_latest); - save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, train, params.pattern_fn_it, -1, params.fn_latest); - } + if (new_iters > 0) { + train->train_its += new_iters; + train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; + train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; - if (strlen(params.fn_model_out) > 0) { - save_llama_model_file(params.fn_model_out, params.fn_vocab_model, &model, params.pattern_fn_it, opt->iter, params.fn_latest); - save_llama_model_file(params.fn_model_out, params.fn_vocab_model, &model, params.pattern_fn_it, -1, params.fn_latest); + save_train_files(&save_data, train); + opt_cb_data.last_save_iter = opt->iter; } - opt_cb_data.last_save_iter = opt->iter; - if (alloc) { ggml_allocr_free(alloc); } From e9758ae1d2f1e43db7fee588dcc9c2d040e7da5e Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 16 Sep 2023 18:45:59 +0200 Subject: [PATCH 196/235] move common train params into common/train --- common/train.cpp | 323 ++++++++++++ common/train.h | 60 +++ examples/finetune/finetune.cpp | 499 +++--------------- .../train-text-from-scratch.cpp | 487 +++-------------- 4 files changed, 550 insertions(+), 819 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index 81039e5eb948a..d22d4b0361770 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1006,3 +1006,326 @@ std::string get_train_filename(const char * filename, const char * pattern_it, c std::string sit = (iteration >= 0) ? 
std::to_string(iteration) : std::string(latest); return replace_str(filename, pattern_it, sit.c_str()); } + +struct train_params_common get_default_train_params_common() { + struct train_params_common params; + params.fn_train_data = "shakespeare.txt"; + params.fn_checkpoint_in = "checkpoint.gguf"; + params.fn_checkpoint_out = "checkpoint-ITERATION.gguf"; + params.pattern_fn_it = "ITERATION"; + params.fn_latest = "LATEST"; + + params.print_usage = false; + + params.save_every = 10; + + params.seed = -1; + + params.n_ctx = 128; + params.n_threads = 6; + params.n_batch = 8; + params.n_gradient_accumulation = 1; + + params.custom_n_ctx = false; + + params.use_flash = true; + params.use_checkpointing = true; + + params.sample_start = ""; + params.include_sample_start = false; + params.escape = false; + params.overlapping_samples = false; + params.fill_with_next_samples = false; + params.separate_with_eos = false; + params.separate_with_bos = true; + params.force_reshuffle = false; + + params.opt_past = 0; + params.opt_delta = 1e-5f; + params.opt_max_no_improvement = 0; + + params.warmup = 100; + params.cos_decay_steps = 1000; + params.cos_decay_restart = 1.1f; + params.cos_decay_min = 0.1f; + params.enable_restart = false; + + params.adam_n_iter = 256; + params.adam_alpha = 1e-3f; + params.adam_min_alpha = 0; + params.adam_decay = 1e-1f; + params.adam_decay_min_ndim = 2; + params.adam_beta1 = 0.9f; + params.adam_beta2 = 0.999f; + params.adam_gclip = 1.0f; + params.adam_eps_f = 0.0f; + return params; +} + +void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params) { + // fprintf(stderr, "usage: %s [options]\n", argv[0]); + // fprintf(stderr, "\n"); + // fprintf(stderr, "options:\n"); + // fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); + fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); + fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); + fprintf(stderr, " --pattern-fn-it STR pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it); + fprintf(stderr, " --fn-latest STR string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest); + fprintf(stderr, " --save-every N save checkpoint and lora every N iterations. Disabled when N <= 0. (default '%d')\n", params->save_every); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); + fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); + fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); + fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); + fprintf(stderr, " --grad-acc N Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation); + fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str()); + fprintf(stderr, " --include-sample-start Include the sample start in the samples. 
(default off)\n");
+    fprintf(stderr, "  --escape                   process sample start escape sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
+    fprintf(stderr, "  --overlapping-samples      Samples may overlap, will include sample-start of second and following samples. When off, samples will end at the beginning of the next sample. (default off)\n");
+    fprintf(stderr, "  --fill-with-next-samples   Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
+    fprintf(stderr, "  --separate-with-eos        When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
+    fprintf(stderr, "  --separate-with-bos        When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");
+    fprintf(stderr, "  --no-separate-with-eos     When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : "");
+    fprintf(stderr, "  --no-separate-with-bos     When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : "");
+    fprintf(stderr, "  --force-reshuffle          Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n");
+    fprintf(stderr, "  --no-flash                 Don't use flash attention\n");
+    fprintf(stderr, "  --use-flash                Use flash attention (default)\n");
+    fprintf(stderr, "  --no-checkpointing         Don't use gradient checkpointing\n");
+    fprintf(stderr, "  --use-checkpointing        Use gradient checkpointing (default)\n");
+    fprintf(stderr, "  --warmup N                 Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup);
+    fprintf(stderr, "  --cos-decay-steps N        Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps);
+    fprintf(stderr, "  --cos-decay-restart N      Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart);
+    fprintf(stderr, "  --cos-decay-min N          Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min);
+    fprintf(stderr, "  --enable-restart           Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : "");
+    fprintf(stderr, "  --disable-restart          Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : "");
+    fprintf(stderr, "  --opt-past N               Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past);
+    fprintf(stderr, "  --opt-delta N              Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta);
+    fprintf(stderr, "  --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement);
+    fprintf(stderr, "  --adam-epsf N              AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
+    fprintf(stderr, "  --adam-iter N              Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter);
+    fprintf(stderr, "  --adam-alpha N             Adam learning rate alpha (default %f)\n", params->adam_alpha);
+    fprintf(stderr, "  --adam-min-alpha N         Adam minimum learning rate alpha, including warmup phase (default %f)\n", params->adam_min_alpha);
+    fprintf(stderr, "  --adam-decay N             AdamW weight decay. Values greater than zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay);
+    fprintf(stderr, "  --adam-decay-min-ndim N    Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with fewer n_dims. (default %d)\n", params->adam_decay_min_ndim);
+    fprintf(stderr, "  --adam-beta1 N             AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1);
+    fprintf(stderr, "  --adam-beta2 N             AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
+    fprintf(stderr, "  --adam-gclip N             AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
+    fprintf(stderr, "\n");
+}
+
+bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param) {
+    int& i = *idx;
+    std::string arg = argv[i];
+    if (arg == "--train-data") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->fn_train_data = argv[i];
+    } else if (arg == "--checkpoint-in") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->fn_checkpoint_in = argv[i];
+    } else if (arg == "--checkpoint-out") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->fn_checkpoint_out = argv[i];
+    } else if (arg == "--pattern-fn-it") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->pattern_fn_it = argv[i];
+    } else if (arg == "--fn-latest") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->fn_latest = argv[i];
+    } else if (arg == "--save-every") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->save_every = std::stoi(argv[i]);
+    } else if (arg == "-s" || arg == "--seed") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->seed = std::stoi(argv[i]);
+    } else if (arg == "-c" || arg == "--ctx") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->n_ctx = std::stoi(argv[i]);
+        params->custom_n_ctx = true;
+    } else if (arg == "-t" || arg == "--threads") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->n_threads = std::stoi(argv[i]);
+    } else if (arg == "-b" || arg == "--batch") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->n_batch = std::stoi(argv[i]);
+    } else if (arg == "--grad-acc") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->n_gradient_accumulation = std::max(1, std::stoi(argv[i]));
+    } else if (arg == "--sample-start") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+        params->sample_start = std::string(argv[i]);
+    } else if (arg == "--escape") {
+        params->escape = true;
+    } else if (arg == "--include-sample-start") {
+        params->include_sample_start = true;
+    } else if (arg == "--overlapping-samples") {
+        params->overlapping_samples = true;
+    } else if (arg == "--fill-with-next-samples") {
+        params->fill_with_next_samples = true;
+    } else if (arg == "--separate-with-eos") {
+        params->separate_with_eos = true;
+    } else if (arg == "--separate-with-bos") {
+        params->separate_with_bos = true;
+    } else if (arg == "--no-separate-with-eos") {
+        params->separate_with_eos = false;
+    } else if (arg == "--no-separate-with-bos") {
+        params->separate_with_bos = false;
+    } else if (arg == "--force-reshuffle") {
+        params->force_reshuffle = true;
+    } else if (arg == "--no-flash") {
+        params->use_flash = false;
+    } else if (arg == "--use-flash") {
+        params->use_flash = true;
+    } else if
(arg == "--no-checkpointing") { + params->use_checkpointing = false; + } else if (arg == "--use-checkpointing") { + params->use_checkpointing = true; + } else if (arg == "--warmup") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->warmup = std::stoi(argv[i]); + } else if (arg == "--cos-decay-steps") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->cos_decay_steps = std::stoi(argv[i]); + } else if (arg == "--cos-decay-restart") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->cos_decay_restart = std::stof(argv[i]); + } else if (arg == "--cos-decay-min") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->cos_decay_min = std::stof(argv[i]); + } else if (arg == "--enable-restart") { + params->enable_restart = true; + } else if (arg == "--disable-restart") { + params->enable_restart = false; + } else if (arg == "--opt-past") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->opt_past = std::stoi(argv[i]); + } else if (arg == "--opt-delta") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->opt_delta = std::stof(argv[i]); + } else if (arg == "--opt-max-no-improvement") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->opt_max_no_improvement = std::stoi(argv[i]); + } else if (arg == "--adam-epsf") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_eps_f = std::stof(argv[i]); + } else if (arg == "--adam-iter") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_n_iter = std::stoi(argv[i]); + } else if (arg == "--adam-alpha") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_alpha = std::stof(argv[i]); + } else if (arg == "--adam-min-alpha") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_min_alpha = std::stof(argv[i]); + } else if (arg == "--adam-decay") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_decay = std::stof(argv[i]); + } else if (arg == "--adam-decay-min-ndim") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_decay_min_ndim = std::stoi(argv[i]); + } else if (arg == "--adam-beta1") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_beta1 = std::stof(argv[i]); + } else if (arg == "--adam-beta2") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_beta2 = std::stof(argv[i]); + } else if (arg == "--adam-gclip") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->adam_gclip = std::stof(argv[i]); + } else if (arg == "-h" || arg == "--help") { + params->print_usage = true; + return true; + } else { + return false; + } + return true; +} + +void finish_processing_train_args(struct train_params_common * params) { + if (params->escape) { + process_escapes(params->sample_start); + } +} diff --git a/common/train.h b/common/train.h index 59004a87cdec5..cc3673c369141 100644 --- a/common/train.h +++ b/common/train.h @@ -26,9 +26,69 @@ struct train_state { size_t shuffle_next_sample; }; +struct train_params_common { + const char * fn_train_data; + const char * fn_checkpoint_in; + const char * fn_checkpoint_out; + const char * pattern_fn_it; + const char * fn_latest; + + bool print_usage; + + int save_every; + + uint32_t seed; + + int n_ctx; + int n_threads; + int n_batch; + int n_gradient_accumulation; + + bool custom_n_ctx; + + bool 
use_flash; + bool use_checkpointing; + + std::string sample_start; + bool include_sample_start; + bool escape; + bool overlapping_samples; + bool fill_with_next_samples; + bool separate_with_eos; + bool separate_with_bos; + + bool force_reshuffle; + + int warmup; + int cos_decay_steps; + float cos_decay_restart; + float cos_decay_min; + bool enable_restart; + + int opt_past; + float opt_delta; + int opt_max_no_improvement; + + int adam_n_iter; + float adam_alpha; + float adam_min_alpha; + float adam_decay; + int adam_decay_min_ndim; + float adam_beta1; + float adam_beta2; + float adam_gclip; + float adam_eps_f; +}; + struct train_state * init_train_state(int seed); void free_train_state(struct train_state * state); +struct train_params_common get_default_train_params_common(); +void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params); + +bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param); +void finish_processing_train_args(struct train_params_common * params); + struct random_normal_distribution; struct random_uniform_distribution; diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 5c787e94e4849..09a29340afbec 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1197,24 +1197,10 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor } struct train_params { + struct train_params_common common; + const char * fn_model_base; - const char * fn_train_data; - const char * fn_checkpoint_in; - const char * fn_checkpoint_out; const char * fn_lora_out; - const char * pattern_fn_it; - const char * fn_latest; - - int save_every; - - uint32_t seed; - - int n_ctx; - int n_threads; - int n_batch; - int n_gradient_accumulation; - - bool custom_n_ctx; bool only_write_lora; @@ -1255,61 +1241,13 @@ struct train_params { bool custom_n_rank_tok_embeddings; bool custom_n_rank_norm; bool custom_n_rank_output; - - bool use_flash; - bool use_checkpointing; - - std::string sample_start; - bool include_sample_start; - bool escape; - bool overlapping_samples; - bool fill_with_next_samples; - bool separate_with_eos; - bool separate_with_bos; - - bool force_reshuffle; - - int warmup; - int cos_decay_steps; - float cos_decay_restart; - float cos_decay_min; - bool enable_restart; - - int opt_past; - float opt_delta; - int opt_max_no_improvement; - - int adam_n_iter; - float adam_alpha; - float adam_min_alpha; - float adam_decay; - int adam_decay_min_ndim; - float adam_beta1; - float adam_beta2; - float adam_gclip; - float adam_eps_f; }; static struct train_params get_default_train_params() { struct train_params params; + params.common = get_default_train_params_common(); params.fn_model_base = ""; - params.fn_train_data = "shakespeare.txt"; - params.fn_checkpoint_in = "checkpoint.gguf"; - params.fn_checkpoint_out = "checkpoint-ITERATION.gguf"; params.fn_lora_out = "ggml-lora-ITERATION-f32.gguf"; - params.pattern_fn_it = "ITERATION"; - params.fn_latest = "LATEST"; - - params.save_every = 10; - - params.seed = -1; - - params.n_ctx = 128; - params.n_threads = 6; - params.n_batch = 8; - params.n_gradient_accumulation = 1; - - params.custom_n_ctx = false; params.only_write_lora = false; @@ -1351,59 +1289,18 @@ static struct train_params get_default_train_params() { params.custom_n_rank_norm = false; params.custom_n_rank_output = false; - params.use_flash = true; - params.use_checkpointing = true; - - params.sample_start 
= ""; - params.include_sample_start = false; - params.escape = false; - params.overlapping_samples = false; - params.fill_with_next_samples = false; - params.separate_with_eos = false; - params.separate_with_bos = true; - params.force_reshuffle = false; - - params.opt_past = 0; - params.opt_delta = 1e-5f; - params.opt_max_no_improvement = 0; - - params.warmup = 100; - params.cos_decay_steps = 1000; - params.cos_decay_restart = 1.1f; - params.cos_decay_min = 0.1f; - params.enable_restart = false; - - params.adam_n_iter = 256; - params.adam_alpha = 1e-3f; - params.adam_min_alpha = 0; - params.adam_decay = 1e-1f; - params.adam_decay_min_ndim = 2; - params.adam_beta1 = 0.9f; - params.adam_beta2 = 0.999f; - params.adam_gclip = 1.0f; - params.adam_eps_f = 0.0f; return params; } -static void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) { +static void train_print_usage(int argc, char ** argv, const struct train_params * params) { fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base); - fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); - fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); - fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); fprintf(stderr, " --lora-out FNAME path to save llama lora (default '%s')\n", params->fn_lora_out); - fprintf(stderr, " --pattern-fn-it STR pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it); - fprintf(stderr, " --fn-latest STR string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest); - fprintf(stderr, " --save-every N save checkpoint and lora every N iterations. Disabled when N <= 0. 
(default '%d')\n", params->save_every); fprintf(stderr, " --only-write-lora only save llama lora, don't do any training\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); - fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); - fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); - fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); - fprintf(stderr, " --grad-acc N Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation); fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); @@ -1421,39 +1318,8 @@ static void train_print_usage(int /*argc*/, char ** argv, const struct train_par fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n"); - fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str()); - fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n"); - fprintf(stderr, " --escape process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); - fprintf(stderr, " --overlapping-samples Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n"); - fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n"); - fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : ""); - fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : ""); - fprintf(stderr, " --no-separate-with-eos When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : ""); - fprintf(stderr, " --no-separate-with-bos When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : ""); - fprintf(stderr, " --force-reshuffle Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n"); - fprintf(stderr, " --no-flash Don't use flash attention \n"); - fprintf(stderr, " --use-flash Use flash attention (default)\n"); - fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); - fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n"); - fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); - fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); - fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. 
Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); - fprintf(stderr, " --cos-decay-min N Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min); - fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : ""); - fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : ""); - fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past); - fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta); - fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement); - fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); - fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); - fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); - fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha); - fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); - fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim); - fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); - fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); - fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. 
(default %f)\n", params->adam_gclip); - fprintf(stderr, "\n"); + + print_common_train_usage(argc, argv, ¶ms->common); } static bool train_params_parse(int argc, char ** argv, struct train_params * params) { @@ -1468,87 +1334,27 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par std::replace(arg.begin(), arg.end(), '_', '-'); } - if (arg == "--model-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_model_base = argv[i]; - } else if (arg == "--train-data") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_train_data = argv[i]; - } else if (arg == "--checkpoint-in") { - if (++i >= argc) { - invalid_param = true; + if (consume_common_train_arg(argc, argv, &i, ¶ms->common, &invalid_param)) { + if (invalid_param) { break; + } else if (params->common.print_usage) { + train_print_usage(argc, argv, &default_params); + exit(0); } - params->fn_checkpoint_in = argv[i]; - } else if (arg == "--checkpoint-out") { + } else if (arg == "--model-base") { if (++i >= argc) { invalid_param = true; break; } - params->fn_checkpoint_out = argv[i]; + params->fn_model_base = argv[i]; } else if (arg == "--lora-out") { if (++i >= argc) { invalid_param = true; break; } params->fn_lora_out = argv[i]; - } else if (arg == "--pattern-fn-it") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->pattern_fn_it = argv[i]; - } else if (arg == "--fn-latest") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_latest = argv[i]; - } else if (arg == "--save-every") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->save_every = std::stoi(argv[i]); } else if (arg == "--only-write-lora") { params->only_write_lora = true; - } else if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->seed = std::stoi(argv[i]); - } else if (arg == "-c" || arg == "--ctx") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_ctx = std::stoi(argv[i]); - params->custom_n_ctx = true; - } else if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_threads = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_batch = std::stoi(argv[i]); - } else if (arg == "--grad-acc") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_gradient_accumulation = std::max(1, std::stoi(argv[i])); } else if (arg == "--norm-rms-eps") { if (++i >= argc) { invalid_param = true; @@ -1667,141 +1473,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par } params->n_rank_w3 = std::stoi(argv[i]); params->custom_n_rank_w3 = true; - } else if (arg == "--sample-start") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->sample_start = std::string(argv[i]); - } else if (arg == "--escape") { - params->escape = true; - } else if (arg == "--include-sample-start") { - params->include_sample_start = true; - } else if (arg == "--overlapping-samples") { - params->overlapping_samples = true; - } else if (arg == "--fill-with-next-samples") { - params->fill_with_next_samples = true; - } else if (arg == "--separate-with-eos") { - params->separate_with_eos = true; - } else if (arg == "--separate-with-bos") { - params->separate_with_bos = true; - } else if (arg == "--no-separate-with-eos") { - params->separate_with_eos = false; - } else if (arg == "--no-separate-with-bos") { - 
params->separate_with_bos = false; - } else if (arg == "--force-reshuffle") { - params->force_reshuffle = true; - } else if (arg == "--no-flash") { - params->use_flash = false; - } else if (arg == "--use-flash") { - params->use_flash = true; - } else if (arg == "--no-checkpointing") { - params->use_checkpointing = false; - } else if (arg == "--use-checkpointing") { - params->use_checkpointing = true; - } else if (arg == "--warmup") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->warmup = std::stoi(argv[i]); - } else if (arg == "--cos-decay-steps") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->cos_decay_steps = std::stof(argv[i]); - } else if (arg == "--cos-decay-restart") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->cos_decay_restart = std::stof(argv[i]); - } else if (arg == "--cos-decay-min") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->cos_decay_min = std::stof(argv[i]); - } else if (arg == "--enable-restart") { - params->enable_restart = true; - } else if (arg == "--disable-restart") { - params->enable_restart = false; - } else if (arg == "--opt-past") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->opt_past = std::stoi(argv[i]); - } else if (arg == "--opt-delta") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->opt_delta = std::stof(argv[i]); - } else if (arg == "--opt-max-no-improvement") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->opt_max_no_improvement = std::stoi(argv[i]); - } else if (arg == "--adam-epsf") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_eps_f = std::stof(argv[i]); - } else if (arg == "--adam-iter") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_n_iter = std::stoi(argv[i]); - } else if (arg == "--adam-alpha") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_alpha = std::stof(argv[i]); - } else if (arg == "--adam-min-alpha") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_min_alpha = std::stof(argv[i]); - } else if (arg == "--adam-decay") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_decay = std::stof(argv[i]); - } else if (arg == "--adam-decay-min-ndim") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_decay_min_ndim = std::stoi(argv[i]); - } else if (arg == "--adam-beta1") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_beta1 = std::stof(argv[i]); - } else if (arg == "--adam-beta2") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_beta2 = std::stof(argv[i]); - } else if (arg == "--adam-gclip") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_gclip = std::stof(argv[i]); - } else if (arg == "-h" || arg == "--help") { - train_print_usage(argc, argv, &default_params); - exit(0); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); train_print_usage(argc, argv, &default_params); @@ -1813,9 +1484,7 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par train_print_usage(argc, argv, &default_params); exit(1); } - if (params->escape) { - process_escapes(params->sample_start); - } + finish_processing_train_args(¶ms->common); return true; } @@ -1844,31 +1513,31 @@ static void save_train_files(void * vdata, struct train_state * train) { } struct opt_callback_data { - struct train_params * params; - struct train_state * train; - 
save_train_files_callback save_cb; - void * save_data; - struct llama_context * lctx; - int last_save_iter; - llama_token * tokens_data; - size_t tokens_size; - size_t * samples_begin; - size_t * samples_size; - size_t * shuffled_samples_begin; - size_t * shuffled_samples_size; - size_t samples_count; - struct ggml_tensor * tokens_input; - struct ggml_tensor * target_probs; - int first_iter; - int64_t last_time; - double millis_per_iter; + struct train_params_common * params; + struct train_state * train; + save_train_files_callback save_cb; + void * save_data; + struct llama_context * lctx; + int last_save_iter; + llama_token * tokens_data; + size_t tokens_size; + size_t * samples_begin; + size_t * samples_size; + size_t * shuffled_samples_begin; + size_t * shuffled_samples_size; + size_t samples_count; + struct ggml_tensor * tokens_input; + struct ggml_tensor * target_probs; + int first_iter; + int64_t last_time; + double millis_per_iter; }; static void opt_callback(void * vdata, int accum_step, float * sched) { - struct opt_callback_data * data = (struct opt_callback_data *) vdata; - struct train_params * params = data->params; - struct train_state * train = data->train; - struct ggml_opt_context * opt = train->opt; + struct opt_callback_data * data = (struct opt_callback_data *) vdata; + struct train_params_common * params = data->params; + struct train_state * train = data->train; + struct ggml_opt_context * opt = train->opt; int n_batch = params->n_batch; int n_ctx = params->n_ctx; @@ -2019,11 +1688,11 @@ int main(int argc, char ** argv) { return 1; } - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); + if (params.common.seed == LLAMA_DEFAULT_SEED) { + params.common.seed = time(NULL); } - printf("%s: seed: %u\n", __func__, params.seed); - srand(params.seed); + printf("%s: seed: %u\n", __func__, params.common.seed); + srand(params.common.seed); struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = false; @@ -2033,11 +1702,11 @@ int main(int argc, char ** argv) { struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); struct my_llama_model model; - init_model(lmodel, &model, params.n_ctx); + init_model(lmodel, &model, params.common.n_ctx); struct my_llama_lora lora; - struct train_state * train = init_train_state(params.seed); + struct train_state * train = init_train_state(params.common.seed); struct ggml_opt_context * opt = train->opt; load_default_lora_params_from_base_model(params.fn_model_base, &lora.hparams); @@ -2083,30 +1752,30 @@ int main(int argc, char ** argv) { opt->params = ggml_opt_default_params(GGML_OPT_ADAM); opt->params.print_forward_graph = false; opt->params.print_backward_graph = false; - opt->params.n_threads = params.n_threads; - opt->params.past = params.opt_past; - opt->params.delta = params.opt_delta; - opt->params.max_no_improvement = params.opt_max_no_improvement; - opt->params.n_gradient_accumulation = params.n_gradient_accumulation; - opt->params.adam.n_iter = params.adam_n_iter; + opt->params.n_threads = params.common.n_threads; + opt->params.past = params.common.opt_past; + opt->params.delta = params.common.opt_delta; + opt->params.max_no_improvement = params.common.opt_max_no_improvement; + opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation; + opt->params.adam.n_iter = params.common.adam_n_iter; opt->params.adam.sched = 1.0f; - opt->params.adam.alpha = params.adam_alpha; - opt->params.adam.decay = params.adam_decay; - 
opt->params.adam.decay_min_ndim = params.adam_decay_min_ndim; - opt->params.adam.beta1 = params.adam_beta1; - opt->params.adam.beta2 = params.adam_beta2; - opt->params.adam.gclip = params.adam_gclip; - opt->params.adam.eps_f = params.adam_eps_f; + opt->params.adam.alpha = params.common.adam_alpha; + opt->params.adam.decay = params.common.adam_decay; + opt->params.adam.decay_min_ndim = params.common.adam_decay_min_ndim; + opt->params.adam.beta1 = params.common.adam_beta1; + opt->params.adam.beta2 = params.common.adam_beta2; + opt->params.adam.gclip = params.common.adam_gclip; + opt->params.adam.eps_f = params.common.adam_eps_f; ggml_allocr * alloc = NULL; printf("%s: init model\n", __func__); - bool existed = load_checkpoint_lora_file(params.fn_checkpoint_in, &model, &lora, train); + bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train); if (existed) { // overwrite last n_ctx with user provided n_ctx - if (params.custom_n_ctx) { - model.hparams.n_ctx = params.n_ctx; + if (params.common.custom_n_ctx) { + model.hparams.n_ctx = params.common.n_ctx; } const bool opt_param_count_changed = ( @@ -2124,7 +1793,7 @@ int main(int argc, char ** argv) { || (lora.hparams.n_rank_output != n_rank_output) ); - const bool opt_past_changed = opt->params.past != params.opt_past; + const bool opt_past_changed = opt->params.past != params.common.opt_past; if (opt_param_count_changed) { print_lora_params(&lora.hparams); @@ -2139,7 +1808,7 @@ int main(int argc, char ** argv) { } } else { // existed == false init_lora(&model, &lora); - randomize_lora(&lora, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); + randomize_lora(&lora, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f); if (!params.only_write_lora) { ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&lora)); } @@ -2159,8 +1828,8 @@ int main(int argc, char ** argv) { save_train_files_data save_data; save_data.fn_checkpoint_out = ""; save_data.fn_lora_out = params.fn_lora_out; - save_data.pattern_fn_it = params.pattern_fn_it; - save_data.fn_latest = params.fn_latest; + save_data.pattern_fn_it = params.common.pattern_fn_it; + save_data.fn_latest = params.common.fn_latest; save_data.model = &model; save_data.lora = &lora; @@ -2175,7 +1844,7 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; - int n_batch = params.n_batch; + int n_batch = params.common.n_batch; printf("%s: opt iter %d\n", __func__, opt->iter); @@ -2215,7 +1884,7 @@ int main(int argc, char ** argv) { size_t estimated_compute_size_wo_data = ( ggml_tensor_overhead()*GGML_MAX_NODES*2 + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*( - params.use_checkpointing ? 3 : 2 + params.common.use_checkpointing ? 3 : 2 ) ); struct ggml_init_params ctx_compute_params = { @@ -2242,7 +1911,7 @@ int main(int argc, char ** argv) { gf = ggml_new_graph(ctx_compute); gf->order = (enum ggml_cgraph_eval_order) order; gb = ggml_new_graph(ctx_compute); - gb_tmp = params.use_checkpointing + gb_tmp = params.common.use_checkpointing ? 
ggml_new_graph(ctx_compute) : NULL; loss = llama_build_lora_finetune_graphs( @@ -2250,8 +1919,8 @@ int main(int argc, char ** argv) { gf, gb, gb_tmp, &logits, tokens_input, target_probs, n_tokens, n_batch, - params.use_flash, - params.use_checkpointing + params.common.use_flash, + params.common.use_checkpointing ); size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; if (max_compute_size < best_compute_size) { @@ -2275,7 +1944,7 @@ int main(int argc, char ** argv) { gf = ggml_new_graph(ctx_compute); gf->order = best_order; gb = ggml_new_graph(ctx_compute); - gb_tmp = params.use_checkpointing + gb_tmp = params.common.use_checkpointing ? ggml_new_graph(ctx_compute) : NULL; loss = llama_build_lora_finetune_graphs( @@ -2283,8 +1952,8 @@ int main(int argc, char ** argv) { gf, gb, gb_tmp, &logits, tokens_input, target_probs, n_tokens, n_batch, - params.use_flash, - params.use_checkpointing + params.common.use_flash, + params.common.use_checkpointing ); ggml_allocr_free(alloc); @@ -2294,10 +1963,10 @@ int main(int argc, char ** argv) { std::vector train_samples_size; printf("%s: tokenize training data\n", __func__); tokenize_file(lctx, - params.fn_train_data, - params.sample_start, - params.include_sample_start, - params.overlapping_samples, + params.common.fn_train_data, + params.common.sample_start, + params.common.include_sample_start, + params.common.overlapping_samples, n_tokens, train_tokens, train_samples_begin, @@ -2318,16 +1987,16 @@ int main(int argc, char ** argv) { } printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); - size_t shuffle_samples_hash = compute_samples_hash(params.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size()); + size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size()); const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size()); if (changed_train_data) { printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__); } - if (params.force_reshuffle) { + if (params.common.force_reshuffle) { printf("%s: forced reshuffling of data. 
restarting with newly shuffled epoch.\n", __func__); } - if ((train->shuffle_rng_state_current == "") || changed_train_data || params.force_reshuffle) { - train->shuffle_rng_state_current = mt19937_seed_to_state(params.seed); + if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) { + train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed); train->shuffle_sample_count = train_samples_size.size(); train->shuffle_next_sample = 0; train->shuffle_samples_hash = shuffle_samples_hash; @@ -2347,15 +2016,15 @@ int main(int argc, char ** argv) { printf("%s: begin training\n", __func__); save_train_files_data save_data; - save_data.fn_checkpoint_out = params.fn_checkpoint_out; + save_data.fn_checkpoint_out = params.common.fn_checkpoint_out; save_data.fn_lora_out = params.fn_lora_out; - save_data.pattern_fn_it = params.pattern_fn_it; - save_data.fn_latest = params.fn_latest; + save_data.pattern_fn_it = params.common.pattern_fn_it; + save_data.fn_latest = params.common.fn_latest; save_data.model = &model; save_data.lora = &lora; struct opt_callback_data opt_cb_data; - opt_cb_data.params = ¶ms; + opt_cb_data.params = ¶ms.common; opt_cb_data.train = train; opt_cb_data.save_cb = &save_train_files; opt_cb_data.save_data = &save_data; @@ -2375,7 +2044,7 @@ int main(int argc, char ** argv) { opt_cb_data.millis_per_iter = 0.0; // measure required memory for work buffer - size_t max_work_size = ggml_graph_plan(gb, params.n_threads).work_size + GGML_OBJECT_SIZE; + size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE; printf("%s: max_work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); // context for work buffer diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7984dd7241cd3..5b993b47b0a54 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -692,17 +692,10 @@ static void save_checkpoint_file(const char * filename, const char * fn_vocab_mo } struct train_params { + struct train_params_common common; + const char * fn_vocab_model; - const char * fn_train_data; - const char * fn_checkpoint_in; - const char * fn_checkpoint_out; const char * fn_model_out; - const char * pattern_fn_it; - const char * fn_latest; - - int save_every; - - uint32_t seed; int n_ctx; int n_embd; @@ -710,10 +703,7 @@ struct train_params { int n_layer; int n_ff; - int n_threads; int n_examples; - int n_batch; - int n_gradient_accumulation; float f_norm_rms_eps; float rope_freq_base; @@ -721,40 +711,8 @@ struct train_params { int print_info_interval; - bool use_flash; - bool use_checkpointing; bool use_alloc; - std::string sample_start; - bool include_sample_start; - bool escape; - bool overlapping_samples; - bool fill_with_next_samples; - bool separate_with_eos; - bool separate_with_bos; - - bool force_reshuffle; - - int warmup; - int cos_decay_steps; - float cos_decay_restart; - float cos_decay_min; - bool enable_restart; - - int opt_past; - float opt_delta; - int opt_max_no_improvement; - - int adam_n_iter; - float adam_alpha; - float adam_min_alpha; - float adam_decay; - int adam_decay_min_ndim; - float adam_beta1; - float adam_beta2; - float adam_gclip; - float adam_eps_f; - int mem_model_gb; int mem_compute_gb; int mem_compute0_gb; @@ -762,17 +720,9 @@ struct train_params { struct train_params 
get_default_train_params() { struct train_params params; + params.common = get_default_train_params_common(); params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; - params.fn_train_data = "shakespeare.txt"; - params.fn_checkpoint_in = "checkpoint.bin"; - params.fn_checkpoint_out = "checkpoint.bin"; params.fn_model_out = "ggml-checkpoint-f32.bin"; - params.pattern_fn_it = "ITERATION"; - params.fn_latest = "LATEST"; - - params.save_every = 10; - - params.seed = -1; params.n_ctx = 128; params.n_embd = 256; @@ -780,10 +730,7 @@ struct train_params get_default_train_params() { params.n_layer = 16; params.n_ff = 768; - params.n_threads = 6; params.n_examples = 1; - params.n_batch = 8; - params.n_gradient_accumulation = 1; params.f_norm_rms_eps = 1e-5f; params.rope_freq_base = 10000.0f; @@ -791,60 +738,22 @@ struct train_params get_default_train_params() { params.print_info_interval = 1; - params.use_flash = true; - params.use_checkpointing = true; params.use_alloc = true; - params.sample_start = ""; - params.include_sample_start = false; - params.escape = false; - params.overlapping_samples = false; - params.fill_with_next_samples = false; - params.separate_with_eos = false; - params.separate_with_bos = true; - params.force_reshuffle = false; - - params.opt_past = 0; - params.opt_delta = 1e-5f; - params.opt_max_no_improvement = 0; - - params.warmup = 100; - params.cos_decay_steps = 1000; - params.cos_decay_restart = 1.1f; - params.cos_decay_min = 0.1f; - params.enable_restart = false; - - params.adam_n_iter = 256; - params.adam_alpha = 1e-3f; - params.adam_min_alpha = 0; - params.adam_decay = 1e-1f; - params.adam_decay_min_ndim = 2; - params.adam_beta1 = 0.9f; - params.adam_beta2 = 0.999f; - params.adam_gclip = 1.0f; - params.adam_eps_f = 0.0f; - params.mem_model_gb = 2; params.mem_compute_gb = 24; params.mem_compute0_gb = 8; return params; } -static void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) { +static void train_print_usage(int argc, char ** argv, const struct train_params * params) { fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); - fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); - fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); - fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); - fprintf(stderr, " --pattern-fn-it STR pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it); - fprintf(stderr, " --fn-latest STR string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest); - fprintf(stderr, " --save-every N save checkpoint and lora every N iterations. Disabled when N <= 0. 
(default '%d')\n", params->save_every); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); - fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); fprintf(stderr, " --ff N Feedforward size used for new models. (default %d)\n", params->n_ff); fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); @@ -852,49 +761,15 @@ static void train_print_usage(int /*argc*/, char ** argv, const struct train_par fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); - fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); - fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); - fprintf(stderr, " --grad-acc N Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation); fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); - fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str()); - fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n"); - fprintf(stderr, " --escape process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); - fprintf(stderr, " --overlapping-samples Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n"); - fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n"); - fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : ""); - fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : ""); - fprintf(stderr, " --no-separate-with-eos When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : ""); - fprintf(stderr, " --no-separate-with-bos When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? 
" (default)" : ""); - fprintf(stderr, " --force-reshuffle Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n"); - fprintf(stderr, " --no-flash Don't use flash attention \n"); - fprintf(stderr, " --use-flash Use flash attention (default)\n"); - fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); - fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n"); fprintf(stderr, " --no-alloc Don't use allocator\n"); fprintf(stderr, " --use-alloc Use allocator (default)\n"); - fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); - fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); - fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); - fprintf(stderr, " --cos-decay-min N Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min); - fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : ""); - fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : ""); - fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past); - fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta); - fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement); - fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); - fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); - fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); - fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha); - fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); - fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim); - fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); - fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); - fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. 
(default %d)\n", params->mem_compute0_gb); - fprintf(stderr, "\n"); + + print_common_train_usage(argc, argv, ¶ms->common); } static bool train_params_parse(int argc, char ** argv, struct train_params * params) { @@ -909,66 +784,25 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par std::replace(arg.begin(), arg.end(), '_', '-'); } - if (arg == "--vocab-model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_vocab_model = argv[i]; - } else if (arg == "--train-data") { - if (++i >= argc) { - invalid_param = true; + if (consume_common_train_arg(argc, argv, &i, ¶ms->common, &invalid_param)) { + if (invalid_param) { break; + } else if (params->common.print_usage) { + train_print_usage(argc, argv, &default_params); + exit(0); } - params->fn_train_data = argv[i]; - } else if (arg == "--checkpoint-in") { + } else if (arg == "--vocab-model") { if (++i >= argc) { invalid_param = true; break; } - params->fn_checkpoint_in = argv[i]; - } else if (arg == "--checkpoint-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_checkpoint_out = argv[i]; + params->fn_vocab_model = argv[i]; } else if (arg == "--model-out") { if (++i >= argc) { invalid_param = true; break; } params->fn_model_out = argv[i]; - } else if (arg == "--pattern-fn-it") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->pattern_fn_it = argv[i]; - } else if (arg == "--fn-latest") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_latest = argv[i]; - } else if (arg == "--save-every") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->save_every = std::stoi(argv[i]); - } else if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->seed = std::stoi(argv[i]); - } else if (arg == "-c" || arg == "--ctx") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_ctx = std::stoi(argv[i]); } else if (arg == "--embd") { if (++i >= argc) { invalid_param = true; @@ -1011,24 +845,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par break; } params->rope_freq_scale = std::stof(argv[i]); - } else if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_threads = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_batch = std::stoi(argv[i]); - } else if (arg == "--grad-acc") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_gradient_accumulation = std::max(1, std::stoi(argv[i])); } else if (arg == "-n" || arg == "--examples") { if (++i >= argc) { invalid_param = true; @@ -1041,142 +857,10 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par break; } params->print_info_interval = std::stoi(argv[i]); - } else if (arg == "--sample-start") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->sample_start = std::string(argv[i]); - } else if (arg == "--escape") { - params->escape = true; - } else if (arg == "--include-sample-start") { - params->include_sample_start = true; - } else if (arg == "--overlapping-samples") { - params->overlapping_samples = true; - } else if (arg == "--fill-with-next-samples") { - params->fill_with_next_samples = true; - } else if (arg == "--separate-with-eos") { - params->separate_with_eos = true; - } else if (arg == "--separate-with-bos") { - params->separate_with_bos = true; - } else if (arg 
== "--no-separate-with-eos") { - params->separate_with_eos = false; - } else if (arg == "--no-separate-with-bos") { - params->separate_with_bos = false; - } else if (arg == "--force-reshuffle") { - params->force_reshuffle = true; - } else if (arg == "--no-flash") { - params->use_flash = false; - } else if (arg == "--use-flash") { - params->use_flash = true; - } else if (arg == "--no-checkpointing") { - params->use_checkpointing = false; - } else if (arg == "--use-checkpointing") { - params->use_checkpointing = true; } else if (arg == "--no-alloc") { params->use_alloc = false; } else if (arg == "--use-alloc") { params->use_alloc = true; - } else if (arg == "--warmup") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->warmup = std::stoi(argv[i]); - } else if (arg == "--cos-decay-steps") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->cos_decay_steps = std::stof(argv[i]); - } else if (arg == "--cos-decay-restart") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->cos_decay_restart = std::stof(argv[i]); - } else if (arg == "--cos-decay-min") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->cos_decay_min = std::stof(argv[i]); - } else if (arg == "--enable-restart") { - params->enable_restart = true; - } else if (arg == "--disable-restart") { - params->enable_restart = false; - } else if (arg == "--opt-past") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->opt_past = std::stoi(argv[i]); - } else if (arg == "--opt-delta") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->opt_delta = std::stof(argv[i]); - } else if (arg == "--opt-max-no-improvement") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->opt_max_no_improvement = std::stoi(argv[i]); - } else if (arg == "--adam-epsf") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_eps_f = std::stof(argv[i]); - } else if (arg == "--adam-iter") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_n_iter = std::stoi(argv[i]); - } else if (arg == "--adam-alpha") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_alpha = std::stof(argv[i]); - } else if (arg == "--adam-min-alpha") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_min_alpha = std::stof(argv[i]); - } else if (arg == "--adam-decay") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_decay = std::stof(argv[i]); - } else if (arg == "--adam-decay-min-ndim") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_decay_min_ndim = std::stoi(argv[i]); - } else if (arg == "--adam-beta1") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_beta1 = std::stof(argv[i]); - } else if (arg == "--adam-beta2") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_beta2 = std::stof(argv[i]); - } else if (arg == "--adam-gclip") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->adam_gclip = std::stof(argv[i]); } else if (arg == "--mem-model") { if (++i >= argc) { invalid_param = true; @@ -1195,9 +879,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par break; } params->mem_compute0_gb = std::stoi(argv[i]); - } else if (arg == "-h" || arg == "--help") { - train_print_usage(argc, argv, &default_params); - exit(0); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); train_print_usage(argc, argv, &default_params); @@ 
-1209,9 +890,7 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par train_print_usage(argc, argv, &default_params); exit(1); } - if (params->escape) { - process_escapes(params->sample_start); - } + finish_processing_train_args(¶ms->common); return true; } @@ -1241,32 +920,32 @@ static void save_train_files(void * vdata, struct train_state * train) { } struct opt_callback_data { - struct train_params * params; - struct train_state * train; - save_train_files_callback save_cb; - void * save_data; - struct llama_context * lctx; - int last_save_iter; - llama_token * tokens_data; - size_t tokens_size; - size_t * samples_begin; - size_t * samples_size; - size_t * shuffled_samples_begin; - size_t * shuffled_samples_size; - size_t samples_count; - struct ggml_tensor * tokens_input; - struct ggml_tensor * target_logits; - struct ggml_tensor * target_probs; - int first_iter; - int64_t last_time; - double millis_per_iter; + struct train_params_common * params; + struct train_state * train; + save_train_files_callback save_cb; + void * save_data; + struct llama_context * lctx; + int last_save_iter; + llama_token * tokens_data; + size_t tokens_size; + size_t * samples_begin; + size_t * samples_size; + size_t * shuffled_samples_begin; + size_t * shuffled_samples_size; + size_t samples_count; + struct ggml_tensor * tokens_input; + struct ggml_tensor * target_logits; + struct ggml_tensor * target_probs; + int first_iter; + int64_t last_time; + double millis_per_iter; }; static void opt_callback(void * vdata, int accum_step, float * sched) { - struct opt_callback_data * data = (struct opt_callback_data *) vdata; - struct train_params * params = data->params; - struct train_state * train = data->train; - struct ggml_opt_context * opt = train->opt; + struct opt_callback_data * data = (struct opt_callback_data *) vdata; + struct train_params_common * params = data->params; + struct train_state * train = data->train; + struct ggml_opt_context * opt = train->opt; int n_batch = params->n_batch; int n_ctx = params->n_ctx; @@ -1385,11 +1064,11 @@ int main(int argc, char ** argv) { return 1; } - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); + if (params.common.seed == LLAMA_DEFAULT_SEED) { + params.common.seed = time(NULL); } - printf("%s: seed: %u\n", __func__, params.seed); - srand(params.seed); + printf("%s: seed: %u\n", __func__, params.common.seed); + srand(params.common.seed); struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; @@ -1399,7 +1078,7 @@ int main(int argc, char ** argv) { struct my_llama_model model; model.hparams.n_vocab = llama_n_vocab(lctx); - model.hparams.n_ctx = params.n_ctx; + model.hparams.n_ctx = params.common.n_ctx; model.hparams.n_embd = params.n_embd; model.hparams.n_head = params.n_head; model.hparams.n_layer = params.n_layer; @@ -1421,34 +1100,34 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; - int n_batch = params.n_batch; + int n_batch = params.common.n_batch; - struct train_state * train = init_train_state(params.seed); + struct train_state * train = init_train_state(params.common.seed); struct ggml_opt_context * opt = train->opt; struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); opt_params_adam.print_forward_graph = false; opt_params_adam.print_backward_graph = false; - opt_params_adam.n_threads = params.n_threads; - opt_params_adam.past = params.opt_past; - opt_params_adam.delta 
= params.opt_delta;
-    opt_params_adam.max_no_improvement   = params.opt_max_no_improvement;
-    opt_params_adam.n_gradient_accumulation = params.n_gradient_accumulation;
-    opt_params_adam.adam.n_iter          = params.adam_n_iter;
+    opt_params_adam.n_threads            = params.common.n_threads;
+    opt_params_adam.past                 = params.common.opt_past;
+    opt_params_adam.delta                = params.common.opt_delta;
+    opt_params_adam.max_no_improvement   = params.common.opt_max_no_improvement;
+    opt_params_adam.n_gradient_accumulation = params.common.n_gradient_accumulation;
+    opt_params_adam.adam.n_iter          = params.common.adam_n_iter;
     opt_params_adam.adam.sched           = 1.0f;
-    opt_params_adam.adam.alpha           = params.adam_alpha;
-    opt_params_adam.adam.decay           = params.adam_decay;
-    opt_params_adam.adam.decay_min_ndim  = params.adam_decay_min_ndim;
-    opt_params_adam.adam.beta1           = params.adam_beta1;
-    opt_params_adam.adam.beta2           = params.adam_beta2;
-    opt_params_adam.adam.gclip           = params.adam_gclip;
-    opt_params_adam.adam.eps_f           = params.adam_eps_f;
+    opt_params_adam.adam.alpha           = params.common.adam_alpha;
+    opt_params_adam.adam.decay           = params.common.adam_decay;
+    opt_params_adam.adam.decay_min_ndim  = params.common.adam_decay_min_ndim;
+    opt_params_adam.adam.beta1           = params.common.adam_beta1;
+    opt_params_adam.adam.beta2           = params.common.adam_beta2;
+    opt_params_adam.adam.gclip           = params.common.adam_gclip;
+    opt_params_adam.adam.eps_f           = params.common.adam_eps_f;

     opt->ctx = model.ctx;
     opt->params = opt_params_adam;

     printf("%s: init model\n", __func__);
-    bool existed = load_checkpoint_file(params.fn_checkpoint_in, &model, train);
+    bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train);
     if (!existed) {
         init_model(&model);
     }
@@ -1461,7 +1140,7 @@ int main(int argc, char ** argv) {

     bool from_scratch = !existed;
     if (from_scratch) {
-        randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f);
+        randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
     }

     printf("used_mem model: %zu bytes\n", ggml_used_mem(model.ctx));
@@ -1485,10 +1164,10 @@ int main(int argc, char ** argv) {
     std::vector<size_t> train_samples_size;
     printf("%s: tokenize training data\n", __func__);
     tokenize_file(lctx,
-            params.fn_train_data,
-            params.sample_start,
-            params.include_sample_start,
-            params.overlapping_samples,
+            params.common.fn_train_data,
+            params.common.sample_start,
+            params.common.include_sample_start,
+            params.common.overlapping_samples,
             n_tokens,
             train_tokens,
             train_samples_begin,
@@ -1497,16 +1176,16 @@ int main(int argc, char ** argv) {

     printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size());

-    size_t shuffle_samples_hash = compute_samples_hash(params.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size());
+    size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size());

     const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size());
     if (changed_train_data) {
         printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__);
     }
-    if (params.force_reshuffle) {
+    if (params.common.force_reshuffle) {
        printf("%s: forced reshuffling of data. 
restarting with newly shuffled epoch.\n", __func__); } - if ((train->shuffle_rng_state_current == "") || changed_train_data || params.force_reshuffle) { - train->shuffle_rng_state_current = mt19937_seed_to_state(params.seed); + if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) { + train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed); train->shuffle_sample_count = train_samples_size.size(); train->shuffle_next_sample = 0; train->shuffle_samples_hash = shuffle_samples_hash; @@ -1525,15 +1204,15 @@ int main(int argc, char ** argv) { printf("%s: begin training\n", __func__); save_train_files_data save_data; - save_data.fn_checkpoint_out = params.fn_checkpoint_out; + save_data.fn_checkpoint_out = params.common.fn_checkpoint_out; save_data.fn_model_out = params.fn_model_out; save_data.fn_vocab_model = params.fn_vocab_model; - save_data.pattern_fn_it = params.pattern_fn_it; - save_data.fn_latest = params.fn_latest; + save_data.pattern_fn_it = params.common.pattern_fn_it; + save_data.fn_latest = params.common.fn_latest; save_data.model = &model; struct opt_callback_data opt_cb_data; - opt_cb_data.params = ¶ms; + opt_cb_data.params = ¶ms.common; opt_cb_data.train = train; opt_cb_data.save_cb = &save_train_files; opt_cb_data.save_data = &save_data; @@ -1587,7 +1266,7 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_cgraph * gb = ggml_new_graph(ctx0); - struct ggml_cgraph * gb_tmp = params.use_checkpointing + struct ggml_cgraph * gb_tmp = params.common.use_checkpointing ? ggml_new_graph(ctx0) : NULL; @@ -1601,21 +1280,21 @@ int main(int argc, char ** argv) { gf, gb, gb_tmp, &logits, tokens_input, target_probs, n_tokens, n_batch, - params.use_flash, - params.use_checkpointing + params.common.use_flash, + params.common.use_checkpointing ); size_t used_mem_before_opt = ggml_used_mem(ctx0); opt->params.adam.sched = learning_schedule( opt->iter, - params.warmup, - params.cos_decay_steps, - params.adam_alpha, - params.adam_min_alpha, - params.cos_decay_min, - params.cos_decay_restart, - params.enable_restart); + params.common.warmup, + params.common.cos_decay_steps, + params.common.adam_alpha, + params.common.adam_min_alpha, + params.common.cos_decay_min, + params.common.cos_decay_restart, + params.common.enable_restart); printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); @@ -1623,7 +1302,7 @@ int main(int argc, char ** argv) { size_t used_mem_after_opt = ggml_used_mem(ctx0); - int n_iter = params.adam_n_iter; + int n_iter = params.common.adam_n_iter; train->train_its = opt->iter; train->train_samples += n_batch * n_iter; train->train_tokens += n_batch * n_tokens * n_iter; From bef1e97875dd8050d76e509c7e17d9792bff6744 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 16 Sep 2023 18:51:16 +0200 Subject: [PATCH 197/235] move common opt_callback into common/train --- common/train.cpp | 117 +++++++++++++- common/train.h | 25 ++- examples/finetune/finetune.cpp | 142 +---------------- .../train-text-from-scratch.cpp | 149 +----------------- 4 files changed, 146 insertions(+), 287 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index d22d4b0361770..99c319253741f 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1016,7 +1016,7 @@ struct train_params_common get_default_train_params_common() { params.fn_latest = "LATEST"; params.print_usage = false; - + params.save_every = 10; params.seed = -1; @@ -1329,3 +1329,118 @@ void 
finish_processing_train_args(struct train_params_common * params) {
         process_escapes(params->sample_start);
     }
 }
+
+void train_opt_callback(void * vdata, int accum_step, float * sched) {
+    struct train_opt_callback_data * data = (struct train_opt_callback_data *) vdata;
+    struct train_params_common * params   = data->params;
+    struct train_state * train            = data->train;
+    struct ggml_opt_context * opt         = train->opt;
+    int n_batch = params->n_batch;
+    int n_ctx   = params->n_ctx;
+
+    if (accum_step == 0) {
+        // time measurement
+        int64_t now = ggml_time_ms();
+        if (now > data->last_time && opt->iter > data->first_iter) {
+            double dt = (double) (now - data->last_time);
+            if (data->millis_per_iter == 0.0) {
+                data->millis_per_iter = dt;
+            } else {
+                const double gain = 0.7;
+                data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain;
+            }
+        }
+
+        double remaining_millis = 0.0;
+        if (data->millis_per_iter > 0.0) {
+            const int n_iter         = params->adam_n_iter;
+            const int done_iter      = opt->iter - data->first_iter;
+            const int remaining_iter = n_iter - done_iter;
+            remaining_millis = remaining_iter * data->millis_per_iter;
+        }
+
+        // file saving
+        const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
+        if (save_now) {
+            int new_iters = opt->iter - data->last_save_iter;
+            train->train_its     += new_iters;
+            train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
+            train->train_tokens  += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx;
+
+            if (data->save_cb) {
+                data->save_cb(data->save_data, train);
+            }
+
+            data->last_save_iter = opt->iter;
+        }
+
+        // exclude file saving from time measurement, by measuring last_time after saving
+        data->last_time = ggml_time_ms();
+
+        *sched = learning_schedule(
+            opt->iter,
+            params->warmup,
+            params->cos_decay_steps,
+            params->adam_alpha,
+            params->adam_min_alpha,
+            params->cos_decay_min,
+            params->cos_decay_restart,
+            params->enable_restart);
+
+        int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
+        if (impr_plot > 0) impr_plot = 0;
+        if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
+        printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
+            __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
+            *sched, opt->loss_after);
+
+
+        if (data->millis_per_iter > 0) {
+            printf(" dt=");
+            print_duration(data->millis_per_iter);
+            printf(" eta=");
+            print_duration(remaining_millis);
+        }
+
+        float improvement = opt->loss_before - opt->loss_after;
+        const float plot_scale = 10.0f;
+        int bar_len = (int)(1 + improvement*plot_scale + 0.5);
+        printf(" |");
+        for (int i=0; i<bar_len; ++i) {
+            printf("-");
+        }
+        printf(">");
+        printf("\n");
+    }
+
+    int64_t used_samples = get_example_targets_batch(
+        data->lctx,
+        data->tokens_input,
+        data->target_probs,
+        train->shuffle_next_sample,
+        data->shuffled_samples_begin,
+        data->shuffled_samples_size,
+        data->samples_count,
+        data->tokens_data,
+        data->tokens_size,
+        params->separate_with_eos,
+        params->separate_with_bos,
+        params->fill_with_next_samples);
+
+    train->shuffle_next_sample += used_samples;
+
+    if (train->shuffle_next_sample >= train->shuffle_sample_count) {
+        ++train->train_epochs;
+        printf("%s: reshuffle samples. 
completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs); + // note: we may have used some samples from the current shuffling more than once + train->shuffle_rng_state_current = train->shuffle_rng_state_next; + train->shuffle_rng_state_next = shuffle_samples( + train->shuffle_rng_state_current, + data->shuffled_samples_begin, + data->shuffled_samples_size, + data->samples_begin, + data->samples_size, + data->samples_count); + train->shuffle_next_sample = 0; + } +} diff --git a/common/train.h b/common/train.h index cc3673c369141..db63a5d168462 100644 --- a/common/train.h +++ b/common/train.h @@ -80,6 +80,29 @@ struct train_params_common { float adam_eps_f; }; +typedef void (*save_train_files_callback)(void * data, struct train_state * train); + +struct train_opt_callback_data { + struct train_params_common * params; + struct train_state * train; + save_train_files_callback save_cb; + void * save_data; + struct llama_context * lctx; + int last_save_iter; + llama_token * tokens_data; + size_t tokens_size; + size_t * samples_begin; + size_t * samples_size; + size_t * shuffled_samples_begin; + size_t * shuffled_samples_size; + size_t samples_count; + struct ggml_tensor * tokens_input; + struct ggml_tensor * target_probs; + int first_iter; + int64_t last_time; + double millis_per_iter; +}; + struct train_state * init_train_state(int seed); void free_train_state(struct train_state * state); @@ -195,4 +218,4 @@ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * trai std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration); -typedef void (*save_train_files_callback)(void * data, struct train_state * train); +void train_opt_callback(void * vdata, int accum_step, float * sched); diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 09a29340afbec..308e3d5924363 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1318,7 +1318,7 @@ static void train_print_usage(int argc, char ** argv, const struct train_params fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n"); - + print_common_train_usage(argc, argv, ¶ms->common); } @@ -1509,142 +1509,6 @@ static void save_train_files(void * vdata, struct train_state * train) { if (strlen(data->fn_lora_out) > 0) { save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->lora); save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->lora); - } -} - -struct opt_callback_data { - struct train_params_common * params; - struct train_state * train; - save_train_files_callback save_cb; - void * save_data; - struct llama_context * lctx; - int last_save_iter; - llama_token * tokens_data; - size_t tokens_size; - size_t * samples_begin; - size_t * samples_size; - size_t * shuffled_samples_begin; - size_t * shuffled_samples_size; - size_t samples_count; - struct ggml_tensor * tokens_input; - struct ggml_tensor * target_probs; - int first_iter; - int64_t last_time; - double millis_per_iter; -}; - -static void opt_callback(void * vdata, int accum_step, float * sched) { - struct opt_callback_data * data = (struct opt_callback_data *) vdata; - struct train_params_common * params = data->params; - struct 
train_state * train = data->train;
-    struct ggml_opt_context * opt = train->opt;
-    int n_batch = params->n_batch;
-    int n_ctx   = params->n_ctx;
-
-    if (accum_step == 0) {
-        // time measurement
-        int64_t now = ggml_time_ms();
-        if (now > data->last_time && opt->iter > data->first_iter) {
-            double dt = now - data->last_time;
-            if (data->millis_per_iter == 0.0) {
-                data->millis_per_iter = dt;
-            } else {
-                const double gain = 0.7;
-                data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain;
-            }
-        }
-
-        double remaining_millis = 0.0;
-        if (data->millis_per_iter > 0.0) {
-            const int n_iter         = params->adam_n_iter;
-            const int done_iter      = opt->iter - data->first_iter;
-            const int remaining_iter = n_iter - done_iter;
-            remaining_millis = remaining_iter * data->millis_per_iter;
-        }
-
-        // file saving
-        const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
-        if (save_now) {
-            int new_iters = opt->iter - data->last_save_iter;
-            train->train_its     += new_iters;
-            train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
-            train->train_tokens  += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx;
-
-            if (data->save_cb) {
-                data->save_cb(data->save_data, train);
-            }
-
-            data->last_save_iter = opt->iter;
-        }
-
-        // exclude file saving from time measurement, by measuring last_time after saving
-        data->last_time = ggml_time_ms();
-
-        *sched = learning_schedule(
-            opt->iter,
-            params->warmup,
-            params->cos_decay_steps,
-            params->adam_alpha,
-            params->adam_min_alpha,
-            params->cos_decay_min,
-            params->cos_decay_restart,
-            params->enable_restart);
-
-        int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
-        if (impr_plot > 0) impr_plot = 0;
-        if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
-        printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
-            __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
-            *sched, opt->loss_after);
-
-
-        if (data->millis_per_iter > 0) {
-            printf(" dt=");
-            print_duration(data->millis_per_iter);
-            printf(" eta=");
-            print_duration(remaining_millis);
-        }
-
-        float improvement = opt->loss_before - opt->loss_after;
-        const float plot_scale = 10.0f;
-        int bar_len = (int)(1 + improvement*plot_scale + 0.5);
-        printf(" |");
-        for (int i=0; i<bar_len; ++i) {
-            printf("-");
-        }
-        printf(">");
-        printf("\n");
-    }
-
-    int64_t used_samples = get_example_targets_batch(
-        data->lctx,
-        data->tokens_input,
-        data->target_probs,
-        train->shuffle_next_sample,
-        data->shuffled_samples_begin,
-        data->shuffled_samples_size,
-        data->samples_count,
-        data->tokens_data,
-        data->tokens_size,
-        params->separate_with_eos,
-        params->separate_with_bos,
-        params->fill_with_next_samples);
-
-    train->shuffle_next_sample += used_samples;
-
-    if (train->shuffle_next_sample >= train->shuffle_sample_count) {
-        ++train->train_epochs;
-        printf("%s: reshuffle samples. 
completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs); - // note: we may have used some samples from the current shuffling more than once - train->shuffle_rng_state_current = train->shuffle_rng_state_next; - train->shuffle_rng_state_next = shuffle_samples( - train->shuffle_rng_state_current, - data->shuffled_samples_begin, - data->shuffled_samples_size, - data->samples_begin, - data->samples_size, - data->samples_count); - train->shuffle_next_sample = 0; } } @@ -2023,7 +1887,7 @@ int main(int argc, char ** argv) { save_data.model = &model; save_data.lora = &lora; - struct opt_callback_data opt_cb_data; + struct train_opt_callback_data opt_cb_data; opt_cb_data.params = ¶ms.common; opt_cb_data.train = train; opt_cb_data.save_cb = &save_train_files; @@ -2057,7 +1921,7 @@ int main(int argc, char ** argv) { int64_t t0 = ggml_time_ms(); - ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data); + ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data); ggml_free(ctx_work); ggml_free(ctx_compute); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 5b993b47b0a54..c54727ec54f37 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -693,7 +693,7 @@ static void save_checkpoint_file(const char * filename, const char * fn_vocab_mo struct train_params { struct train_params_common common; - + const char * fn_vocab_model; const char * fn_model_out; @@ -919,144 +919,6 @@ static void save_train_files(void * vdata, struct train_state * train) { } } -struct opt_callback_data { - struct train_params_common * params; - struct train_state * train; - save_train_files_callback save_cb; - void * save_data; - struct llama_context * lctx; - int last_save_iter; - llama_token * tokens_data; - size_t tokens_size; - size_t * samples_begin; - size_t * samples_size; - size_t * shuffled_samples_begin; - size_t * shuffled_samples_size; - size_t samples_count; - struct ggml_tensor * tokens_input; - struct ggml_tensor * target_logits; - struct ggml_tensor * target_probs; - int first_iter; - int64_t last_time; - double millis_per_iter; -}; - -static void opt_callback(void * vdata, int accum_step, float * sched) { - struct opt_callback_data * data = (struct opt_callback_data *) vdata; - struct train_params_common * params = data->params; - struct train_state * train = data->train; - struct ggml_opt_context * opt = train->opt; - int n_batch = params->n_batch; - int n_ctx = params->n_ctx; - - if (accum_step == 0) { - // time measurement - int64_t now = ggml_time_ms(); - if (now > data->last_time && opt->iter > data->first_iter) { - double dt = now - data->last_time; - if (data->millis_per_iter == 0.0) { - data->millis_per_iter = dt; - } else { - const double gain = 0.7; - data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain; - } - } - - double remaining_millis = 0.0; - if (data->millis_per_iter > 0.0) { - const int n_iter = params->adam_n_iter; - const int done_iter = opt->iter - data->first_iter; - const int remaining_iter = n_iter - done_iter; - remaining_millis = remaining_iter * data->millis_per_iter; - } - - // file saving - const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); - if (save_now) { - int new_iters = opt->iter - data->last_save_iter; - train->train_its += new_iters; - 
train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch;
-            train->train_tokens  += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx;
-
-            if (data->save_cb) {
-                data->save_cb(data->save_data, train);
-            }
-
-            data->last_save_iter = opt->iter;
-        }
-
-        // exclude file saving from time measurement, by measuring last_time after saving
-        data->last_time = ggml_time_ms();
-
-        *sched = learning_schedule(
-            opt->iter,
-            params->warmup,
-            params->cos_decay_steps,
-            params->adam_alpha,
-            params->adam_min_alpha,
-            params->cos_decay_min,
-            params->cos_decay_restart,
-            params->enable_restart);
-
-        int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
-        if (impr_plot > 0) impr_plot = 0;
-        if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
-        printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
-            __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
-            *sched, opt->loss_after);
-
-
-        if (data->millis_per_iter > 0) {
-            printf(" dt=");
-            print_duration(data->millis_per_iter);
-            printf(" eta=");
-            print_duration(remaining_millis);
-        }
-
-        float improvement = opt->loss_before - opt->loss_after;
-        const float plot_scale = 10.0f;
-        int bar_len = (int)(1 + improvement*plot_scale + 0.5);
-        printf(" |");
-        for (int i=0; i<bar_len; ++i) {
-            printf("-");
-        }
-        printf(">");
-        printf("\n");
-    }
-
-    int64_t used_samples = get_example_targets_batch(
-        data->lctx,
-        data->tokens_input,
-        data->target_probs,
-        train->shuffle_next_sample,
-        data->shuffled_samples_begin,
-        data->shuffled_samples_size,
-        data->samples_count,
-        data->tokens_data,
-        data->tokens_size,
-        params->separate_with_eos,
-        params->separate_with_bos,
-        params->fill_with_next_samples);
-
-    train->shuffle_next_sample += used_samples;
-
-    if (train->shuffle_next_sample >= train->shuffle_sample_count) {
-        ++train->train_epochs;
-        printf("%s: reshuffle samples. 
completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs); - // note: we may have used some samples from the current shuffling more than once - train->shuffle_rng_state_current = train->shuffle_rng_state_next; - train->shuffle_rng_state_next = shuffle_samples( - train->shuffle_rng_state_current, - data->shuffled_samples_begin, - data->shuffled_samples_size, - data->samples_begin, - data->samples_size, - data->samples_count); - train->shuffle_next_sample = 0; - } - -} - int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); @@ -1211,7 +1073,7 @@ int main(int argc, char ** argv) { save_data.fn_latest = params.common.fn_latest; save_data.model = &model; - struct opt_callback_data opt_cb_data; + struct train_opt_callback_data opt_cb_data; opt_cb_data.params = ¶ms.common; opt_cb_data.train = train; opt_cb_data.save_cb = &save_train_files; @@ -1226,7 +1088,6 @@ int main(int argc, char ** argv) { opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); opt_cb_data.samples_count = train_samples_size.size(); opt_cb_data.tokens_input = NULL; - opt_cb_data.target_logits = NULL; opt_cb_data.target_probs = NULL; opt_cb_data.first_iter = opt->iter; opt_cb_data.last_time = ggml_time_ms(); @@ -1246,10 +1107,7 @@ int main(int argc, char ** argv) { ggml_set_no_alloc(ctx0, false); // don't use alloc for input tensors, so we can safely fill them with data - //struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); - //struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); - struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); ggml_set_no_alloc(ctx0, (alloc != NULL)); @@ -1259,7 +1117,6 @@ int main(int argc, char ** argv) { } opt_cb_data.tokens_input = tokens_input; - opt_cb_data.target_logits = target_logits; opt_cb_data.target_probs = target_probs; int n_past = 0; @@ -1298,7 +1155,7 @@ int main(int argc, char ** argv) { printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); - ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data); + ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data); size_t used_mem_after_opt = ggml_used_mem(ctx0); From 7aa9ea7f20b54544215791a83d393d98febf2b0a Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 16 Sep 2023 19:08:51 +0200 Subject: [PATCH 198/235] fix consume_common_train_arg --- common/train.cpp | 6 +++++- examples/finetune/finetune.cpp | 2 -- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index 99c319253741f..287fe5d937c9a 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1115,7 +1115,11 @@ void print_common_train_usage(int /*argc*/, char ** argv, const struct train_par bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param) { int& i = *idx; - char * arg = argv[i]; + std::string arg = argv[i]; + const std::string arg_prefix = "--"; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } if (arg == "--train-data") { if (++i >= argc) { *invalid_param = true; diff --git 
a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 308e3d5924363..17b559a10bc11 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -514,8 +514,6 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
     ggml_allocr_free(alloc);
 }

-
-
 static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
     const uint32_t n_layer = lora->layers.size();

From 48d35091901090293b66c2f7c9bbe7ae6cfe082e Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sat, 16 Sep 2023 20:20:23 +0200
Subject: [PATCH 199/235] save and load head_count_kv in lora checkpoints

---
 examples/finetune/finetune.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 17b559a10bc11..5c6fa639cba89 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -174,6 +174,7 @@ static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length";
 static const char * LLM_KV_BLOCK_COUNT = "%s.block_count";
 static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length";
 static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count";
+static const char * LLM_KV_ATTENTION_HEAD_COUNT_KV = "%s.attention.head_count_kv";
 static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon";
 static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count";
 static const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp
@@ -865,8 +866,11 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
     GGUF_GET_KEY(fctx, model->hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
     GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));

+    model->hparams.n_head_kv = model->hparams.n_head;
+    GGUF_GET_KEY(fctx, model->hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
+
     model->hparams.n_rot = model->hparams.n_embd / model->hparams.n_head;
-    GGUF_GET_KEY(fctx, model->hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
+    GGUF_GET_KEY(fctx, model->hparams.n_rot,     gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));

     float rope_freq_scale = 1.0f;
     GGUF_GET_KEY(fctx, lora->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -939,6 +943,7 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
     gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd);
     gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff);
     gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head);
+    gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV), model->hparams.n_head_kv);
     gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer);
     gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_rot);
     gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), lora->hparams.f_norm_rms_eps);

From 571dc94da986b6f039ad9a16d5080d72dec186b0 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sat, 16 Sep 2023 20:23:05 +0200
Subject: [PATCH 200/235] increase train_samples by used_samples instead of number of batches. One batch can contain more than one sample when option "fill_with_next_samples" is 
used --- common/train.cpp | 6 +++--- examples/finetune/finetune.cpp | 1 - .../train-text-from-scratch/train-text-from-scratch.cpp | 1 - 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index 287fe5d937c9a..9357fab0f2f81 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1367,9 +1367,8 @@ void train_opt_callback(void * vdata, int accum_step, float * sched) { const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every); if (save_now) { int new_iters = opt->iter - data->last_save_iter; - train->train_its += new_iters; - train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; - train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx; + train->train_its += new_iters; + train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx; if (data->save_cb) { data->save_cb(data->save_data, train); @@ -1431,6 +1430,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched) { params->separate_with_bos, params->fill_with_next_samples); + train->train_samples += used_samples; train->shuffle_next_sample += used_samples; if (train->shuffle_next_sample >= train->shuffle_sample_count) { diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 5c6fa639cba89..c1227897c9bf6 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1938,7 +1938,6 @@ int main(int argc, char ** argv) { int new_iters = opt->iter - opt_cb_data.last_save_iter; if (new_iters > 0) { train->train_its += new_iters; - train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; save_train_files(&save_data, train); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index c54727ec54f37..88174e064cfb7 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1183,7 +1183,6 @@ int main(int argc, char ** argv) { int new_iters = opt->iter - opt_cb_data.last_save_iter; if (new_iters > 0) { train->train_its += new_iters; - train->train_samples += new_iters * opt->params.n_gradient_accumulation * n_batch; train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; save_train_files(&save_data, train); From 7930caf24c509698b68c47f342de552070dbf0cb Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 16 Sep 2023 20:36:43 +0200 Subject: [PATCH 201/235] fix usage of llama_tokenize --- common/train.cpp | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index 9357fab0f2f81..ef147b140a5c2 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -853,10 +853,22 @@ size_t tokenize_file( // tokenize all data at once out_tokens.resize(buf.size() + n_max_tokens_overhead); - int n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), (int) out_tokens.size(), false); + int n_tokens = llama_tokenize( + lctx, + buf.data(), + (int) buf.size(), + out_tokens.data(), + (int) out_tokens.size(), + false); if (n_tokens < 0) { out_tokens.resize(-n_tokens); - n_tokens = llama_tokenize(lctx, buf.data(), out_tokens.data(), (int) out_tokens.size(), false); + n_tokens = llama_tokenize( + lctx, + buf.data(), + (int) buf.size(), + 
out_tokens.data(),
+            (int) out_tokens.size(),
+            false);
     }
     if (n_tokens >= 0) {
         out_tokens.resize(n_tokens);
@@ -948,14 +960,18 @@ size_t tokenize_file(
                 tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
                 int n_tokens = llama_tokenize(lctx,
                     buf_sample.data(),
+                    (int) buf_sample.size(),
                     tok_sample.data(),
-                    (int) tok_sample.size(), false);
+                    (int) tok_sample.size(),
+                    false);
                 if (n_tokens < 0) {
                     tok_sample.resize(-n_tokens);
                     n_tokens = llama_tokenize(lctx,
                         buf_sample.data(),
+                        (int) buf_sample.size(),
                         tok_sample.data(),
-                        (int) tok_sample.size(), false);
+                        (int) tok_sample.size(),
+                        false);
                     GGML_ASSERT(n_tokens >= 0);
                 }
                 GGML_ASSERT(n_tokens <= (int) tok_sample.size());

From 8d82d4c8e625aafc088871725b1fd1a1bbcf5233 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sat, 16 Sep 2023 20:37:56 +0200
Subject: [PATCH 202/235] remove static from process_escapes since we need it exposed in header

---
 common/common.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 91acb66a067ef..ad45d84f81b8a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }

-static void process_escapes(std::string& input) {
+void process_escapes(std::string& input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;

From 9139fec7ffff2c83d8916052f90a4b0e2ab99bc2 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sat, 16 Sep 2023 20:38:23 +0200
Subject: [PATCH 203/235] fix code formatting of long function declarations

---
 common/train.cpp | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/common/train.cpp b/common/train.cpp
index ef147b140a5c2..c7eaa72dff8a3 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -39,7 +39,9 @@ struct ggml_opt_context * get_train_state_opt(struct train_state * state) {
     return state->opt;
 }

-struct random_normal_distribution * init_random_normal_distribution(int seed, float mean, float std, float min, float max) {
+struct random_normal_distribution * init_random_normal_distribution(
+    int seed, float mean, float std, float min, float max
+) {
     struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution));
     rnd->gen = std::mt19937(seed);
     rnd->rd = std::normal_distribution<float>{mean, std};
@@ -203,18 +205,19 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
 }

 int64_t get_example_targets_batch(
-    struct llama_context * lctx,
-    struct ggml_tensor * tokens_input,
-    struct ggml_tensor * target_probs,
-    int64_t example_id,
-    const size_t * samples_begin,
-    const size_t * samples_size,
-    size_t samples_count,
-    const llama_token * train_data,
-    size_t n_train_data,
-    bool separate_with_eos,
-    bool separate_with_bos,
-    bool fill_with_next_samples) {
+        struct llama_context * lctx,
+        struct ggml_tensor   * tokens_input,
+        struct ggml_tensor   * target_probs,
+        int64_t                example_id,
+        const size_t         * samples_begin,
+        const size_t         * samples_size,
+        size_t                 samples_count,
+        const llama_token    * train_data,
+        size_t                 n_train_data,
+        bool                   separate_with_eos,
+        bool                   separate_with_bos,
+        bool                   fill_with_next_samples
+) {
     GGML_ASSERT(tokens_input->n_dims == 2);
     GGML_ASSERT(target_probs->n_dims == 3);

@@ -1129,7 +1132,9 @@ void print_common_train_usage(int /*argc*/, char ** argv, const struct train_par
     fprintf(stderr, "\n");
 }

-bool consume_common_train_arg(int argc, char ** argv, int 
* idx, struct train_params_common * params, bool * invalid_param) { +bool consume_common_train_arg( + int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param +) { int& i = *idx; std::string arg = argv[i]; const std::string arg_prefix = "--"; From 1d33ec5b1c2aa3e7e673082ee72049795b2e1d66 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 16 Sep 2023 21:10:16 +0200 Subject: [PATCH 204/235] fix condition in load_train_state_gguf --- common/train.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/train.cpp b/common/train.cpp index c7eaa72dff8a3..f38c25bd5abc8 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -653,7 +653,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * } bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train) { - if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) >= 0) { + if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) < 0) { return false; } From 1d099651792713da8a0e3aaf0ca2594dfcc5b40d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 16 Sep 2023 21:12:16 +0200 Subject: [PATCH 205/235] use die("msg") instead of replace GGML_ASSERT(!"msg") or throw std::runtime_error("msg") --- common/train.cpp | 6 +++--- examples/finetune/finetune.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index f38c25bd5abc8..1eec3e3fbd6d0 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -109,7 +109,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct } break; default: - GGML_ASSERT(!"Unsupported tensor->n_dims"); + die("Unsupported tensor->n_dims"); }; return tensor; } @@ -153,7 +153,7 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc } break; default: - GGML_ASSERT(!"Unsupported tensor->n_dims"); + die("Unsupported tensor->n_dims"); }; return tensor; } @@ -581,7 +581,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g copy_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); copy_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); } else { - throw std::runtime_error("unknown optimizer type\n"); + die("unknown optimizer type\n"); } } diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index c1227897c9bf6..5480754934316 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1664,12 +1664,12 @@ int main(int argc, char ** argv) { if (opt_param_count_changed) { print_lora_params(&lora.hparams); - GGML_ASSERT(!"Provided rank differs from checkpoint file. To use different rank start finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting."); + die("Provided rank differs from checkpoint file. To use different rank start finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting."); // need to discard previous optimizer gradient statistics and opt_init with new shapes // TODO } if (opt_past_changed) { - GGML_ASSERT(!"Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting"); + die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. 
Aborting"); // need to discard previous optimizer past function value statistics and opt_init with new shapes // TODO } From 9db2664dd19fec8f1a36c64a2dc842c552a6b77f Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 16 Sep 2023 21:21:04 +0200 Subject: [PATCH 206/235] fix saving and loading of training type --- common/train.cpp | 9 --------- examples/finetune/finetune.cpp | 11 ++++++++++- .../train-text-from-scratch.cpp | 10 +++++++++- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index 1eec3e3fbd6d0..3724d75c256df 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -497,15 +497,11 @@ static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer. static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; -static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; -static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; -static const char * LLM_KV_TRAINING_TYPE = "training.type"; static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; static const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; static const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; static const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; static const char * LLM_KV_TRAINING_EPOCH_COUNT = "training.epoch_count"; -static const char * LLM_KV_TRAINING_SAMPLES_HASH = "training.samples_hash"; static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash"; static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE = "training.shuffle.rng_state"; static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count"; @@ -661,10 +657,6 @@ bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_g GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); GGML_ASSERT(file_version <= 1); - std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA; - GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); - GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA); - if (file_version == 0) { GGUF_GET_KEY(fctx, train->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); @@ -690,7 +682,6 @@ bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_g void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train) { gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 1); - gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA); gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, train->train_its); gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, train->train_samples); gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT, train->train_tokens); diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 5480754934316..ae3582a54e7a9 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -151,6 +151,10 @@ struct my_llama_lora { }; // gguf constants +static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; +static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; +static const char * LLM_KV_TRAINING_TYPE = "training.type"; + static const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"; static 
const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm";
 static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT      = "training.lora.rank.output";
@@ -994,11 +998,16 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
 }

 static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora);
+    std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA;
+    GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE);
+    GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA);
+
     load_train_state_gguf(fctx, f_ggml_ctx, train);
+    load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora);
 }

 static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
+    gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA);
     save_llama_lora_gguf(fctx, model, lora);
     save_train_state_gguf(fctx, train);
 }
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 88174e064cfb7..5c37776f37444 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -68,6 +68,9 @@ struct my_llama_model {
 };

 // gguf constants (sync with gguf.py)
+static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL   = "train_model";
+static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora";
+static const char * LLM_KV_TRAINING_TYPE               = "training.type";

 static const char * LLM_KV_GENERAL_ARCHITECTURE        = "general.architecture";
 static const char * LLM_KV_GENERAL_FILE_TYPE           = "general.file_type";
@@ -654,12 +657,17 @@ static void save_llama_model_file(const char * filename, const char * fn_vocab_m

 static void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct train_state * train) {
     load_llama_model_gguf(fctx, f_ggml_ctx, model);
-    if (!load_train_state_gguf(fctx, f_ggml_ctx, train)) {
+    if (load_train_state_gguf(fctx, f_ggml_ctx, train)) {
+        std::string train_type = LLM_KV_TRAINING_TYPE_TRAIN_MODEL;
+        GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE);
+        GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_TRAIN_MODEL);
+    } else {
         printf("%s: loaded llama model as checkpoint\n", __func__);
     }
 }

 static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) {
+    gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL);
     save_llama_model_gguf(fctx, fn_vocab_model, model);
     save_train_state_gguf(fctx, train);
 }

From dd3e7634f036eb22c69cba7ea6008df7e974a36d Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sat, 16 Sep 2023 21:30:49 +0200
Subject: [PATCH 207/235] remove terminating '\0' from tokenization (llama_tokenize is now passed the string length instead of relying on terminating '\0')

---
 common/train.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/common/train.cpp b/common/train.cpp
index 3724d75c256df..7ffaf94a8d14d 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -832,10 +832,9 @@ size_t tokenize_file(
     const int n_max_tokens_overhead = 1;

     std::vector<char> buf;
-    buf.resize(f.size+1);
+    buf.resize(f.size);

     f.read_raw(buf.data(), f.size);
-    buf[f.size] = '\0';

     std::vector<int> utf8_units;
     std::vector<int> utf8_nunits;
@@ -879,7 +878,7 @@ size_t tokenize_file(
         }
     } else {
         // split data into samples and tokenize each sample
-        std::string data_str(buf.data(), buf.size()-1);
+        std::string data_str(buf.data(), buf.size());
         out_samples_begin.clear();
         out_samples_size.clear();
         out_tokens.clear();
@@ -944,9 +943,8 @@ size_t tokenize_file(
             if (sample_size > 0) {
                 // llama_tokenize expects zero terminated string,
                 // copy sample into buffer and zero terminate it.
-                buf_sample.resize(sample_size+1);
+                buf_sample.resize(sample_size);
                 memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
-                buf_sample[sample_size] = '\0';

                 // printf("sample: '%s'\n", buf_sample.data());

From 83061fbdbe237d3e12674714ea741b5bd5e3d037 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sat, 16 Sep 2023 22:19:46 +0200
Subject: [PATCH 208/235] fix compile warnings

---
 common/train.cpp | 22 ++++++++++---------
 common/train.h | 2 +-
 examples/finetune/finetune.cpp | 2 +-
 .../train-text-from-scratch.cpp | 2 +-
 ggml.c | 1 -
 5 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/common/train.cpp b/common/train.cpp
index 7ffaf94a8d14d..e54f9b5fe4da1 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -17,11 +17,17 @@ struct random_uniform_distribution {
     std::uniform_real_distribution<float> rd;
 };

-struct train_state * init_train_state(int seed) {
+struct train_state * init_train_state() {
     struct train_state * state = (struct train_state *) malloc(sizeof(struct train_state));
-    memset(state, 0, sizeof(struct train_state));
+    state->train_its     = 0;
+    state->train_samples = 0;
+    state->train_tokens  = 0;
+    state->train_epochs  = 0;
+    state->shuffle_samples_hash = 0;
+    state->shuffle_sample_count = 0;
+    state->shuffle_next_sample  = 0;
     state->shuffle_rng_state_current = "";
-    state->shuffle_rng_state_next = "";
+    state->shuffle_rng_state_next    = "";

     state->opt = (struct ggml_opt_context *) malloc(sizeof(struct ggml_opt_context));
     memset(state->opt, 0, sizeof(struct ggml_opt_context));
@@ -35,10 +41,6 @@ void free_train_state(struct train_state * state) {
     free(state);
 }

-struct ggml_opt_context * get_train_state_opt(struct train_state * state) {
-    return state->opt;
-}
-
 struct random_normal_distribution * init_random_normal_distribution(
     int seed, float mean, float std, float min, float max
 ) {
@@ -741,7 +743,7 @@ struct llama_file {
             die_fmt("read error: %s", strerror(errno));
         }
         if (ret != 1) {
-            die_fmt("unexpectedly reached end of file");
+            die("unexpectedly reached end of file");
         }
     }
@@ -840,7 +842,7 @@ size_t tokenize_file(
     std::vector<int> utf8_nunits;
     utf8_units.resize(buf.size());
     utf8_nunits.resize(buf.size());
-    size_t n_utf8_chars = mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size());
+    mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size());

     if (sample_start.size() == 0) {
         // tokenize all data at once
@@ -1070,7 +1072,7 @@ struct train_params_common get_default_train_params_common() {
     return params;
 }

-void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params) {
+void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train_params_common * params) {
     // fprintf(stderr, "usage: %s [options]\n", argv[0]);
     // fprintf(stderr, "\n");
     // fprintf(stderr, "options:\n");
diff --git a/common/train.h b/common/train.h
index db63a5d168462..97f08964d5974 100644
--- a/common/train.h
+++ 
b/common/train.h @@ -103,7 +103,7 @@ struct train_opt_callback_data { double millis_per_iter; }; -struct train_state * init_train_state(int seed); +struct train_state * init_train_state(); void free_train_state(struct train_state * state); struct train_params_common get_default_train_params_common(); diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index ae3582a54e7a9..50eda730d14ef 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1582,7 +1582,7 @@ int main(int argc, char ** argv) { struct my_llama_lora lora; - struct train_state * train = init_train_state(params.common.seed); + struct train_state * train = init_train_state(); struct ggml_opt_context * opt = train->opt; load_default_lora_params_from_base_model(params.fn_model_base, &lora.hparams); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 5c37776f37444..861d0829453e1 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -972,7 +972,7 @@ int main(int argc, char ** argv) { int n_vocab = model.hparams.n_vocab; int n_batch = params.common.n_batch; - struct train_state * train = init_train_state(params.common.seed); + struct train_state * train = init_train_state(); struct ggml_opt_context * opt = train->opt; struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); diff --git a/ggml.c b/ggml.c index e00324f8aad77..ec9ea80a4cae5 100644 --- a/ggml.c +++ b/ggml.c @@ -15134,7 +15134,6 @@ static void ggml_compute_forward_flash_attn_back_f32( const int64_t elem_q = ggml_nelements(q); const int64_t elem_k = ggml_nelements(k); - const int64_t elem_v = ggml_nelements(v); enum ggml_type result_type = dst->type; GGML_ASSERT(ggml_blck_size(result_type) == 1); From 8721785c52d74e4b6eaf7930c620e74e4c4319a0 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sat, 16 Sep 2023 22:28:23 +0200 Subject: [PATCH 209/235] fix compile warnings --- examples/finetune/finetune.cpp | 27 +++++++++---------- .../train-text-from-scratch.cpp | 1 - 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 50eda730d14ef..ea1a68b0d7b22 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -151,7 +151,6 @@ struct my_llama_lora { }; // gguf constants -static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; static const char * LLM_KV_TRAINING_TYPE = "training.type"; @@ -1085,7 +1084,7 @@ struct llama_file { die_fmt("read error: %s", strerror(errno)); } if (ret != 1) { - die_fmt("unexpectedly reached end of file"); + die("unexpectedly reached end of file"); } } @@ -1599,18 +1598,18 @@ int main(int argc, char ** argv) { } lora.hparams.lora_r = params.lora_r; lora.hparams.lora_alpha = params.custom_lora_alpha ? params.lora_alpha : params.lora_r; - int n_rank_attention_norm = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1; - int n_rank_wq = params.custom_n_rank_wq ? params.n_rank_wq : params.lora_r; - int n_rank_wk = params.custom_n_rank_wk ? params.n_rank_wk : params.lora_r; - int n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; - int n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; - int n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? 
params.n_rank_ffn_norm : 1; - int n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; - int n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; - int n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; - int n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; - int n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; - int n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; + uint32_t n_rank_attention_norm = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1; + uint32_t n_rank_wq = params.custom_n_rank_wq ? params.n_rank_wq : params.lora_r; + uint32_t n_rank_wk = params.custom_n_rank_wk ? params.n_rank_wk : params.lora_r; + uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; + uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; + uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; + uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; + uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; + uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; + uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; + uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; + uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; lora.hparams.n_rank_attention_norm = n_rank_attention_norm; lora.hparams.n_rank_wq = n_rank_wq; lora.hparams.n_rank_wk = n_rank_wk; diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 861d0829453e1..d5cf426652bec 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -69,7 +69,6 @@ struct my_llama_model { // gguf constants (sync with gguf.py) static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; -static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; static const char * LLM_KV_TRAINING_TYPE = "training.type"; static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; From ddf5ac257ae63fa5fb301571b4da74389262b06a Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 12:48:17 +0200 Subject: [PATCH 210/235] use new/delete for train_state instead of malloc/free using malloc may result in seg faults when trying to assign string fields --- common/train.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index e54f9b5fe4da1..fd34e026e63c4 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -18,7 +18,7 @@ struct random_uniform_distribution { }; struct train_state * init_train_state() { - struct train_state * state = (struct train_state *) malloc(sizeof(struct train_state)); + struct train_state * state = new struct train_state; state->train_its = 0; state->train_samples = 0; state->train_tokens = 0; @@ -29,16 +29,16 @@ struct train_state * init_train_state() { state->shuffle_rng_state_current = ""; state->shuffle_rng_state_next = ""; - state->opt = (struct ggml_opt_context *) malloc(sizeof(struct ggml_opt_context)); - memset(state->opt, 0, sizeof(struct ggml_opt_context)); + state->opt = new struct ggml_opt_context; + 
state->opt->ctx = NULL; state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM); return state; } void free_train_state(struct train_state * state) { - free(state->opt); - free(state); + delete state->opt; + delete state; } struct random_normal_distribution * init_random_normal_distribution( @@ -932,7 +932,7 @@ size_t tokenize_file( : (i+1 < out_samples_begin.size() ? out_samples_begin[i+1] : data_str.size()); - if (utf8_units[sample_end] > 0) { + if (sample_end < utf8_units.size() && utf8_units[sample_end] > 0) { // sample end is in the middle of an utf8 character. // advance sample_end to the begin of the next utf8 character. sample_end += utf8_nunits[sample_end] - utf8_units[sample_end]; From 151bfe9ee1b006cc266f4d5a65429aaeb660eb8d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 13:07:17 +0200 Subject: [PATCH 211/235] assert that sample_count > 0, avoiding division by zero --- common/train.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/train.cpp b/common/train.cpp index fd34e026e63c4..5edd5876e7743 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -220,7 +220,7 @@ int64_t get_example_targets_batch( bool separate_with_bos, bool fill_with_next_samples ) { - + GGML_ASSERT(samples_count > 0); GGML_ASSERT(tokens_input->n_dims == 2); GGML_ASSERT(target_probs->n_dims == 3); int64_t n_vocab = target_probs->ne[0]; From bf2ad65836007fb1060f2ea509137d47f5566f1a Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 14:28:58 +0200 Subject: [PATCH 212/235] fix frand to return value in interval [0,1) --- common/train.cpp | 2 +- common/train.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/common/train.cpp b/common/train.cpp index 5edd5876e7743..991679292debb 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -161,7 +161,7 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc } float frand() { - return (float)rand()/(float)RAND_MAX; + return (float)rand()/((float)(RAND_MAX) + 1.0f); } float frand_normal(struct random_normal_distribution * rnd) { diff --git a/common/train.h b/common/train.h index 97f08964d5974..4857ba92281d6 100644 --- a/common/train.h +++ b/common/train.h @@ -124,6 +124,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd); struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd); struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd); +// generate random float in interval [0,1) float frand(); float frand_normal (struct random_normal_distribution * rnd); float frand_uniform(struct random_uniform_distribution * rnd); From d1bb6fb3499efefac9e1eb7bab0cc1fdf08e66b3 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 14:37:41 +0200 Subject: [PATCH 213/235] add train option "--sample-random-offsets" Use samples beginning at random offsets. The offset is only applied to the first sample of each batch context window. Together with "--fill-with-next-samples" this may help when training for endless text generation. For example, given a dataset containing the samples "abcd", "ABCD", "0123": with a context size of 8 and the options "--fill-with-next-samples", "--no-separate-with-eos", "--no-separate-with-bos", the context windows of batches could only be filled with "abcdABCD", "ABCDabcd", "0123abcd", etc. With "--sample-random-offsets" they can also be filled with "23abcdAB", "bcd0123A", etc.
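A minimal sketch of the intended mechanics (hypothetical names such as sample_idx; the actual implementation in shuffle_samples and get_example_targets_batch may differ):

    // one offset per sample, drawn while shuffling; frand() returns values
    // in [0,1) (see the frand fix above), so offs < size always holds
    shuffled_offs[i] = (size_t)(frand() * (float)shuffled_sizes[i]);

    // when a sample starts a context window, begin it at its stored offset
    size_t offs  = sample_random_offsets ? samples_offs[sample_idx] : 0;
    size_t begin = samples_begin[sample_idx] + offs;
    size_t size  = samples_size[sample_idx]  - offs;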
--- common/train.cpp | 21 ++++++++++++++++--- common/train.h | 7 ++++++- examples/finetune/finetune.cpp | 4 ++++ .../train-text-from-scratch.cpp | 4 ++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index 991679292debb..10e0107eb3bc0 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -211,6 +211,7 @@ int64_t get_example_targets_batch( struct ggml_tensor * tokens_input, struct ggml_tensor * target_probs, int64_t example_id, + const size_t * samples_offs, const size_t * samples_begin, const size_t * samples_size, size_t samples_count, @@ -218,7 +219,8 @@ int64_t get_example_targets_batch( size_t n_train_data, bool separate_with_eos, bool separate_with_bos, - bool fill_with_next_samples + bool fill_with_next_samples, + bool sample_random_offsets ) { GGML_ASSERT(samples_count > 0); GGML_ASSERT(tokens_input->n_dims == 2); @@ -238,8 +240,8 @@ int64_t get_example_targets_batch( // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); for (int k=0; kseparate_with_bos ? " (default)" : ""); fprintf(stderr, " --no-separate-with-eos When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : ""); fprintf(stderr, " --no-separate-with-bos When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : ""); + fprintf(stderr, " --sample-random-offsets Use samples beginning at random offsets. Together with fill-with-next-samples this may help for training endless text generation.%s\n", params->sample_random_offsets ? " (default)" : ""); fprintf(stderr, " --force-reshuffle Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n"); fprintf(stderr, " --no-flash Don't use flash attention \n"); fprintf(stderr, " --use-flash Use flash attention (default)\n"); @@ -1221,6 +1231,8 @@ bool consume_common_train_arg( params->separate_with_eos = false; } else if (arg == "--no-separate-with-bos") { params->separate_with_bos = false; + } else if (arg == "--sample-random-offsets") { + params->sample_random_offsets = true; } else if (arg == "--force-reshuffle") { params->force_reshuffle = true; } else if (arg == "--no-flash") { @@ -1433,6 +1445,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched) { data->tokens_input, data->target_probs, train->shuffle_next_sample, + data->shuffled_samples_offs, data->shuffled_samples_begin, data->shuffled_samples_size, data->samples_count, @@ -1440,7 +1453,8 @@ void train_opt_callback(void * vdata, int accum_step, float * sched) { data->tokens_size, params->separate_with_eos, params->separate_with_bos, - params->fill_with_next_samples); + params->fill_with_next_samples, + params->sample_random_offsets); train->train_samples += used_samples; train->shuffle_next_sample += used_samples; @@ -1452,6 +1466,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched) { train->shuffle_rng_state_current = train->shuffle_rng_state_next; train->shuffle_rng_state_next = shuffle_samples( train->shuffle_rng_state_current, + data->shuffled_samples_offs, data->shuffled_samples_begin, data->shuffled_samples_size, data->samples_begin, diff --git a/common/train.h b/common/train.h index 4857ba92281d6..6ef1f9fc50542 100644 --- a/common/train.h +++ b/common/train.h @@ -56,6 +56,7 @@ struct train_params_common { bool fill_with_next_samples; bool separate_with_eos; bool 
separate_with_bos; + bool sample_random_offsets; bool force_reshuffle; @@ -93,6 +94,7 @@ struct train_opt_callback_data { size_t tokens_size; size_t * samples_begin; size_t * samples_size; + size_t * shuffled_samples_offs; size_t * shuffled_samples_begin; size_t * shuffled_samples_size; size_t samples_count; @@ -153,6 +155,7 @@ int64_t get_example_targets_batch( struct ggml_tensor * tokens_input, struct ggml_tensor * target_probs, int64_t example_id, + const size_t * samples_offs, const size_t * samples_begin, const size_t * samples_size, size_t samples_count, @@ -160,7 +163,8 @@ int64_t get_example_targets_batch( size_t n_train_data, bool separate_with_eos, bool separate_with_bos, - bool fill_with_next_samples); + bool fill_with_next_samples, + bool sample_random_offsets); void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state); @@ -169,6 +173,7 @@ mt19937_state mt19937_seed_to_state(unsigned seed); mt19937_state shuffle_samples( const mt19937_state & rng_state, + size_t * shuffled_offs, size_t * shuffled_begins, size_t * shuffled_sizes, const size_t * begins, diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index ea1a68b0d7b22..e631451a58b64 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1876,12 +1876,15 @@ int main(int argc, char ** argv) { train->shuffle_next_sample = 0; train->shuffle_samples_hash = shuffle_samples_hash; } + std::vector train_shuffled_samples_offs; std::vector train_shuffled_samples_begin; std::vector train_shuffled_samples_size; + train_shuffled_samples_offs.resize(train_samples_begin.size()); train_shuffled_samples_begin.resize(train_samples_begin.size()); train_shuffled_samples_size.resize(train_samples_size.size()); train->shuffle_rng_state_next = shuffle_samples( train->shuffle_rng_state_current, + train_shuffled_samples_offs.data(), train_shuffled_samples_begin.data(), train_shuffled_samples_size.data(), train_samples_begin.data(), @@ -1909,6 +1912,7 @@ int main(int argc, char ** argv) { opt_cb_data.tokens_size = train_tokens.size(); opt_cb_data.samples_begin = train_samples_begin.data(); opt_cb_data.samples_size = train_samples_size.data(); + opt_cb_data.shuffled_samples_offs = train_shuffled_samples_offs.data(); opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data(); opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); opt_cb_data.samples_count = train_samples_size.size(); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index d5cf426652bec..0da7ec11b311c 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1059,12 +1059,15 @@ int main(int argc, char ** argv) { train->shuffle_next_sample = 0; train->shuffle_samples_hash = shuffle_samples_hash; } + std::vector train_shuffled_samples_offs; std::vector train_shuffled_samples_begin; std::vector train_shuffled_samples_size; + train_shuffled_samples_offs.resize(train_samples_begin.size()); train_shuffled_samples_begin.resize(train_samples_begin.size()); train_shuffled_samples_size.resize(train_samples_size.size()); train->shuffle_rng_state_next = shuffle_samples( train->shuffle_rng_state_current, + train_shuffled_samples_offs.data(), train_shuffled_samples_begin.data(), train_shuffled_samples_size.data(), train_samples_begin.data(), @@ -1091,6 +1094,7 @@ int main(int argc, char ** argv) { opt_cb_data.tokens_size = 
train_tokens.size(); opt_cb_data.samples_begin = train_samples_begin.data(); opt_cb_data.samples_size = train_samples_size.data(); + opt_cb_data.shuffled_samples_offs = train_shuffled_samples_offs.data(); opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data(); opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); opt_cb_data.samples_count = train_samples_size.size(); From 56a03faf5f07e7cdac04420e71aaa860671da69a Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 16:37:21 +0200 Subject: [PATCH 214/235] deduplicate code into function --- examples/finetune/finetune.cpp | 169 ++++++++++++--------------------- 1 file changed, 60 insertions(+), 109 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index e631451a58b64..b0c03a749566c 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -306,6 +306,63 @@ static void set_param_lora(struct my_llama_lora * lora) { } } +static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) { + ggml_allocr_alloc(alloc, lora->tok_embeddings_a); + ggml_allocr_alloc(alloc, lora->tok_embeddings_b); + ggml_allocr_alloc(alloc, lora->norm_a); + ggml_allocr_alloc(alloc, lora->norm_b); + ggml_allocr_alloc(alloc, lora->output_a); + ggml_allocr_alloc(alloc, lora->output_b); + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm_a); + ggml_allocr_alloc(alloc, layer.attention_norm_b); + ggml_allocr_alloc(alloc, layer.wq_a); + ggml_allocr_alloc(alloc, layer.wq_b); + ggml_allocr_alloc(alloc, layer.wk_a); + ggml_allocr_alloc(alloc, layer.wk_b); + ggml_allocr_alloc(alloc, layer.wv_a); + ggml_allocr_alloc(alloc, layer.wv_b); + ggml_allocr_alloc(alloc, layer.wo_a); + ggml_allocr_alloc(alloc, layer.wo_b); + ggml_allocr_alloc(alloc, layer.ffn_norm_a); + ggml_allocr_alloc(alloc, layer.ffn_norm_b); + ggml_allocr_alloc(alloc, layer.w1_a); + ggml_allocr_alloc(alloc, layer.w1_b); + ggml_allocr_alloc(alloc, layer.w2_a); + ggml_allocr_alloc(alloc, layer.w2_b); + ggml_allocr_alloc(alloc, layer.w3_a); + ggml_allocr_alloc(alloc, layer.w3_b); + } + ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad); + ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad); + ggml_allocr_alloc(alloc, lora->norm_a->grad); + ggml_allocr_alloc(alloc, lora->norm_b->grad); + ggml_allocr_alloc(alloc, lora->output_a->grad); + ggml_allocr_alloc(alloc, lora->output_b->grad); + for (uint32_t i = 0; i < lora->layers.size(); ++i) { + auto & layer = lora->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm_a->grad); + ggml_allocr_alloc(alloc, layer.attention_norm_b->grad); + ggml_allocr_alloc(alloc, layer.wq_a->grad); + ggml_allocr_alloc(alloc, layer.wq_b->grad); + ggml_allocr_alloc(alloc, layer.wk_a->grad); + ggml_allocr_alloc(alloc, layer.wk_b->grad); + ggml_allocr_alloc(alloc, layer.wv_a->grad); + ggml_allocr_alloc(alloc, layer.wv_b->grad); + ggml_allocr_alloc(alloc, layer.wo_a->grad); + ggml_allocr_alloc(alloc, layer.wo_b->grad); + ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad); + ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad); + ggml_allocr_alloc(alloc, layer.w1_a->grad); + ggml_allocr_alloc(alloc, layer.w1_b->grad); + ggml_allocr_alloc(alloc, layer.w2_a->grad); + ggml_allocr_alloc(alloc, layer.w2_b->grad); + ggml_allocr_alloc(alloc, layer.w3_a->grad); + ggml_allocr_alloc(alloc, layer.w3_b->grad); + } +} + static void init_lora(const struct my_llama_model * model, struct my_llama_lora * 
lora) { const auto & lparams = lora->hparams; @@ -400,121 +457,15 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora set_param_lora(lora); // measure data size - ggml_allocr * alloc = NULL; + struct ggml_allocr * alloc = NULL; alloc = ggml_allocr_new_measure(tensor_alignment); - ggml_allocr_alloc(alloc, lora->tok_embeddings_a); - ggml_allocr_alloc(alloc, lora->tok_embeddings_b); - ggml_allocr_alloc(alloc, lora->norm_a); - ggml_allocr_alloc(alloc, lora->norm_b); - ggml_allocr_alloc(alloc, lora->output_a); - ggml_allocr_alloc(alloc, lora->output_b); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - ggml_allocr_alloc(alloc, layer.attention_norm_a); - ggml_allocr_alloc(alloc, layer.attention_norm_b); - ggml_allocr_alloc(alloc, layer.wq_a); - ggml_allocr_alloc(alloc, layer.wq_b); - ggml_allocr_alloc(alloc, layer.wk_a); - ggml_allocr_alloc(alloc, layer.wk_b); - ggml_allocr_alloc(alloc, layer.wv_a); - ggml_allocr_alloc(alloc, layer.wv_b); - ggml_allocr_alloc(alloc, layer.wo_a); - ggml_allocr_alloc(alloc, layer.wo_b); - ggml_allocr_alloc(alloc, layer.ffn_norm_a); - ggml_allocr_alloc(alloc, layer.ffn_norm_b); - ggml_allocr_alloc(alloc, layer.w1_a); - ggml_allocr_alloc(alloc, layer.w1_b); - ggml_allocr_alloc(alloc, layer.w2_a); - ggml_allocr_alloc(alloc, layer.w2_b); - ggml_allocr_alloc(alloc, layer.w3_a); - ggml_allocr_alloc(alloc, layer.w3_b); - } - ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad); - ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad); - ggml_allocr_alloc(alloc, lora->norm_a->grad); - ggml_allocr_alloc(alloc, lora->norm_b->grad); - ggml_allocr_alloc(alloc, lora->output_a->grad); - ggml_allocr_alloc(alloc, lora->output_b->grad); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - ggml_allocr_alloc(alloc, layer.attention_norm_a->grad); - ggml_allocr_alloc(alloc, layer.attention_norm_b->grad); - ggml_allocr_alloc(alloc, layer.wq_a->grad); - ggml_allocr_alloc(alloc, layer.wq_b->grad); - ggml_allocr_alloc(alloc, layer.wk_a->grad); - ggml_allocr_alloc(alloc, layer.wk_b->grad); - ggml_allocr_alloc(alloc, layer.wv_a->grad); - ggml_allocr_alloc(alloc, layer.wv_b->grad); - ggml_allocr_alloc(alloc, layer.wo_a->grad); - ggml_allocr_alloc(alloc, layer.wo_b->grad); - ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad); - ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad); - ggml_allocr_alloc(alloc, layer.w1_a->grad); - ggml_allocr_alloc(alloc, layer.w1_b->grad); - ggml_allocr_alloc(alloc, layer.w2_a->grad); - ggml_allocr_alloc(alloc, layer.w2_b->grad); - ggml_allocr_alloc(alloc, layer.w3_a->grad); - ggml_allocr_alloc(alloc, layer.w3_b->grad); - } + alloc_lora(alloc, lora); // allocate data lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment); ggml_allocr_free(alloc); alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment); - ggml_allocr_alloc(alloc, lora->tok_embeddings_a); - ggml_allocr_alloc(alloc, lora->tok_embeddings_b); - ggml_allocr_alloc(alloc, lora->norm_a); - ggml_allocr_alloc(alloc, lora->norm_b); - ggml_allocr_alloc(alloc, lora->output_a); - ggml_allocr_alloc(alloc, lora->output_b); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - ggml_allocr_alloc(alloc, layer.attention_norm_a); - ggml_allocr_alloc(alloc, layer.attention_norm_b); - ggml_allocr_alloc(alloc, layer.wq_a); - ggml_allocr_alloc(alloc, layer.wq_b); - ggml_allocr_alloc(alloc, layer.wk_a); - ggml_allocr_alloc(alloc, layer.wk_b); - ggml_allocr_alloc(alloc, layer.wv_a); 
- ggml_allocr_alloc(alloc, layer.wv_b); - ggml_allocr_alloc(alloc, layer.wo_a); - ggml_allocr_alloc(alloc, layer.wo_b); - ggml_allocr_alloc(alloc, layer.ffn_norm_a); - ggml_allocr_alloc(alloc, layer.ffn_norm_b); - ggml_allocr_alloc(alloc, layer.w1_a); - ggml_allocr_alloc(alloc, layer.w1_b); - ggml_allocr_alloc(alloc, layer.w2_a); - ggml_allocr_alloc(alloc, layer.w2_b); - ggml_allocr_alloc(alloc, layer.w3_a); - ggml_allocr_alloc(alloc, layer.w3_b); - } - ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad); - ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad); - ggml_allocr_alloc(alloc, lora->norm_a->grad); - ggml_allocr_alloc(alloc, lora->norm_b->grad); - ggml_allocr_alloc(alloc, lora->output_a->grad); - ggml_allocr_alloc(alloc, lora->output_b->grad); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - ggml_allocr_alloc(alloc, layer.attention_norm_a->grad); - ggml_allocr_alloc(alloc, layer.attention_norm_b->grad); - ggml_allocr_alloc(alloc, layer.wq_a->grad); - ggml_allocr_alloc(alloc, layer.wq_b->grad); - ggml_allocr_alloc(alloc, layer.wk_a->grad); - ggml_allocr_alloc(alloc, layer.wk_b->grad); - ggml_allocr_alloc(alloc, layer.wv_a->grad); - ggml_allocr_alloc(alloc, layer.wv_b->grad); - ggml_allocr_alloc(alloc, layer.wo_a->grad); - ggml_allocr_alloc(alloc, layer.wo_b->grad); - ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad); - ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad); - ggml_allocr_alloc(alloc, layer.w1_a->grad); - ggml_allocr_alloc(alloc, layer.w1_b->grad); - ggml_allocr_alloc(alloc, layer.w2_a->grad); - ggml_allocr_alloc(alloc, layer.w2_b->grad); - ggml_allocr_alloc(alloc, layer.w3_a->grad); - ggml_allocr_alloc(alloc, layer.w3_b->grad); - } + alloc_lora(alloc, lora); ggml_allocr_free(alloc); } From 1dbd6bc3d563913efd3755d103d9f408943361a9 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 16:40:40 +0200 Subject: [PATCH 215/235] remove n_rot hparam, as it must always be hparam.n_embd_head() --- examples/finetune/finetune.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index b0c03a749566c..296d2d6219089 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -29,7 +29,6 @@ struct my_llama_hparams { uint32_t n_head = 32; uint32_t n_head_kv = 32; uint32_t n_layer = 32; - uint32_t n_rot = 64; uint32_t n_gqa() const { return n_head/n_head_kv; @@ -203,7 +202,6 @@ static void print_params(struct my_llama_hparams * params) { printf("%s: n_ff: %u\n", __func__, params->n_ff); printf("%s: n_head: %u\n", __func__, params->n_head); printf("%s: n_layer: %u\n", __func__, params->n_layer); - printf("%s: n_rot: %u\n", __func__, params->n_rot); } static void print_lora_params(struct my_llama_lora_hparams * params) { @@ -247,7 +245,6 @@ static void init_model(struct llama_model * input, struct my_llama_model * model hparams.n_head = llama_model_n_head(input); hparams.n_head_kv = llama_model_n_head_kv(input); hparams.n_layer = llama_model_n_layer(input); - hparams.n_rot = llama_model_n_rot(input); model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD)); model->norm = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM)); @@ -535,8 +532,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int n_layer = hparams.n_layer; const int n_head = hparams.n_head; const int n_head_kv = hparams.n_head_kv; - const int n_rot = hparams.n_rot; const int n_ff = hparams.n_ff; + const int n_rot = 
hparams.n_embd_head(); const int n_embd_head = hparams.n_embd_head(); const int n_embd_gqa = hparams.n_embd_gqa(); const float rms_norm_eps = lora->hparams.f_norm_rms_eps; @@ -544,7 +541,6 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const float rope_freq_scale = lora->hparams.rope_freq_scale; GGML_ASSERT((size_t) n_layer == lora->layers.size()); - GGML_ASSERT(n_embd_head == n_rot); auto set_name = [](struct ggml_tensor * t, const char * n) { ggml_set_name(t, n); @@ -823,9 +819,6 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context model->hparams.n_head_kv = model->hparams.n_head; GGUF_GET_KEY(fctx, model->hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); - model->hparams.n_rot = model->hparams.n_embd / model->hparams.n_head; - GGUF_GET_KEY(fctx, model->hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); - float rope_freq_scale = 1.0f; GGUF_GET_KEY(fctx, lora->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); GGUF_GET_KEY(fctx, lora->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); @@ -899,7 +892,7 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head); gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV), model->hparams.n_head_kv); gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer); - gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_rot); + gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_embd_head()); gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), lora->hparams.f_norm_rms_eps); gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), lora->hparams.rope_freq_base); gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), lora->hparams.rope_freq_scale); From 5ed309810e02486424d85e859df594a1b5f57665 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 16:41:25 +0200 Subject: [PATCH 216/235] align code --- examples/finetune/finetune.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 296d2d6219089..17b89a2f8becf 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1542,18 +1542,18 @@ int main(int argc, char ** argv) { } lora.hparams.lora_r = params.lora_r; lora.hparams.lora_alpha = params.custom_lora_alpha ? params.lora_alpha : params.lora_r; - uint32_t n_rank_attention_norm = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1; - uint32_t n_rank_wq = params.custom_n_rank_wq ? params.n_rank_wq : params.lora_r; - uint32_t n_rank_wk = params.custom_n_rank_wk ? params.n_rank_wk : params.lora_r; - uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; - uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; - uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; - uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; - uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; - uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; - uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? 
params.n_rank_tok_embeddings : params.lora_r; - uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; - uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; + uint32_t n_rank_attention_norm = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1; + uint32_t n_rank_wq = params.custom_n_rank_wq ? params.n_rank_wq : params.lora_r; + uint32_t n_rank_wk = params.custom_n_rank_wk ? params.n_rank_wk : params.lora_r; + uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; + uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; + uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; + uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; + uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; + uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; + uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; + uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; + uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; lora.hparams.n_rank_attention_norm = n_rank_attention_norm; lora.hparams.n_rank_wq = n_rank_wq; lora.hparams.n_rank_wk = n_rank_wk; From b0ee563748062e11bdba8c8c973bea7a187a5b6d Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 16:43:12 +0200 Subject: [PATCH 217/235] assert correct base model tensor shapes --- examples/finetune/finetune.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 17b89a2f8becf..d0fc48f235616 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -250,8 +250,11 @@ static void init_model(struct llama_model * input, struct my_llama_model * model model->norm = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM)); model->output = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT)); - model->layers.resize(hparams.n_layer); + assert_shape_2d(model->tok_embeddings, hparams.n_embd, hparams.n_vocab); + assert_shape_1d(model->norm, hparams.n_embd); + assert_shape_2d(model->output, hparams.n_embd, hparams.n_vocab); + model->layers.resize(hparams.n_layer); for (uint32_t i = 0; i < hparams.n_layer; ++i) { auto & layer = model->layers[i]; @@ -264,6 +267,16 @@ static void init_model(struct llama_model * input, struct my_llama_model * model layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); + + assert_shape_1d(layer.attention_norm, hparams.n_embd); + assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd); + assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd); + assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd); + assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd); + assert_shape_1d(layer.ffn_norm, hparams.n_embd); + assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff); + assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd); + assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff); } } From 934ad8d35d60e81d77d56b9defa08f58fd7e25bf Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 16:51:03 +0200 Subject: [PATCH 218/235] move some params from lora hparams into model hparams and 
load model params from gguf this equalizes the model definition in finetune and text-from-scratch and removes the need for additional llama api functions to get model parameters --- examples/finetune/finetune.cpp | 206 +++++++++++++++++---------------- 1 file changed, 106 insertions(+), 100 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index d0fc48f235616..3f0e2be7b4059 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -30,6 +30,12 @@ struct my_llama_hparams { uint32_t n_head_kv = 32; uint32_t n_layer = 32; + // float f_norm_eps = 1e-5f; // falcon + float f_norm_rms_eps = 1e-5f; // llama + + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; + uint32_t n_gqa() const { return n_head/n_head_kv; } @@ -67,7 +73,7 @@ struct my_llama_layer { }; struct my_llama_model { - my_llama_hparams hparams; + struct my_llama_hparams hparams; struct ggml_tensor * tok_embeddings; @@ -93,12 +99,6 @@ struct my_llama_lora_hparams { uint32_t n_rank_norm = 1; uint32_t n_rank_output = 4; - // float f_norm_eps = 1e-5f; // falcon - float f_norm_rms_eps = 1e-5f; // llama - - float rope_freq_base = 10000.0f; - float rope_freq_scale = 1.0f; - bool operator!=(const my_llama_lora_hparams& other) const { return memcmp(this, &other, sizeof(other)); } @@ -196,12 +196,16 @@ static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %u\n", __func__, params->n_vocab); - printf("%s: n_ctx: %u\n", __func__, params->n_ctx); - printf("%s: n_embd: %u\n", __func__, params->n_embd); - printf("%s: n_ff: %u\n", __func__, params->n_ff); - printf("%s: n_head: %u\n", __func__, params->n_head); - printf("%s: n_layer: %u\n", __func__, params->n_layer); + printf("%s: n_vocab: %u\n", __func__, params->n_vocab); + printf("%s: n_ctx: %u\n", __func__, params->n_ctx); + printf("%s: n_embd: %u\n", __func__, params->n_embd); + printf("%s: n_ff: %u\n", __func__, params->n_ff); + printf("%s: n_head: %u\n", __func__, params->n_head); + printf("%s: n_head_kv: %u\n", __func__, params->n_head_kv); + printf("%s: n_layer: %u\n", __func__, params->n_layer); + printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps); + printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base); + printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale); } static void print_lora_params(struct my_llama_lora_hparams * params) { @@ -217,12 +221,61 @@ static void print_lora_params(struct my_llama_lora_hparams * params) { printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); - printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps); - printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base); - printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale); } -static void init_model(struct llama_model * input, struct my_llama_model * model, uint32_t n_ctx) { +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ + } \ + (dst) = func(ctx, kid); \ + } 
else if (req) { \ + die_fmt("key not found in model: %s", skey.c_str()); \ + } \ +} + +static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_hparams * hparams, const char * expected_arch) { + std::string arch; + + GGUF_GET_KEY(ctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); + if (expected_arch != NULL) { + if (arch != expected_arch) { + printf("%s: arch=%s expected_arch=%s\n", __func__, arch.c_str(), expected_arch); + } + GGML_ASSERT(arch == expected_arch); + } + + std::vector keybuf; + keybuf.resize(512); + auto kv = [&arch, &keybuf](const char * key) -> const char * { + snprintf(keybuf.data(), keybuf.size(), key, arch.c_str()); + return keybuf.data(); + }; + + GGUF_GET_KEY(ctx, hparams->n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); + GGUF_GET_KEY(ctx, hparams->n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); + GGUF_GET_KEY(ctx, hparams->n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); + GGUF_GET_KEY(ctx, hparams->n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); + GGUF_GET_KEY(ctx, hparams->n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); + + // n_head_kv is optional, default to n_head + hparams->n_head_kv = hparams->n_head; + GGUF_GET_KEY(ctx, hparams->n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); + + float rope_freq_scale = 1.0f; + GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + if (rope_freq_scale != 1.0f) { + hparams->rope_freq_scale = 1.0f / rope_freq_scale; + } +} + +static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) { auto & hparams = model->hparams; std::vector tn_buf; @@ -238,14 +291,23 @@ static void init_model(struct llama_model * input, struct my_llama_model * model return tn_buf.data(); }; - hparams.n_vocab = llama_model_n_vocab(input); - hparams.n_ctx = n_ctx; - hparams.n_embd = llama_model_n_embd(input); - hparams.n_ff = llama_model_n_ff(input); - hparams.n_head = llama_model_n_head(input); - hparams.n_head_kv = llama_model_n_head_kv(input); - hparams.n_layer = llama_model_n_layer(input); + // get parameters directly from gguf file + { + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ NULL, + }; + struct gguf_context * mctx = gguf_init_from_file(fn_model, params); + + load_model_hparams_gguf(mctx, &hparams, "llama"); + + gguf_free(mctx); + } + hparams.n_vocab = llama_model_n_vocab(input); + hparams.n_ctx = n_ctx; + + // get tensors from llama_model (possibly mmapped) model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD)); model->norm = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM)); model->output = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT)); @@ -549,9 +611,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int n_rot = hparams.n_embd_head(); const int n_embd_head = hparams.n_embd_head(); const int n_embd_gqa = hparams.n_embd_gqa(); - const float rms_norm_eps = lora->hparams.f_norm_rms_eps; - const float rope_freq_base = lora->hparams.rope_freq_base; 
- const float rope_freq_scale = lora->hparams.rope_freq_scale; + const float rms_norm_eps = hparams.f_norm_rms_eps; + const float rope_freq_base = hparams.rope_freq_base; + const float rope_freq_scale = hparams.rope_freq_scale; GGML_ASSERT((size_t) n_layer == lora->layers.size()); @@ -756,52 +818,6 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( return t36; } -#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ -{ \ - const std::string skey(key); \ - const int kid = gguf_find_key(ctx, skey.c_str()); \ - if (kid >= 0) { \ - enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ - if (ktype != (type)) { \ - die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ - } \ - (dst) = func(ctx, kid); \ - } else if (req) { \ - die_fmt("key not found in model: %s", skey.c_str()); \ - } \ -} - -static void load_default_lora_params_from_base_model(const char * fn_base_model, struct my_llama_lora_hparams * lora_params) { - if (strlen(fn_base_model) == 0) { - return; - } - struct gguf_init_params params; - params.no_alloc = false; - params.ctx = NULL; - struct gguf_context * fctx = gguf_init_from_file(fn_base_model, params); - if (fctx == NULL) { - return; - } - - const char * arch = "llama"; - std::vector keybuf; - keybuf.resize(512); - auto kv = [arch, &keybuf](const char * key) -> const char * { - snprintf(keybuf.data(), keybuf.size(), key, arch); - return keybuf.data(); - }; - - float rope_freq_scale = 1.0f; - GGUF_GET_KEY(fctx, lora_params->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); - GGUF_GET_KEY(fctx, lora_params->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); - GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - if (rope_freq_scale != 1.0f) { - lora_params->rope_freq_scale = 1.0f / rope_freq_scale; - } - - gguf_free(fctx); -} - static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) { // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read @@ -821,24 +837,15 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE); GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); - // n_ctx was not saved in earlier checkpoint file version, so we make it optional here - GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); - - GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); - GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); - GGUF_GET_KEY(fctx, model->hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); - GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); + struct my_llama_hparams hparams; + load_model_hparams_gguf(fctx, &hparams, arch.c_str()); - model->hparams.n_head_kv = model->hparams.n_head; - GGUF_GET_KEY(fctx, model->hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); - - float rope_freq_scale = 1.0f; - GGUF_GET_KEY(fctx, lora->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, 
false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); - GGUF_GET_KEY(fctx, lora->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); - GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - if (rope_freq_scale != 1.0f) { - lora->hparams.rope_freq_scale = 1.0f / rope_freq_scale; - } + // parameters that define tensor shapes must match + GGML_ASSERT(hparams.n_embd == model->hparams.n_embd); + GGML_ASSERT(hparams.n_ff == model->hparams.n_ff); + GGML_ASSERT(hparams.n_head == model->hparams.n_head); + GGML_ASSERT(hparams.n_head_kv == model->hparams.n_head_kv); + GGML_ASSERT(hparams.n_layer == model->hparams.n_layer); GGUF_GET_KEY(fctx, lora->hparams.n_rank_tok_embeddings, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD); GGUF_GET_KEY(fctx, lora->hparams.n_rank_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM); @@ -906,9 +913,10 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV), model->hparams.n_head_kv); gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer); gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_embd_head()); - gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), lora->hparams.f_norm_rms_eps); - gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), lora->hparams.rope_freq_base); - gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), lora->hparams.rope_freq_scale); + gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps); + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base); + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), model->hparams.rope_freq_scale); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, lora->hparams.n_rank_tok_embeddings); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, lora->hparams.n_rank_norm); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT, lora->hparams.n_rank_output); @@ -1534,24 +1542,22 @@ int main(int argc, char ** argv) { struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); struct my_llama_model model; - init_model(lmodel, &model, params.common.n_ctx); + init_model(lmodel, &model, params.fn_model_base, params.common.n_ctx); struct my_llama_lora lora; struct train_state * train = init_train_state(); struct ggml_opt_context * opt = train->opt; - load_default_lora_params_from_base_model(params.fn_model_base, &lora.hparams); - - // set lora params from command line + // set params from command line if (params.custom_f_norm_rms_eps) { - lora.hparams.f_norm_rms_eps = params.f_norm_rms_eps; + model.hparams.f_norm_rms_eps = params.f_norm_rms_eps; } if (params.custom_rope_freq_base) { - lora.hparams.rope_freq_base = params.rope_freq_base; + model.hparams.rope_freq_base = params.rope_freq_base; } if (params.custom_rope_freq_scale) { - lora.hparams.rope_freq_scale = params.rope_freq_scale; + model.hparams.rope_freq_scale = params.rope_freq_scale; } lora.hparams.lora_r = params.lora_r; lora.hparams.lora_alpha = params.custom_lora_alpha ? 
params.lora_alpha : params.lora_r; From dd94ce4ec0d0735711aca1c72d71c0b177db6493 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 16:49:52 +0200 Subject: [PATCH 219/235] remove now unnecessary llama API functions to get model params that were added by this PR --- llama.cpp | 40 ---------------------------------------- llama.h | 10 ---------- 2 files changed, 50 deletions(-) diff --git a/llama.cpp b/llama.cpp index d9db49c5c45d1..a156d40b8c921 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6478,26 +6478,6 @@ int llama_n_embd(const struct llama_context * ctx) { return llama_model_n_embd(&ctx->model); } -int llama_n_ff(const struct llama_context * ctx) { - return ctx->model.hparams.n_ff; -} - -int llama_n_head(const struct llama_context * ctx) { - return ctx->model.hparams.n_head; -} - -int llama_n_head_kv(const struct llama_context * ctx) { - return ctx->model.hparams.n_head_kv; -} - -int llama_n_rot(const struct llama_context * ctx) { - return ctx->model.hparams.n_rot; -} - -int llama_n_layer(const struct llama_context * ctx) { - return ctx->model.hparams.n_layer; -} - enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) { return ctx->model.vocab.type; } @@ -6518,26 +6498,6 @@ int llama_model_n_embd(const struct llama_model * model) { return model->hparams.n_embd; } -int llama_model_n_ff(const struct llama_model * model) { - return model->hparams.n_ff; -} - -int llama_model_n_head(const struct llama_model * model) { - return model->hparams.n_head; -} - -int llama_model_n_head_kv(const struct llama_model * model) { - return model->hparams.n_head_kv; -} - -int llama_model_n_rot(const struct llama_model * model) { - return model->hparams.n_rot; -} - -int llama_model_n_layer(const struct llama_model * model) { - return model->hparams.n_layer; -} - int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { return snprintf(buf, buf_size, "%s %s %s", model->name.c_str(), diff --git a/llama.h b/llama.h index 3883f48c915d6..606f205a6f175 100644 --- a/llama.h +++ b/llama.h @@ -249,11 +249,6 @@ extern "C" { LLAMA_API int llama_n_ctx (const struct llama_context * ctx); LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx); LLAMA_API int llama_n_embd (const struct llama_context * ctx); - LLAMA_API int llama_n_ff (const struct llama_context * ctx); - LLAMA_API int llama_n_head (const struct llama_context * ctx); - LLAMA_API int llama_n_head_kv (const struct llama_context * ctx); - LLAMA_API int llama_n_rot (const struct llama_context * ctx); - LLAMA_API int llama_n_layer (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx); @@ -261,11 +256,6 @@ extern "C" { LLAMA_API int llama_model_n_ctx (const struct llama_model * model); LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model); LLAMA_API int llama_model_n_embd (const struct llama_model * model); - LLAMA_API int llama_model_n_ff (const struct llama_model * model); - LLAMA_API int llama_model_n_head (const struct llama_model * model); - LLAMA_API int llama_model_n_head_kv (const struct llama_model * model); - LLAMA_API int llama_model_n_rot (const struct llama_model * model); - LLAMA_API int llama_model_n_layer (const struct llama_model * model); // Get a string describing the model type LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); From 9e10fa977e2471dd390477dfb702b81f22b54582 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 17:08:36 +0200
Subject: [PATCH 220/235] train-text-from-scratch: automatically allocate model tensors, remove option '--mem-model N' --- examples/finetune/finetune.cpp | 12 +- .../train-text-from-scratch.cpp | 129 ++++++++++++------ 2 files changed, 90 insertions(+), 51 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 3f0e2be7b4059..c43d00dfd5a80 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1658,8 +1658,8 @@ int main(int argc, char ** argv) { printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); - printf("%s: max_lora_size = %zu bytes (%.1f MB)\n", __func__, lora.data.size(), (float) lora.data.size() / (1024.0f*1024.0f)); - printf("%s: max_opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); + printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f)); + printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); opt->iter = train->train_its; if (params.only_write_lora) { @@ -1686,7 +1686,7 @@ int main(int argc, char ** argv) { printf("%s: opt iter %d\n", __func__, opt->iter); - printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx)); + printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx) + lora.data.size()); std::vector mem_input_data; std::vector mem_compute_data; @@ -1709,7 +1709,7 @@ int main(int argc, char ** argv) { ggml_allocr_alloc(alloc, target_probs); size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment; ggml_allocr_free(alloc); - printf("%s: max_input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); + printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); // allocate input tensors mem_input_data.resize(max_input_size); @@ -1769,7 +1769,7 @@ int main(int argc, char ** argv) { ggml_free(ctx_compute); } size_t max_compute_size = best_compute_size; - printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f)); + printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f)); printf("%s: evaluation order = %s\n", __func__, (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" : (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? 
"RIGHT_TO_LEFT" : @@ -1887,7 +1887,7 @@ int main(int argc, char ** argv) { // measure required memory for work buffer size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE; - printf("%s: max_work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); + printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); // context for work buffer struct ggml_init_params ctx_work_params = { diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 0da7ec11b311c..83e1563632df9 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -19,6 +19,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static const size_t tensor_alignment = 32; + struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; @@ -56,6 +58,7 @@ struct my_llama_layer { struct my_llama_model { struct ggml_context * ctx = NULL; + std::vector data; my_llama_hparams hparams; @@ -118,6 +121,65 @@ static void print_params(struct my_llama_hparams * params) { printf("%s: n_rot: %d\n", __func__, params->n_rot); } +static void set_param_model(struct my_llama_model * model) { + const auto& hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct ggml_context* ctx = model->ctx; + + ggml_set_param(ctx, model->tok_embeddings); + ggml_set_param(ctx, model->norm); + ggml_set_param(ctx, model->output); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + ggml_set_param(ctx, layer.attention_norm); + ggml_set_param(ctx, layer.wq); + ggml_set_param(ctx, layer.wk); + ggml_set_param(ctx, layer.wv); + ggml_set_param(ctx, layer.wo); + ggml_set_param(ctx, layer.ffn_norm); + ggml_set_param(ctx, layer.w1); + ggml_set_param(ctx, layer.w2); + ggml_set_param(ctx, layer.w3); + } +} + +static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * model) { + ggml_allocr_alloc(alloc, model->tok_embeddings); + ggml_allocr_alloc(alloc, model->norm); + ggml_allocr_alloc(alloc, model->output); + for (uint32_t i = 0; i < model->layers.size(); ++i) { + auto & layer = model->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm); + ggml_allocr_alloc(alloc, layer.wq); + ggml_allocr_alloc(alloc, layer.wk); + ggml_allocr_alloc(alloc, layer.wv); + ggml_allocr_alloc(alloc, layer.wo); + ggml_allocr_alloc(alloc, layer.ffn_norm); + ggml_allocr_alloc(alloc, layer.w1); + ggml_allocr_alloc(alloc, layer.w2); + ggml_allocr_alloc(alloc, layer.w3); + } + ggml_allocr_alloc(alloc, model->tok_embeddings->grad); + ggml_allocr_alloc(alloc, model->norm->grad); + ggml_allocr_alloc(alloc, model->output->grad); + for (uint32_t i = 0; i < model->layers.size(); ++i) { + auto & layer = model->layers[i]; + ggml_allocr_alloc(alloc, layer.attention_norm->grad); + ggml_allocr_alloc(alloc, layer.wq->grad); + ggml_allocr_alloc(alloc, layer.wk->grad); + ggml_allocr_alloc(alloc, layer.wv->grad); + ggml_allocr_alloc(alloc, layer.wo->grad); + ggml_allocr_alloc(alloc, layer.ffn_norm->grad); + ggml_allocr_alloc(alloc, layer.w1->grad); + ggml_allocr_alloc(alloc, layer.w2->grad); + ggml_allocr_alloc(alloc, layer.w3->grad); + } +} + static void init_model(struct my_llama_model * model) { const auto & hparams = model->hparams; @@ -126,7 +188,6 @@ static void init_model(struct my_llama_model * 
model) { const uint32_t n_vocab = hparams.n_vocab; const uint32_t n_ff = hparams.n_ff; - struct ggml_context * ctx = model->ctx; std::vector tn_buf; tn_buf.resize(GGML_MAX_NAME); @@ -141,6 +202,15 @@ static void init_model(struct my_llama_model * model) { return tn_buf.data(); }; + // context for model tensors without their data + struct ggml_init_params ctx_model_params; + ctx_model_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18); + ctx_model_params.mem_buffer = NULL; + ctx_model_params.no_alloc = true; + + struct ggml_context * ctx = ggml_init(ctx_model_params); + model->ctx = ctx; + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); @@ -179,32 +249,20 @@ static void init_model(struct my_llama_model * model) { ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i)); ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i)); } -} - -static void set_param_model(struct my_llama_model * model) { - const auto& hparams = model->hparams; - - const uint32_t n_layer = hparams.n_layer; - - struct ggml_context* ctx = model->ctx; - ggml_set_param(ctx, model->tok_embeddings); - ggml_set_param(ctx, model->norm); - ggml_set_param(ctx, model->output); + set_param_model(model); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = model->layers[i]; + // measure data size + struct ggml_allocr * alloc = NULL; + alloc = ggml_allocr_new_measure(tensor_alignment); + alloc_model(alloc, model); - ggml_set_param(ctx, layer.attention_norm); - ggml_set_param(ctx, layer.wq); - ggml_set_param(ctx, layer.wk); - ggml_set_param(ctx, layer.wv); - ggml_set_param(ctx, layer.wo); - ggml_set_param(ctx, layer.ffn_norm); - ggml_set_param(ctx, layer.w1); - ggml_set_param(ctx, layer.w2); - ggml_set_param(ctx, layer.w3); - } + // allocate data + model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment); + ggml_allocr_free(alloc); + alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment); + alloc_model(alloc, model); + ggml_allocr_free(alloc); } static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { @@ -720,7 +778,6 @@ struct train_params { bool use_alloc; - int mem_model_gb; int mem_compute_gb; int mem_compute0_gb; }; @@ -747,7 +804,6 @@ struct train_params get_default_train_params() { params.use_alloc = true; - params.mem_model_gb = 2; params.mem_compute_gb = 24; params.mem_compute0_gb = 8; return params; @@ -772,7 +828,6 @@ static void train_print_usage(int argc, char ** argv, const struct train_params fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); fprintf(stderr, " --no-alloc Don't use allocator\n"); fprintf(stderr, " --use-alloc Use allocator (default)\n"); - fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. 
(default %d)\n", params->mem_compute0_gb); @@ -868,12 +923,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par params->use_alloc = false; } else if (arg == "--use-alloc") { params->use_alloc = true; - } else if (arg == "--mem-model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_model_gb = std::stoi(argv[i]); } else if (arg == "--mem-compute") { if (++i >= argc) { invalid_param = true; @@ -960,13 +1009,6 @@ int main(int argc, char ** argv) { print_params(&model.hparams); - struct ggml_init_params lcparams; - lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); - lcparams.mem_buffer = NULL; - lcparams.no_alloc = false; - - model.ctx = ggml_init(lcparams); - int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; int n_batch = params.common.n_batch; @@ -992,7 +1034,6 @@ int main(int argc, char ** argv) { opt_params_adam.adam.gclip = params.common.adam_gclip; opt_params_adam.adam.eps_f = params.common.adam_eps_f; - opt->ctx = model.ctx; opt->params = opt_params_adam; printf("%s: init model\n", __func__); @@ -1000,7 +1041,6 @@ int main(int argc, char ** argv) { if (!existed) { init_model(&model); } - set_param_model(&model); opt->params = opt_params_adam; @@ -1012,8 +1052,7 @@ int main(int argc, char ** argv) { randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f); } - printf("used_mem model: %zu bytes\n", ggml_used_mem(model.ctx)); - // ggml_print_tensor_objects(model.ctx); + printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f)); // TODO: use std::vector intead of "new" size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); @@ -1024,7 +1063,6 @@ int main(int argc, char ** argv) { ggml_allocr * alloc = NULL; if (params.use_alloc) { - static const size_t tensor_alignment = 32; alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); } @@ -1206,6 +1244,7 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; + ggml_free(opt->ctx); free_train_state(train); ggml_free(model.ctx); llama_free(lctx); From db38d2bce4223fdfcb355f18c4edfd37a1e500af Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 17:33:11 +0200 Subject: [PATCH 221/235] train-text-from-scratch: automatically allocate opt context --- examples/finetune/finetune.cpp | 6 +- .../train-text-from-scratch.cpp | 99 ++++++++++++------- 2 files changed, 67 insertions(+), 38 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index c43d00dfd5a80..caeea9c3f1e69 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1651,6 +1651,7 @@ int main(int argc, char ** argv) { ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&lora)); } } + opt->iter = train->train_its; print_params(&model.hparams); print_lora_params(&lora.hparams); @@ -1660,7 +1661,7 @@ int main(int argc, char ** argv) { printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f)); printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); - opt->iter = train->train_its; + printf("%s: opt iter %d\n", 
__func__, opt->iter); if (params.only_write_lora) { save_train_files_data save_data; @@ -1684,9 +1685,6 @@ int main(int argc, char ** argv) { int n_vocab = model.hparams.n_vocab; int n_batch = params.common.n_batch; - printf("%s: opt iter %d\n", __func__, opt->iter); - - printf("used_mem model: %zu bytes\n", ggml_used_mem(lora.ctx) + lora.data.size()); std::vector mem_input_data; std::vector mem_compute_data; diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 83e1563632df9..069e460c15006 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -975,6 +975,27 @@ static void save_train_files(void * vdata, struct train_state * train) { } } +static int64_t get_parameter_count(struct my_llama_model* model) { + int64_t nx = 0; + nx += ggml_nelements(model->tok_embeddings); + nx += ggml_nelements(model->norm); + nx += ggml_nelements(model->output); + + for (uint32_t i = 0; i < model->layers.size(); ++i) { + auto & layer = model->layers[i]; + nx += ggml_nelements(layer.attention_norm); + nx += ggml_nelements(layer.wq); + nx += ggml_nelements(layer.wk); + nx += ggml_nelements(layer.wv); + nx += ggml_nelements(layer.wo); + nx += ggml_nelements(layer.ffn_norm); + nx += ggml_nelements(layer.w1); + nx += ggml_nelements(layer.w2); + nx += ggml_nelements(layer.w3); + } + return nx; +} + int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); @@ -1007,52 +1028,58 @@ int main(int argc, char ** argv) { model.hparams.rope_freq_base = params.rope_freq_base; model.hparams.rope_freq_scale = params.rope_freq_scale; - print_params(&model.hparams); - - int n_tokens = model.hparams.n_ctx; - int n_vocab = model.hparams.n_vocab; - int n_batch = params.common.n_batch; - struct train_state * train = init_train_state(); struct ggml_opt_context * opt = train->opt; - struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); - opt_params_adam.print_forward_graph = false; - opt_params_adam.print_backward_graph = false; - opt_params_adam.n_threads = params.common.n_threads; - opt_params_adam.past = params.common.opt_past; - opt_params_adam.delta = params.common.opt_delta; - opt_params_adam.max_no_improvement = params.common.opt_max_no_improvement; - opt_params_adam.n_gradient_accumulation = params.common.n_gradient_accumulation; - opt_params_adam.adam.n_iter = params.common.adam_n_iter; - opt_params_adam.adam.sched = 1.0f; - opt_params_adam.adam.alpha = params.common.adam_alpha; - opt_params_adam.adam.decay = params.common.adam_decay; - opt_params_adam.adam.decay_min_ndim = params.common.adam_decay_min_ndim; - opt_params_adam.adam.beta1 = params.common.adam_beta1; - opt_params_adam.adam.beta2 = params.common.adam_beta2; - opt_params_adam.adam.gclip = params.common.adam_gclip; - opt_params_adam.adam.eps_f = params.common.adam_eps_f; - - opt->params = opt_params_adam; + // set opt params from command line + opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params.print_forward_graph = false; + opt->params.print_backward_graph = false; + opt->params.n_threads = params.common.n_threads; + opt->params.past = params.common.opt_past; + opt->params.delta = params.common.opt_delta; + opt->params.max_no_improvement = params.common.opt_max_no_improvement; + opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation; + opt->params.adam.n_iter = params.common.adam_n_iter; + 
opt->params.adam.sched = 1.0f; + opt->params.adam.alpha = params.common.adam_alpha; + opt->params.adam.decay = params.common.adam_decay; + opt->params.adam.decay_min_ndim = params.common.adam_decay_min_ndim; + opt->params.adam.beta1 = params.common.adam_beta1; + opt->params.adam.beta2 = params.common.adam_beta2; + opt->params.adam.gclip = params.common.adam_gclip; + opt->params.adam.eps_f = params.common.adam_eps_f; printf("%s: init model\n", __func__); bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train); - if (!existed) { - init_model(&model); - } - - opt->params = opt_params_adam; + if (existed) { + // overwrite last n_ctx with user provided n_ctx + if (params.common.custom_n_ctx) { + model.hparams.n_ctx = params.common.n_ctx; + } - opt->iter = train->train_its; - printf("%s: opt iter %d\n", __func__, opt->iter); + const bool opt_past_changed = opt->params.past != params.common.opt_past; - bool from_scratch = !existed; - if (from_scratch) { + if (opt_past_changed) { + die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use a different value, train from scratch with an empty input checkpoint, e.g. --checkpoint-in ''. Aborting"); + // need to discard previous optimizer past function value statistics and opt_init with new shapes + // TODO + } + } else { + init_model(&model); randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f); + ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&model)); } + opt->iter = train->train_its; + print_params(&model.hparams); + printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its); + printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); + printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); + printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f)); + printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); + printf("%s: opt iter %d\n", __func__, opt->iter); // TODO: use std::vector intead of "new" size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); @@ -1066,6 +1093,10 @@ int main(int argc, char ** argv) { alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); } + int n_tokens = model.hparams.n_ctx; + int n_vocab = model.hparams.n_vocab; + int n_batch = params.common.n_batch; + std::vector train_tokens; std::vector train_samples_begin; std::vector train_samples_size; From f9b5d9b760753db1eccf6144164ad0aa03f9b2b4 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 17:41:53 +0200 Subject: [PATCH 222/235] train-text-from-scratch: automatically allocate input tensors --- .../train-text-from-scratch.cpp | 52 +++++++++++++------ 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 069e460c15006..80cf2e255f016 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1088,15 +1088,46 @@ int main(int argc, char ** argv) { size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); uint8_t *
compute_buf_0 = new uint8_t[size_buf_0]; + int n_tokens = model.hparams.n_ctx; + int n_vocab = model.hparams.n_vocab; + int n_batch = params.common.n_batch; + + std::vector mem_input_data; + std::vector mem_compute_data; + ggml_allocr * alloc = NULL; + + // context for input tensors without their data + struct ggml_init_params ctx_input_params = { + ggml_tensor_overhead() * 2, // mem_size + NULL, // mem_buffer + true, // no_alloc + }; + struct ggml_context * ctx_input = ggml_init(ctx_input_params); + + // the input tensors + struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); + struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + + // measure required memory for input tensors + alloc = ggml_allocr_new_measure(tensor_alignment); + ggml_allocr_alloc(alloc, tokens_input); + ggml_allocr_alloc(alloc, target_probs); + size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment; + ggml_allocr_free(alloc); + printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); + + // allocate input tensors + mem_input_data.resize(max_input_size); + alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); + ggml_allocr_alloc(alloc, tokens_input); + ggml_allocr_alloc(alloc, target_probs); + ggml_allocr_free(alloc); + if (params.use_alloc) { alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); } - int n_tokens = model.hparams.n_ctx; - int n_vocab = model.hparams.n_vocab; - int n_batch = params.common.n_batch; - std::vector train_tokens; std::vector train_samples_begin; std::vector train_samples_size; @@ -1167,8 +1198,8 @@ int main(int argc, char ** argv) { opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data(); opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); opt_cb_data.samples_count = train_samples_size.size(); - opt_cb_data.tokens_input = NULL; - opt_cb_data.target_probs = NULL; + opt_cb_data.tokens_input = tokens_input; + opt_cb_data.target_probs = target_probs; opt_cb_data.first_iter = opt->iter; opt_cb_data.last_time = ggml_time_ms(); opt_cb_data.millis_per_iter = 0.0; @@ -1184,21 +1215,12 @@ int main(int argc, char ** argv) { }; struct ggml_context * ctx0 = ggml_init(cparams); - ggml_set_no_alloc(ctx0, false); - - // don't use alloc for input tensors, so we can safely fill them with data - struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); - struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - ggml_set_no_alloc(ctx0, (alloc != NULL)); if (alloc) { ggml_allocr_reset(alloc); } - opt_cb_data.tokens_input = tokens_input; - opt_cb_data.target_probs = target_probs; - int n_past = 0; struct ggml_cgraph * gf = ggml_new_graph(ctx0); From c993246bfd1c21240ebf7b477a9774c10a451371 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 17:52:22 +0200 Subject: [PATCH 223/235] train-text-from-scratch: automatically allocate compute memory --- .../train-text-from-scratch.cpp | 175 ++++++++++-------- 1 file changed, 93 insertions(+), 82 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 80cf2e255f016..ecb71c0aef3fd 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ 
b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1081,13 +1081,6 @@ int main(int argc, char ** argv) { printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); printf("%s: opt iter %d\n", __func__, opt->iter); - // TODO: use std::vector intead of "new" - size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); - uint8_t * compute_addr = new uint8_t[compute_size]; - - size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); - uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; - int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; int n_batch = params.common.n_batch; @@ -1124,9 +1117,82 @@ int main(int argc, char ** argv) { ggml_allocr_alloc(alloc, target_probs); ggml_allocr_free(alloc); - if (params.use_alloc) { - alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); + // context for compute tensors without their data + size_t estimated_compute_size_wo_data = ( + ggml_tensor_overhead()*GGML_MAX_NODES*2 + + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*( + params.common.use_checkpointing ? 3 : 2 + ) + ); + struct ggml_init_params ctx_compute_params = { + estimated_compute_size_wo_data, // mem_size + NULL, // mem_buffer + true, // no_alloc + }; + struct ggml_context * ctx_compute = NULL; + + struct ggml_tensor * loss = NULL; + struct ggml_tensor * logits = NULL; + + struct ggml_cgraph * gf = NULL; + struct ggml_cgraph * gb = NULL; + struct ggml_cgraph * gb_tmp = NULL; + + // measure required memory for compute tensors + size_t best_compute_size = SIZE_MAX; + enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT; + // find best evaluation order + for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { + ctx_compute = ggml_init(ctx_compute_params); + alloc = ggml_allocr_new_measure(tensor_alignment); + gf = ggml_new_graph(ctx_compute); + gf->order = (enum ggml_cgraph_eval_order) order; + gb = ggml_new_graph(ctx_compute); + gb_tmp = params.common.use_checkpointing + ? ggml_new_graph(ctx_compute) + : NULL; + loss = llama_build_train_graphs( + &model, alloc, ctx_compute, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.common.use_flash, + params.common.use_checkpointing + ); + size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; + if (max_compute_size < best_compute_size) { + best_compute_size = max_compute_size; + best_order = gf->order; + } + ggml_allocr_free(alloc); + ggml_free(ctx_compute); } + size_t max_compute_size = best_compute_size; + printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f)); + printf("%s: evaluation order = %s\n", __func__, + (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" : + (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" : + "invalid"); + + // allocate compute tensors + mem_compute_data.resize(max_compute_size); + ctx_compute = ggml_init(ctx_compute_params); + alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); + gf = ggml_new_graph(ctx_compute); + gf->order = best_order; + gb = ggml_new_graph(ctx_compute); + gb_tmp = params.common.use_checkpointing + ? 
ggml_new_graph(ctx_compute) + : NULL; + loss = llama_build_train_graphs( + &model, alloc, ctx_compute, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.common.use_flash, + params.common.use_checkpointing + ); + ggml_allocr_free(alloc); std::vector train_tokens; std::vector train_samples_begin; @@ -1204,83 +1270,30 @@ int main(int argc, char ** argv) { opt_cb_data.last_time = ggml_time_ms(); opt_cb_data.millis_per_iter = 0.0; - int64_t t0 = ggml_time_ms(); - - for (int ex = 0; ex < params.n_examples; ++ex) { - - struct ggml_init_params cparams = { - compute_size, // mem_size - compute_addr, // mem_buffer - false, // no_alloc - }; - struct ggml_context * ctx0 = ggml_init(cparams); - - ggml_set_no_alloc(ctx0, (alloc != NULL)); - - if (alloc) { - ggml_allocr_reset(alloc); - } - - int n_past = 0; - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - struct ggml_cgraph * gb = ggml_new_graph(ctx0); - struct ggml_cgraph * gb_tmp = params.common.use_checkpointing - ? ggml_new_graph(ctx0) - : NULL; - - GGML_ASSERT(n_past == 0); - - struct ggml_tensor * loss = NULL; - struct ggml_tensor * logits = NULL; + // measure required memory for work buffer + size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE; + printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); - loss = llama_build_train_graphs( - &model, alloc, ctx0, - gf, gb, gb_tmp, - &logits, tokens_input, target_probs, - n_tokens, n_batch, - params.common.use_flash, - params.common.use_checkpointing - ); - - size_t used_mem_before_opt = ggml_used_mem(ctx0); - - opt->params.adam.sched = learning_schedule( - opt->iter, - params.common.warmup, - params.common.cos_decay_steps, - params.common.adam_alpha, - params.common.adam_min_alpha, - params.common.cos_decay_min, - params.common.cos_decay_restart, - params.common.enable_restart); - - printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); - - ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data); + // context for work buffer + struct ggml_init_params ctx_work_params = { + max_work_size, // mem_size + NULL, // mem_buffer + false, // no_alloc + }; + struct ggml_context * ctx_work = ggml_init(ctx_work_params); - size_t used_mem_after_opt = ggml_used_mem(ctx0); + int64_t t0 = ggml_time_ms(); - int n_iter = params.common.adam_n_iter; - train->train_its = opt->iter; - train->train_samples += n_batch * n_iter; - train->train_tokens += n_batch * n_tokens * n_iter; + ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data); - if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { - printf("Example %d, opt iter %d\n", ex, opt->iter); - printf("error_before_opt: %.6f\n", opt->loss_before); - printf("error_after_opt: %.6f\n", opt->loss_after); - printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); - printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); - } - - ggml_free(ctx0); - } + ggml_free(ctx_work); + ggml_free(ctx_compute); + ggml_free(ctx_input); int64_t t1 = ggml_time_ms(); - int64_t d = t1-t0; - double dd = (double) d * 1e-3; - printf("%s: total training time=%f seconds\n", __func__, dd); + printf("%s: total training time: ", __func__); + print_duration((double) (t1 - t0)); + printf("\n"); int new_iters = opt->iter - opt_cb_data.last_save_iter; if (new_iters > 0) { @@ -1295,8 +1308,6 @@ int main(int argc, char ** argv) { 
ggml_allocr_free(alloc); } - delete[] compute_addr; - delete[] compute_buf_0; ggml_free(opt->ctx); free_train_state(train); ggml_free(model.ctx); From 3b9d97484c0e5eed08cf3b7cc8e04300b42aa591 Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 18:01:16 +0200 Subject: [PATCH 224/235] remove unused options and equalize train-text-from-scratch with finetune --- examples/finetune/finetune.cpp | 7 +- .../train-text-from-scratch.cpp | 81 +++++++------------ 2 files changed, 33 insertions(+), 55 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index caeea9c3f1e69..bb6b14547906f 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1276,7 +1276,7 @@ static void train_print_usage(int argc, char ** argv, const struct train_params fprintf(stderr, " --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base); fprintf(stderr, " --lora-out FNAME path to save llama lora (default '%s')\n", params->fn_lora_out); - fprintf(stderr, " --only-write-lora only save llama lora, don't do any training\n"); + fprintf(stderr, " --only-write-lora only save llama lora, don't do any training. use this if you only want to convert a checkpoint to a lora adapter.\n"); fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); @@ -1660,8 +1660,6 @@ int main(int argc, char ** argv) { printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f)); - printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); - printf("%s: opt iter %d\n", __func__, opt->iter); if (params.only_write_lora) { save_train_files_data save_data; @@ -1681,6 +1679,9 @@ int main(int argc, char ** argv) { return 0; } + printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); + printf("%s: opt iter %d\n", __func__, opt->iter); + int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; int n_batch = params.common.n_batch; diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index ecb71c0aef3fd..56eb816a6cb05 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -762,24 +762,17 @@ struct train_params { const char * fn_vocab_model; const char * fn_model_out; + bool only_write_model; + int n_ctx; int n_embd; int n_head; int n_layer; int n_ff; - int n_examples; - float f_norm_rms_eps; float rope_freq_base; float rope_freq_scale; - - int print_info_interval; - - bool use_alloc; - - int mem_compute_gb; - int mem_compute0_gb; }; struct train_params get_default_train_params() { @@ -788,24 +781,18 @@ struct train_params get_default_train_params() { params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; params.fn_model_out = 
"ggml-checkpoint-f32.bin"; + params.only_write_model = false; + params.n_ctx = 128; params.n_embd = 256; params.n_head = 8; params.n_layer = 16; params.n_ff = 768; - params.n_examples = 1; - params.f_norm_rms_eps = 1e-5f; params.rope_freq_base = 10000.0f; params.rope_freq_scale = 1.0f; - params.print_info_interval = 1; - - params.use_alloc = true; - - params.mem_compute_gb = 24; - params.mem_compute0_gb = 8; return params; } @@ -817,6 +804,7 @@ static void train_print_usage(int argc, char ** argv, const struct train_params fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); + fprintf(stderr, " --only-write-model only save llama model, don't do any training. use this if you only want to convert a checkpoint to a model.\n"); fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); fprintf(stderr, " --ff N Feedforward size used for new models. (default %d)\n", params->n_ff); fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); @@ -824,12 +812,6 @@ static void train_print_usage(int argc, char ** argv, const struct train_params fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); - fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); - fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); - fprintf(stderr, " --no-alloc Don't use allocator\n"); - fprintf(stderr, " --use-alloc Use allocator (default)\n"); - fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); - fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. 
(default %d)\n", params->mem_compute0_gb); print_common_train_usage(argc, argv, ¶ms->common); } @@ -865,6 +847,8 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par break; } params->fn_model_out = argv[i]; + } else if (arg == "--only-write-model") { + params->only_write_model = true; } else if (arg == "--embd") { if (++i >= argc) { invalid_param = true; @@ -907,34 +891,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par break; } params->rope_freq_scale = std::stof(argv[i]); - } else if (arg == "-n" || arg == "--examples") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_examples = std::stoi(argv[i]); - } else if (arg == "--print-info-interval") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->print_info_interval = std::stoi(argv[i]); - } else if (arg == "--no-alloc") { - params->use_alloc = false; - } else if (arg == "--use-alloc") { - params->use_alloc = true; - } else if (arg == "--mem-compute") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute0") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute0_gb = std::stoi(argv[i]); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); train_print_usage(argc, argv, &default_params); @@ -1068,7 +1024,9 @@ int main(int argc, char ** argv) { } else { init_model(&model); randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f); - ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&model)); + if (!params.only_write_model) { + ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&model)); + } } opt->iter = train->train_its; @@ -1078,6 +1036,25 @@ int main(int argc, char ** argv) { printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f)); + + if (params.only_write_model) { + save_train_files_data save_data; + save_data.fn_checkpoint_out = ""; + save_data.fn_model_out = params.fn_model_out; + save_data.fn_vocab_model = params.fn_vocab_model; + save_data.pattern_fn_it = params.common.pattern_fn_it; + save_data.fn_latest = params.common.fn_latest; + save_data.model = &model; + + save_train_files(&save_data, train); + + free_train_state(train); + ggml_free(model.ctx); + llama_free(lctx); + llama_free_model(lmodel); + return 0; + } + printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); printf("%s: opt iter %d\n", __func__, opt->iter); From 5ce74ee4613c06bf3391c72d7115d10726200bff Mon Sep 17 00:00:00 2001 From: xaedes Date: Sun, 17 Sep 2023 19:42:19 +0200 Subject: [PATCH 225/235] initialize opt->loss_after with zero --- common/train.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/common/train.cpp b/common/train.cpp index 10e0107eb3bc0..3e8b1427fa3e6 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -32,6 +32,7 @@ struct train_state * init_train_state() { state->opt = new struct ggml_opt_context; state->opt->ctx = NULL; state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + state->opt->loss_after = 0.0f; return state; } From 0ede0f44346326c660b8fef0951fb07b8577557b 
Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 22 Sep 2023 19:56:13 +0200 Subject: [PATCH 226/235] add export-lora program --- examples/CMakeLists.txt | 1 + examples/export-lora/CMakeLists.txt | 5 + examples/export-lora/README.md | 26 ++ examples/export-lora/export-lora.cpp | 454 +++++++++++++++++++++++++++ 4 files changed, 486 insertions(+) create mode 100644 examples/export-lora/CMakeLists.txt create mode 100644 examples/export-lora/README.md create mode 100644 examples/export-lora/export-lora.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 66f7db8495557..8f436765cc2c4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -34,4 +34,5 @@ else() if (LLAMA_BUILD_SERVER) add_subdirectory(server) endif() + add_subdirectory(export-lora) endif() diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt new file mode 100644 index 0000000000000..cbbdaec67488d --- /dev/null +++ b/examples/export-lora/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET export-lora) +add_executable(${TARGET} export-lora.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md new file mode 100644 index 0000000000000..0cf3e8e4549bb --- /dev/null +++ b/examples/export-lora/README.md @@ -0,0 +1,26 @@ +# export-lora + +Apply LORA adapters to a base model and export the resulting model. + +``` +usage: export-lora [options] + +options: + -h, --help show this help message and exit + -m FNAME, --model-base FNAME model path from which to load base model (default '') + -o FNAME, --model-out FNAME path to save exported model (default '') + -l FNAME, --lora FNAME apply LoRA adapter + -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S + -t N, --threads N number of threads to use during computation (default: 4) +``` + +For example: + +```bash +./bin/export-lora \ + -m open-llama-3b-v2-q8_0.gguf \ + -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ + -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin +``` + +Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
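For reference, the transformation applied per tensor is the standard LoRA merge: with the user scale `S` (1.0 for `-l`, the given `S` for `-s`) and `lora_alpha`/`lora_r` read from the adapter file header, each base weight is updated as `W' = W + S * (lora_alpha / lora_r) * (B·A)`. The sketch below spells this out with plain scalar loops; it is illustrative only (the function name, dimension names, and row-major layout are assumptions), whereas the implementation in `export-lora.cpp` builds a small ggml graph with `ggml_mul_mat` and `ggml_add_inplace` instead:

```cpp
#include <vector>

// Illustrative scalar version of the per-tensor LoRA merge.
// W: n x m base weight (row-major), A: r x m ("*.loraA"), B: n x r ("*.loraB").
static void merge_lora(std::vector<float> & W,
                       const std::vector<float> & A,
                       const std::vector<float> & B,
                       int n, int m, int r,
                       float S, float lora_alpha, float lora_r) {
    const float scaling = S * lora_alpha / lora_r; // same formula as apply_lora below
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j) {
            float ba = 0.0f;
            for (int k = 0; k < r; ++k) {
                ba += B[i*r + k] * A[k*m + j]; // (B*A)[i][j]
            }
            W[i*m + j] += scaling * ba; // W' = W + scaling * (B*A)
        }
    }
}
```

Applying several adapters simply repeats this additive update on the same tensor, so the result does not depend on adapter order beyond floating-point rounding.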
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp new file mode 100644 index 0000000000000..fc1a2de4b1e14 --- /dev/null +++ b/examples/export-lora/export-lora.cpp @@ -0,0 +1,454 @@ + +#include "common.h" +#include "ggml.h" +#include "ggml-alloc.h" + +#include +#include +#include + +static const size_t tensor_alignment = 32; + +struct lora_info { + std::string filename; + float scale; +}; + +struct export_lora_params { + std::string fn_model_base; + std::string fn_model_out; + std::vector lora; + int n_threads; +}; + +struct tensor_info { + uint32_t n_dims; + uint32_t type; + uint32_t ne[4]; + size_t offset; + size_t nbytes; + std::string name; +}; + +struct lora_data { + struct lora_info info; + std::vector data; + struct ggml_context * ctx; + + uint32_t lora_r; + uint32_t lora_alpha; +}; + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + die_fmt("read error: %s", strerror(errno)); + } + if (ret != 1) { + die("unexpectedly reached end of file"); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + die_fmt("write error: %s", strerror(errno)); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + bool eof() { + return tell() >= size; + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +static struct export_lora_params get_default_export_lora_params() { + struct export_lora_params result; + result.fn_model_base = ""; + result.fn_model_out = ""; + result.n_threads = GGML_DEFAULT_N_THREADS; + return result; +} + +static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str()); + fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str()); + fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n"); + fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", 
params->n_threads); +} + +static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) { + bool invalid_param = false; + std::string arg; + struct export_lora_params default_params = get_default_export_lora_params(); + const std::string arg_prefix = "--"; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (arg == "-m" || arg == "--model-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_model_base = argv[i]; + } else if (arg == "-o" || arg == "--model-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_model_out = argv[i]; + } else if (arg == "-l" || arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + break; + } + struct lora_info lora; + lora.filename = argv[i]; + lora.scale = 1.0f; + params->lora.push_back(lora); + } else if (arg == "-s" || arg == "--lora-scaled") { + if (++i >= argc) { + invalid_param = true; + break; + } + struct lora_info lora; + lora.filename = argv[i]; + if (++i >= argc) { + invalid_param = true; + break; + } + lora.scale = std::stof(argv[i]); + params->lora.push_back(lora); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_threads = std::stoi(argv[i]); + if (params->n_threads <= 0) { + params->n_threads = std::thread::hardware_concurrency(); + } + } else { + fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str()); + export_lora_print_usage(argc, argv, &default_params); + exit(1); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str()); + export_lora_print_usage(argc, argv, &default_params); + exit(1); + } + return true; +} + +static struct lora_data * load_lora(struct lora_info * info) { + struct lora_data * result = new struct lora_data; + result->info = *info; + result->ctx = NULL; + result->lora_r = 1; + result->lora_alpha = 1; + + struct llama_file file(info->filename.c_str(), "rb"); + if (file.fp == NULL) { + return result; + } + + struct ggml_init_params params_ggml; + params_ggml.mem_size = ggml_tensor_overhead() * GGML_MAX_NODES; + params_ggml.mem_buffer = NULL; + params_ggml.no_alloc = true; + result->ctx = ggml_init(params_ggml); + + uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla' + uint32_t magic = file.read_u32(); + if (magic != LLAMA_FILE_MAGIC_LORA) { + die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str()); + } + uint32_t version = file.read_u32(); + if (version != 1) { + die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str()); + } + result->lora_r = file.read_u32(); + result->lora_alpha = file.read_u32(); + // read tensor infos from file + std::vector name_buf; + std::vector tensors; + std::vector tensors_offset; + size_t total_nbytes_pad = 0; + while(!file.eof()) { + int64_t ne[4] = {1,1,1,1}; + uint32_t n_dims = file.read_u32(); + uint32_t namelen = file.read_u32(); + uint32_t type = file.read_u32(); + for (uint32_t k = 0; k < n_dims; ++k) { + ne[k] = (int64_t)file.read_u32(); + } + name_buf.clear(); + name_buf.resize(namelen + 1, '\0'); + file.read_raw(name_buf.data(), namelen); + file.seek((0-file.tell()) & 31, SEEK_CUR); + size_t offset = file.tell(); + struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne); + ggml_set_name(tensor, name_buf.data()); + size_t nbytes = 
ggml_nbytes(tensor); + size_t nbytes_pad = ggml_nbytes_pad(tensor); + total_nbytes_pad += nbytes_pad; + tensors.push_back(tensor); + tensors_offset.push_back(offset); + file.seek(nbytes, SEEK_CUR); + } + // read tensor data + result->data.resize(total_nbytes_pad); + size_t data_offset = 0; + for (size_t i = 0; i < tensors.size(); ++i) { + struct ggml_tensor * tensor = tensors[i]; + size_t offset = tensors_offset[i]; + size_t nbytes = ggml_nbytes(tensor); + size_t nbytes_pad = ggml_nbytes_pad(tensor); + file.seek(offset, SEEK_SET); + tensor->data = result->data.data() + data_offset; + file.read_raw(tensor->data, nbytes); + data_offset += nbytes_pad; + } + return result; +} + +static void free_lora(struct lora_data * lora) { + if (lora->ctx != NULL) { + ggml_free(lora->ctx); + } + delete lora; +} + +static struct ggml_cgraph * build_graph_lora( + struct ggml_context * ctx, + struct ggml_tensor * tensor, + struct ggml_tensor * lora_a, + struct ggml_tensor * lora_b, + float scaling +) { + struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b); + if (scaling != 1.0f) { + ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling)); + } + struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab); + + struct ggml_cgraph * gf = ggml_new_graph(ctx); + ggml_build_forward_expand (gf, res); + return gf; +} + +static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) { + std::string name = ggml_get_name(tensor); + std::string name_a = name + std::string(".loraA"); + std::string name_b = name + std::string(".loraB"); + struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str()); + struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str()); + if (lora_a == NULL || lora_b == NULL) { + return false; + } + + float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r; + + struct ggml_init_params params; + params.mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead() * 4; + params.mem_buffer = NULL; + params.no_alloc = true; + struct ggml_context * ctx = NULL; + struct ggml_allocr * alloc = NULL; + struct ggml_cgraph * gf = NULL; + + ctx = ggml_init(params); + alloc = ggml_allocr_new_measure(tensor_alignment); + gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); + size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf); + ggml_allocr_free(alloc); + ggml_free(ctx); + + static std::vector data_compute; + data_compute.resize(alloc_size + tensor_alignment); + + ctx = ggml_init(params); + alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment); + gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); + ggml_allocr_alloc_graph(alloc, gf); + ggml_allocr_free(alloc); + + struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads); + static std::vector data_work; + data_work.resize(cplan.work_size); + cplan.work_data = data_work.data(); + + ggml_graph_compute(gf, &cplan); + + ggml_free(ctx); + return true; +} + +static void export_lora(struct export_lora_params * params) { + // load all loras + std::vector loras; + for (size_t i = 0; i < params->lora.size(); ++i) { + loras.push_back(load_lora(¶ms->lora[i])); + } + + // open base model gguf, read tensors without their data + struct ggml_context * ctx_in; + struct gguf_init_params params_gguf; + params_gguf.no_alloc = true; + params_gguf.ctx = &ctx_in; + struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf); + + // create new gguf + struct gguf_context * gguf_out = gguf_init_empty(); + + // 
copy meta data from base model: kv and tensors + gguf_set_kv(gguf_out, gguf_in); + int n_tensors = gguf_get_n_tensors(gguf_in); + for (int i=0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(gguf_in, i); + struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name); + gguf_add_tensor(gguf_out, tensor); + } + + // create output file + struct llama_file fout(params->fn_model_out.c_str(), "wb"); + if (!fout.fp) { + die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str()); + } + + // write gguf meta data + std::vector meta; + meta.resize(gguf_get_meta_size(gguf_out)); + gguf_get_meta_data(gguf_out, meta.data()); + fout.write_raw(meta.data(), meta.size()); + + struct llama_file fin(params->fn_model_base.c_str(), "rb"); + std::vector data; + std::vector padding; + for (int i=0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(gguf_in, i); + struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name); + + // read tensor data + data.resize(ggml_nbytes(tensor)); + tensor->data = data.data(); + size_t offset = gguf_get_tensor_offset(gguf_in, i); + fin.seek(offset + meta.size(), SEEK_SET); + fin.read_raw(data.data(), data.size()); + + // apply all loras + for (size_t k = 0; k < loras.size(); ++k) { + apply_lora(tensor, loras[k], params->n_threads); + } + + // write tensor data + padding + padding.clear(); + padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0); + + GGML_ASSERT(fout.tell() == offset + meta.size()); + // fout.seek(offset, SEEK_SET); + fout.write_raw(data.data(), data.size()); + fout.write_raw(padding.data(), padding.size()); + + if (i % 2 == 0) { + printf("."); + } + } + printf("\n"); + + // close gguf + gguf_free(gguf_out); + gguf_free(gguf_in); + + // free loras + for (size_t i = 0; i < loras.size(); ++i) { + free_lora(loras[i]); + } +} + +int main(int argc, char ** argv) { + struct export_lora_params params = get_default_export_lora_params(); + + if (!export_lora_params_parse(argc, argv, ¶ms)) { + return 1; + } + + export_lora(¶ms); + + return 0; +} From b91e3dd2eefbe32a55e3204909c2373509ff306a Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 22 Sep 2023 20:01:53 +0200 Subject: [PATCH 227/235] remove trailing whitespace --- examples/export-lora/export-lora.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index fc1a2de4b1e14..35f63184dce91 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -221,7 +221,7 @@ static struct lora_data * load_lora(struct lora_info * info) { result->ctx = NULL; result->lora_r = 1; result->lora_alpha = 1; - + struct llama_file file(info->filename.c_str(), "rb"); if (file.fp == NULL) { return result; @@ -403,7 +403,7 @@ static void export_lora(struct export_lora_params * params) { for (int i=0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(gguf_in, i); struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name); - + // read tensor data data.resize(ggml_nbytes(tensor)); tensor->data = data.data(); From d38260be0f5abf241bb9cb1e0bfa578efb74a371 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 22 Sep 2023 20:04:07 +0200 Subject: [PATCH 228/235] add export-lora build in Makefile --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6e20397fddb50..b797d6e73361b 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first 
target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative finetune tests/test-c.o +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative finetune export-lora tests/test-c.o # Binaries only useful for tests TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama @@ -561,6 +561,9 @@ beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o co finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) From 904c19bebdaabc7200cef0ff2c3ef5ef6567cacc Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 22 Sep 2023 20:19:12 +0200 Subject: [PATCH 229/235] remove unused struct tensor_info from export-lora --- examples/export-lora/export-lora.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 35f63184dce91..c6a3e5ecbe58b 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -21,15 +21,6 @@ struct export_lora_params { int n_threads; }; -struct tensor_info { - uint32_t n_dims; - uint32_t type; - uint32_t ne[4]; - size_t offset; - size_t nbytes; - std::string name; -}; - struct lora_data { struct lora_info info; std::vector data; @@ -421,7 +412,7 @@ static void export_lora(struct export_lora_params * params) { padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0); GGML_ASSERT(fout.tell() == offset + meta.size()); - // fout.seek(offset, SEEK_SET); + // fout.seek(offset + meta.size(), SEEK_SET); fout.write_raw(data.data(), data.size()); fout.write_raw(padding.data(), padding.size()); From 758c46cf7dcde20e534ce509a90fce498860fa97 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 22 Sep 2023 20:20:13 +0200 Subject: [PATCH 230/235] add llama build dependency to export-lora, because export-lora depends on common, which in turn depends on llama --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b797d6e73361b..cc582df88aac7 100644 --- a/Makefile +++ b/Makefile @@ -561,7 +561,7 @@ beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o co finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o common.o $(OBJS) +export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o
grammar-parser.o $(OBJS) From 9145c87accbc77f73e420f6784f418065b9c41c4 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 22 Sep 2023 20:54:00 +0200 Subject: [PATCH 231/235] update finetune README.md --- examples/finetune/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/finetune/README.md b/examples/finetune/README.md index beb8f8a617dc2..b7347c20ca0ab 100644 --- a/examples/finetune/README.md +++ b/examples/finetune/README.md @@ -66,9 +66,11 @@ The scale numbers don't need to add up to one, and you can also use numbers crea Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime. If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`. -The LORA rank is configured for each model tensor type separately with these command line options: +The default LORA rank can be specified with `--lora-r N`. +The LORA rank can be configured for each model tensor type separately with these command line options: ```bash + --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4) --rank-att-norm N LORA rank for attention norm tensor (default 1) --rank-ffn-norm N LORA rank for feed-forward norm tensor (default 1) --rank-out-norm N LORA rank for output norm tensor (default 1) From da05205af6012a512fe12e9e0da48f5a5ae2f6f9 Mon Sep 17 00:00:00 2001 From: xaedes Date: Fri, 22 Sep 2023 21:00:46 +0200 Subject: [PATCH 232/235] cancel optimization when specified number of epochs is completed --- common/train.cpp | 22 ++++++++- common/train.h | 5 +- examples/finetune/finetune.cpp | 2 + .../train-text-from-scratch.cpp | 2 + ggml.c | 47 ++++++++++++++++--- ggml.h | 2 +- 6 files changed, 69 insertions(+), 11 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index 3e8b1427fa3e6..8e27f5d983cee 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1044,6 +1044,7 @@ struct train_params_common get_default_train_params_common() { params.n_threads = 6; params.n_batch = 8; params.n_gradient_accumulation = 1; + params.n_epochs = -1; params.custom_n_ctx = false; @@ -1122,7 +1123,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past); fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta); fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement); - fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); + fprintf(stderr, " --epochs N Maximum number of epochs to process. (default %d)\n", params->n_epochs); fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha); @@ -1131,6 +1132,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients.
(default %f)\n", params->adam_beta1); fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); + fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); fprintf(stderr, "\n"); } @@ -1296,6 +1298,12 @@ bool consume_common_train_arg( return true; } params->adam_eps_f = std::stof(argv[i]); + } else if (arg == "--epochs") { + if (++i >= argc) { + *invalid_param = true; + return true; + } + params->n_epochs = std::stoi(argv[i]); } else if (arg == "--adam-iter") { if (++i >= argc) { *invalid_param = true; @@ -1359,7 +1367,7 @@ void finish_processing_train_args(struct train_params_common * params) { } } -void train_opt_callback(void * vdata, int accum_step, float * sched) { +void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) { struct train_opt_callback_data * data = (struct train_opt_callback_data *) vdata; struct train_params_common * params = data->params; struct train_state * train = data->train; @@ -1475,4 +1483,14 @@ void train_opt_callback(void * vdata, int accum_step, float * sched) { data->samples_count); train->shuffle_next_sample = 0; } + + const bool last_epoch_reached = (params->n_epochs > 0 && train->train_epochs - data->first_epoch >= params->n_epochs); + if (last_epoch_reached) { + // allow optimization iteration at last epoch to be completed before canceling + if (data->iter_at_last_epoch < 0) { + data->iter_at_last_epoch = opt->iter; + } else if (opt->iter > data->iter_at_last_epoch) { + *cancel = true; + } + } } diff --git a/common/train.h b/common/train.h index 6ef1f9fc50542..42fa704b897ae 100644 --- a/common/train.h +++ b/common/train.h @@ -43,6 +43,7 @@ struct train_params_common { int n_threads; int n_batch; int n_gradient_accumulation; + int n_epochs; bool custom_n_ctx; @@ -101,6 +102,8 @@ struct train_opt_callback_data { struct ggml_tensor * tokens_input; struct ggml_tensor * target_probs; int first_iter; + int first_epoch; + int iter_at_last_epoch; int64_t last_time; double millis_per_iter; }; @@ -224,4 +227,4 @@ void save_train_state_gguf(struct gguf_context * fctx, struct train_state * trai std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration); -void train_opt_callback(void * vdata, int accum_step, float * sched); +void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel); diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index bb6b14547906f..8c03b9f530cf7 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1881,6 +1881,8 @@ int main(int argc, char ** argv) { opt_cb_data.tokens_input = tokens_input; opt_cb_data.target_probs = target_probs; opt_cb_data.first_iter = opt->iter; + opt_cb_data.first_epoch = train->train_epochs; + opt_cb_data.iter_at_last_epoch = -1; opt_cb_data.last_time = ggml_time_ms(); opt_cb_data.millis_per_iter = 0.0; diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 56eb816a6cb05..b2b8feb9137d5 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1244,6 +1244,8 @@ int main(int argc, char ** argv) { 
From 2912f17010dd2cda2b95e04bab60568fa9b0f0a5 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 24 Sep 2023 14:42:52 +0200
Subject: [PATCH 233/235] improve handling of export-lora arguments

print errors and warnings when files could not be read or created

---
 examples/export-lora/export-lora.cpp | 47 ++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 9 deletions(-)

diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index c6a3e5ecbe58b..72280f709122e 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -198,6 +198,17 @@ static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_
             exit(1);
         }
     }
+
+    if (params->fn_model_base == default_params.fn_model_base) {
+        fprintf(stderr, "error: please specify a filename for model-base.\n");
+        export_lora_print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+    if (params->fn_model_out == default_params.fn_model_out) {
+        fprintf(stderr, "error: please specify a filename for model-out.\n");
+        export_lora_print_usage(argc, argv, &default_params);
+        exit(1);
+    }
     if (invalid_param) {
         fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
         export_lora_print_usage(argc, argv, &default_params);
@@ -206,6 +217,13 @@ static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_
     return true;
 }
 
+static void free_lora(struct lora_data * lora) {
+    if (lora->ctx != NULL) {
+        ggml_free(lora->ctx);
+    }
+    delete lora;
+}
+
 static struct lora_data * load_lora(struct lora_info * info) {
     struct lora_data * result = new struct lora_data;
     result->info = *info;
@@ -215,7 +233,10 @@ static struct lora_data * load_lora(struct lora_info * info) {
 
     struct llama_file file(info->filename.c_str(), "rb");
     if (file.fp == NULL) {
-        return result;
+        fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
+            info->filename.c_str());
+        free_lora(result);
+        return NULL;
     }
 
     struct ggml_init_params params_ggml;
@@ -278,12 +299,6 @@ static struct lora_data * load_lora(struct lora_info * info) {
     return result;
 }
 
-static void free_lora(struct lora_data * lora) {
-    if (lora->ctx != NULL) {
-        ggml_free(lora->ctx);
-    }
-    delete lora;
-}
 
 static struct ggml_cgraph * build_graph_lora(
     struct ggml_context * ctx,
@@ -304,6 +319,9 @@ static struct ggml_cgraph * build_graph_lora(
 }
 
 static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
+    if (lora->ctx == NULL) {
+        return false;
+    }
     std::string name = ggml_get_name(tensor);
     std::string name_a = name + std::string(".loraA");
     std::string name_b = name + std::string(".loraB");
@@ -354,7 +372,19 @@ static void export_lora(struct export_lora_params * params) {
     // load all loras
     std::vector<struct lora_data *> loras;
     for (size_t i = 0; i < params->lora.size(); ++i) {
-        loras.push_back(load_lora(&params->lora[i]));
+        struct lora_data * lora = load_lora(&params->lora[i]);
+        if (lora != NULL) {
+            loras.push_back(lora);
+        }
+    }
+    if (loras.size() == 0) {
+        fprintf(stderr, "warning: no lora adapters will be applied.\n");
+    }
+
+    // open input file
+    struct llama_file fin(params->fn_model_base.c_str(), "rb");
+    if (!fin.fp) {
+        die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
     }
 
     // open base model gguf, read tensors without their data
@@ -388,7 +418,6 @@ static void export_lora(struct export_lora_params * params) {
     gguf_get_meta_data(gguf_out, meta.data());
     fout.write_raw(meta.data(), meta.size());
 
-    struct llama_file fin(params->fn_model_base.c_str(), "rb");
     std::vector<uint8_t> data;
     std::vector<uint8_t> padding;
     for (int i=0; i < n_tensors; ++i) {
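Conceptually, what export-lora applies per tensor is the standard LoRA update: W += scaling * (B * A), where scaling = scale * lora_alpha / lora_r (the formula visible in apply_lora's context lines in the next patch). A plain-array sketch of that computation — illustrative only; the real code runs it as a ggml graph over the ".loraA"/".loraB" tensors, with dimensions taken from the adapter file:

```cpp
#include <cstdio>
#include <vector>

// W (n_out x n_in) += scaling * B (n_out x r) * A (r x n_in),
// with scaling = scale * lora_alpha / lora_r; all matrices row-major.
static void apply_lora_delta(std::vector<float> & w,
                             const std::vector<float> & lora_a,
                             const std::vector<float> & lora_b,
                             int n_in, int n_out, int r,
                             float scale, float lora_alpha) {
    const float scaling = scale * lora_alpha / (float) r;
    for (int i = 0; i < n_out; ++i) {
        for (int j = 0; j < n_in; ++j) {
            float acc = 0.0f;
            for (int k = 0; k < r; ++k) {
                acc += lora_b[i*r + k] * lora_a[k*n_in + j];
            }
            w[i*n_in + j] += scaling * acc;
        }
    }
}

int main() {
    // 2x2 identity base weights, rank-1 adapter
    std::vector<float> w = { 1, 0, 0, 1 };
    std::vector<float> a = { 1, 2 };  // A: 1x2
    std::vector<float> b = { 3, 4 };  // B: 2x1
    apply_lora_delta(w, a, b, 2, 2, 1, 1.0f, 1.0f);
    printf("%.1f %.1f / %.1f %.1f\n", w[0], w[1], w[2], w[3]);  // 4.0 6.0 / 4.0 9.0
    return 0;
}
```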
From ad64e33aadeec6583786beb7752d1f676f115696 Mon Sep 17 00:00:00 2001
From: meatbag-18a <145869052+meatbag-18a@users.noreply.github.com>
Date: Sun, 24 Sep 2023 05:48:19 -0700
Subject: [PATCH 234/235] Fix export-lora.cpp "not enough space in the
 context's memory pool" (#1)

* Fix export-lora.cpp "not enough space in the context's memory pool"

Without this patch, export-lora would sometimes error with "not enough
space in the context's memory pool (needed 656784, available 656800)".

* increase required context size by 5*GGML_MEM_ALIGN instead of plain 16

---------

Co-authored-by: xaedes
---
 examples/export-lora/export-lora.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 72280f709122e..ef85685dad603 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -334,7 +334,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
     float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
 
     struct ggml_init_params params;
-    params.mem_size   = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead() * 4;
+    params.mem_size   = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
     params.mem_buffer = NULL;
     params.no_alloc   = true;
     struct ggml_context * ctx = NULL;
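The sizing rule behind this one-line fix generalizes: even with no_alloc set, a ggml context still carves object headers and tensor metadata out of its memory pool, and each carve is rounded up to GGML_MEM_ALIGN, so a budget computed from raw overhead sums can come up a few bytes short — the quoted error is off by exactly 16 bytes, one alignment unit. The factor of 5 presumably budgets one alignment's worth of slack per allocated object (the graph plus the four tensors the estimate already counts). A sketch of the budget arithmetic with assumed constant values (only GGML_MEM_ALIGN = 16 is certain here; the other numbers stand in for values from ggml.h/ggml.c):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    // Illustrative values only; the real ones come from ggml.h/ggml.c.
    const size_t GGML_MEM_ALIGN   = 16;
    const size_t GGML_OBJECT_SIZE = 64;     // per-object bookkeeping (assumed)
    const size_t GGML_GRAPH_SIZE  = 164520; // cgraph struct size (assumed)
    const size_t tensor_overhead  = 288;    // ggml_tensor_overhead() (assumed)

    const size_t n_objects = 5;  // graph + 4 tensors built by apply_lora
    size_t mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + tensor_overhead*4;
    mem_size += GGML_MEM_ALIGN*n_objects;  // alignment slack, as in the patch

    printf("context pool size: %zu bytes\n", mem_size);
    return 0;
}
```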
From 166065837e63ad1797e39225546d85273a5c42a4 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 24 Sep 2023 14:55:21 +0200
Subject: [PATCH 235/235] improve handling of not yet supported tensor types

---
 examples/finetune/finetune.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 8c03b9f530cf7..e5eacd9664ef7 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -644,9 +644,11 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
         if (ggml_is_quantized(a->type)) {
             return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
-        } else {
-            GGML_ASSERT(a->type == GGML_TYPE_F32);
+        } else if (a->type == GGML_TYPE_F32) {
             return ggml_add(ctx, a, b);
+        } else {
+            die_fmt("%s: Finetuning on tensors with type '%s' is not yet supported.\n",
+                __func__, ggml_type_name(a->type));
         }
     };
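The practical effect of the last change: finetuning a base model with, say, F16 tensors previously died on a bare `GGML_ASSERT(a->type == GGML_TYPE_F32)` with no hint at the cause, whereas the new branch names the unsupported type. The three-way dispatch, restated as a self-contained program with stub types (the type names and error path are stand-ins, not the ggml API):

```cpp
#include <cstdio>
#include <cstdlib>

enum tensor_type { TYPE_F32, TYPE_F16, TYPE_Q4_0 };

static bool is_quantized(tensor_type t) { return t == TYPE_Q4_0; }
static const char * type_name(tensor_type t) {
    switch (t) {
        case TYPE_F32: return "f32";
        case TYPE_F16: return "f16";
        default:       return "q4_0";
    }
}

// Mirrors add_to_f32: quantized base tensors get a dequantizing add with an
// F32 result, plain F32 adds directly, anything else fails with a clear error.
static const char * add_to_f32(tensor_type t) {
    if (is_quantized(t)) {
        return "add with cast to F32";
    } else if (t == TYPE_F32) {
        return "plain add";
    } else {
        fprintf(stderr, "finetuning on tensors with type '%s' is not yet supported\n",
            type_name(t));
        exit(1);
    }
}

int main() {
    printf("q4_0 -> %s\n", add_to_f32(TYPE_Q4_0));  // add with cast to F32
    printf("f32  -> %s\n", add_to_f32(TYPE_F32));   // plain add
    return 0;
}
```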