From d75eae63335044713d5e24596af07a6eea1d0d8b Mon Sep 17 00:00:00 2001
From: gwjr <502526+gwjr@users.noreply.github.com>
Date: Tue, 14 Nov 2023 15:08:09 +0000
Subject: [PATCH 1/4] Remove logically superfluous assertions and order by
 dimension

---
 ggml.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/ggml.c b/ggml.c
index 3202a517b7868..21123c250d686 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9611,10 +9611,12 @@ static void ggml_compute_forward_out_prod_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
+    GGML_ASSERT(ne0  == ne00);
+    GGML_ASSERT(ne1  == ne10);
+    GGML_ASSERT(ne2  == ne02);
     GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2  == ne12);
     GGML_ASSERT(ne3  == ne13);
+    GGML_ASSERT(ne03 == ne13);
 
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == sizeof(float));
@@ -9625,11 +9627,6 @@ static void ggml_compute_forward_out_prod_f32(
     // GGML_ASSERT(nb1 <= nb2);
     // GGML_ASSERT(nb2 <= nb3);
 
-    GGML_ASSERT(ne0 == ne00);
-    GGML_ASSERT(ne1 == ne10);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 

From 2f0c5dcaf5f36c5b66d1821c3708c597dabe20d6 Mon Sep 17 00:00:00 2001
From: gwjr <502526+gwjr@users.noreply.github.com>
Date: Tue, 14 Nov 2023 15:20:32 +0000
Subject: [PATCH 2/4] Use cblas_sgemm() to implement
 ggml_compute_forward_out_prod()

---
 ggml.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 80 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 21123c250d686..06324bf380790 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9598,6 +9598,35 @@ static void ggml_compute_forward_mul_mat(
 
 // ggml_compute_forward_out_prod
 
+#if defined(GGML_USE_ACCELERATE)
+    // helper function to determine if it is better to use BLAS or not
+    // based on ggml_compute_forward_mul_mat_use_blas()
+    // However, testing suggested that BLAS was never slower than the existing code
+static bool ggml_compute_forward_out_prod_use_blas(
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+
+    UNUSED(dst);
+//    const int64_t ne10 = src1->ne[0];
+//
+//    const int64_t ne0 = dst->ne[0];
+//    const int64_t ne1 = dst->ne[1];
+
+    if (ggml_is_matrix(src0) &&
+        ggml_is_matrix(src1) &&
+        ggml_is_contiguous(src0) &&
+        (ggml_is_contiguous(src1) || ggml_is_transposed(src1))){ //&&
+        //(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
+        return true;
+    }
+//    if (ne0 >= 32 && ne1 >= 32 && ne10 >= 32) {
+//        printf("Cannot use BLAS for large matrix at %s; ne0: %lld, ne1: %lld, ne10:, %lld", dst->name, ne0, ne1, ne10);
+//    }
+    return false;
+}
+#endif
+
 static void ggml_compute_forward_out_prod_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -9634,13 +9663,63 @@ static void ggml_compute_forward_out_prod_f32(
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_INIT) {
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+#if !defined(GGML_USE_ACCELERATE) // gemm beta will do this
+        if (!ggml_compute_forward_out_prod_use_blas(src0, src1, dst)) {
+#endif
+            ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
+#if !defined(GGML_USE_ACCELERATE)
+        }
+#endif
         return;
     }
 
     if (params->type == GGML_TASK_FINALIZE) {
         return;
     }
+    
+#if defined(GGML_USE_ACCELERATE)
+    if (ggml_compute_forward_out_prod_use_blas(src0, src1, dst)) {
+        if (params->ith != 0) { // All threads other than the first do no work.
+            return;
+        }
+        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
+        // src0: (k,n)
+        // src1: (k,m)
+        // dst:  (m,n)
+        //
+        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
+        // Also expressed as (major,minor)
+        // a: (m,k): so src1 transposed
+        // b: (k,n): so src0
+        // c: (m,n)
+        //
+        // However, if ggml_is_transposed(src1) is true, then
+        // src1->data already contains a transposed version, so sgemm mustn't
+        // transpose it further.
+        
+        int n = src0->ne[0];
+        int k = src0->ne[1];
+        int m = src1->ne[0];
+        
+        int transposeA, lda;
+        
+        if (!ggml_is_transposed(src1)) {
+            transposeA = CblasTrans;
+            lda = m;
+        } else {
+            transposeA = CblasNoTrans;
+            lda = k;
+        }
+        
+        float * a = (float *) ((char *) src1->data);
+        float * b = (float *) ((char *) src0->data);
+        float * c = (float *) ((char *) dst->data);
+  
+        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
+        
+        return;
+    }
+#endif
 
     // dst[:,:,:,:] = 0
     // for i2,i3:

From e5c1f02645918b067263b4f9154ebca0e5f1fde9 Mon Sep 17 00:00:00 2001
From: gwjr <502526+gwjr@users.noreply.github.com>
Date: Wed, 15 Nov 2023 23:00:44 +0000
Subject: [PATCH 3/4] Remove ggml_compute_forward_out_prod_use_blas(), fix
 compiling errors on cmake/zig, remove trailing whitespace

---
 ggml.c | 63 +++++++++++++++++++---------------------------------------
 1 file changed, 20 insertions(+), 43 deletions(-)

diff --git a/ggml.c b/ggml.c
index 06324bf380790..8fed22a01396e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9598,35 +9598,6 @@ static void ggml_compute_forward_mul_mat(
 
 // ggml_compute_forward_out_prod
 
-#if defined(GGML_USE_ACCELERATE)
-    // helper function to determine if it is better to use BLAS or not
-    // based on ggml_compute_forward_mul_mat_use_blas()
-    // However, testing suggested that BLAS was never slower than the existing code
-static bool ggml_compute_forward_out_prod_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-
-    UNUSED(dst);
-//    const int64_t ne10 = src1->ne[0];
-//
-//    const int64_t ne0 = dst->ne[0];
-//    const int64_t ne1 = dst->ne[1];
-
-    if (ggml_is_matrix(src0) &&
-        ggml_is_matrix(src1) &&
-        ggml_is_contiguous(src0) &&
-        (ggml_is_contiguous(src1) || ggml_is_transposed(src1))){ //&&
-        //(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-        return true;
-    }
-//    if (ne0 >= 32 && ne1 >= 32 && ne10 >= 32) {
-//        printf("Cannot use BLAS for large matrix at %s; ne0: %lld, ne1: %lld, ne10:, %lld", dst->name, ne0, ne1, ne10);
-//    }
-    return false;
-}
-#endif
-
 static void ggml_compute_forward_out_prod_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -9660,25 +9631,31 @@ static void ggml_compute_forward_out_prod_f32(
     //   compute by src0 rows
 
     // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+    // TODO: #if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
-    if (params->type == GGML_TASK_INIT) {
-#if !defined(GGML_USE_ACCELERATE) // gemm beta will do this
-        if (!ggml_compute_forward_out_prod_use_blas(src0, src1, dst)) {
+#if defined(GGML_USE_ACCELERATE)
+    bool use_blas = ggml_is_matrix(src0) &&
+        ggml_is_matrix(src1) &&
+        ggml_is_contiguous(src0) &&
+        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
 #endif
-            ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
-#if !defined(GGML_USE_ACCELERATE)
+
+    if (params->type == GGML_TASK_INIT) {
+#if defined(GGML_USE_ACCELERATE) // gemm beta will zero dst
+        if (use_blas) {
+            return;
         }
 #endif
+        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
 
     if (params->type == GGML_TASK_FINALIZE) {
         return;
     }
-    
+
 #if defined(GGML_USE_ACCELERATE)
-    if (ggml_compute_forward_out_prod_use_blas(src0, src1, dst)) {
+    if (use_blas) {
         if (params->ith != 0) { // All threads other than the first do no work.
             return;
         }
@@ -9696,13 +9673,13 @@ static void ggml_compute_forward_out_prod_f32(
         // However, if ggml_is_transposed(src1) is true, then
         // src1->data already contains a transposed version, so sgemm mustn't
         // transpose it further.
-        
+
         int n = src0->ne[0];
         int k = src0->ne[1];
         int m = src1->ne[0];
-        
+
         int transposeA, lda;
-        
+
         if (!ggml_is_transposed(src1)) {
             transposeA = CblasTrans;
             lda = m;
@@ -9710,13 +9687,13 @@ static void ggml_compute_forward_out_prod_f32(
             transposeA = CblasNoTrans;
             lda = k;
         }
-        
+
         float * a = (float *) ((char *) src1->data);
         float * b = (float *) ((char *) src0->data);
         float * c = (float *) ((char *) dst->data);
-  
+
         cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
-        
+
         return;
     }
 #endif

From da122af024eb302995317e80a0e93035dc32575a Mon Sep 17 00:00:00 2001
From: gwjr <502526+gwjr@users.noreply.github.com>
Date: Thu, 16 Nov 2023 18:45:33 +0000
Subject: [PATCH 4/4] Add openBLAS support for sgemm() in
 compute_forward_out_prod()

---
 ggml.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml.c b/ggml.c
index 8fed22a01396e..f43962acac832 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9631,9 +9631,9 @@ static void ggml_compute_forward_out_prod_f32(
     //   compute by src0 rows
 
     // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
-    // TODO: #if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+    // TODO: #if defined(GGML_USE_CLBLAST)
 
-#if defined(GGML_USE_ACCELERATE)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     bool use_blas = ggml_is_matrix(src0) &&
         ggml_is_matrix(src1) &&
         ggml_is_contiguous(src0) &&
@@ -9641,7 +9641,7 @@ static void ggml_compute_forward_out_prod_f32(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
-#if defined(GGML_USE_ACCELERATE) // gemm beta will zero dst
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
         if (use_blas) {
             return;
         }
@@ -9654,7 +9654,7 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }
 
-#if defined(GGML_USE_ACCELERATE)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (use_blas) {
         if (params->ith != 0) { // All threads other than the first do no work.
             return;