From d75eae63335044713d5e24596af07a6eea1d0d8b Mon Sep 17 00:00:00 2001 From: gwjr <502526+gwjr@users.noreply.github.com> Date: Tue, 14 Nov 2023 15:08:09 +0000 Subject: [PATCH 1/4] Remove logically superfluous assertions and order by dimension --- ggml.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/ggml.c b/ggml.c index 3202a517b7868..21123c250d686 100644 --- a/ggml.c +++ b/ggml.c @@ -9611,10 +9611,12 @@ static void ggml_compute_forward_out_prod_f32( const int ith = params->ith; const int nth = params->nth; + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(ne03 == ne13); // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == sizeof(float)); @@ -9625,11 +9627,6 @@ static void ggml_compute_forward_out_prod_f32( // GGML_ASSERT(nb1 <= nb2); // GGML_ASSERT(nb2 <= nb3); - GGML_ASSERT(ne0 == ne00); - GGML_ASSERT(ne1 == ne10); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); - // nb01 >= nb00 - src0 is not transposed // compute by src0 rows From 2f0c5dcaf5f36c5b66d1821c3708c597dabe20d6 Mon Sep 17 00:00:00 2001 From: gwjr <502526+gwjr@users.noreply.github.com> Date: Tue, 14 Nov 2023 15:20:32 +0000 Subject: [PATCH 2/4] Use cblas_sgemm() to implement ggml_compute_forward_out_prod() --- ggml.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 21123c250d686..06324bf380790 100644 --- a/ggml.c +++ b/ggml.c @@ -9598,6 +9598,35 @@ static void ggml_compute_forward_mul_mat( // ggml_compute_forward_out_prod +#if defined(GGML_USE_ACCELERATE) + // helper function to determine if it is better to use BLAS or not + // based on ggml_compute_forward_mul_mat_use_blas() + // However, testing suggested that BLAS was never slower than the existing code +static bool ggml_compute_forward_out_prod_use_blas( + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + + UNUSED(dst); +// const int64_t ne10 = src1->ne[0]; +// +// const int64_t ne0 = dst->ne[0]; +// const int64_t ne1 = dst->ne[1]; + + if (ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1))){ //&& + //(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + return true; + } +// if (ne0 >= 32 && ne1 >= 32 && ne10 >= 32) { +// printf("Cannot use BLAS for large matrix at %s; ne0: %lld, ne1: %lld, ne10:, %lld", dst->name, ne0, ne1, ne10); +// } + return false; +} +#endif + static void ggml_compute_forward_out_prod_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -9634,13 +9663,63 @@ static void ggml_compute_forward_out_prod_f32( // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (params->type == GGML_TASK_INIT) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); +#if !defined(GGML_USE_ACCELERATE) // gemm beta will do this + if (!ggml_compute_forward_out_prod_use_blas(src0, src1, dst)) { +#endif + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); +#if !defined(GGML_USE_ACCELERATE) + } +#endif return; } if (params->type == GGML_TASK_FINALIZE) { return; } + +#if defined(GGML_USE_ACCELERATE) + if (ggml_compute_forward_out_prod_use_blas(src0, src1, dst)) { + if (params->ith != 0) { // All threads other than the first do no work. + return; + } + // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) + // src0: (k,n) + // src1: (k,m) + // dst: (m,n) + // + // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) + // Also expressed as (major,minor) + // a: (m,k): so src1 transposed + // b: (k,n): so src0 + // c: (m,n) + // + // However, if ggml_is_transposed(src1) is true, then + // src1->data already contains a transposed version, so sgemm mustn't + // transpose it further. + + int n = src0->ne[0]; + int k = src0->ne[1]; + int m = src1->ne[0]; + + int transposeA, lda; + + if (!ggml_is_transposed(src1)) { + transposeA = CblasTrans; + lda = m; + } else { + transposeA = CblasNoTrans; + lda = k; + } + + float * a = (float *) ((char *) src1->data); + float * b = (float *) ((char *) src0->data); + float * c = (float *) ((char *) dst->data); + + cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); + + return; + } +#endif // dst[:,:,:,:] = 0 // for i2,i3: From e5c1f02645918b067263b4f9154ebca0e5f1fde9 Mon Sep 17 00:00:00 2001 From: gwjr <502526+gwjr@users.noreply.github.com> Date: Wed, 15 Nov 2023 23:00:44 +0000 Subject: [PATCH 3/4] Remove ggml_compute_forward_out_prod_use_blas(), fix compiling errors on cmake/zig, remove trailing whitespace --- ggml.c | 63 +++++++++++++++++++--------------------------------------- 1 file changed, 20 insertions(+), 43 deletions(-) diff --git a/ggml.c b/ggml.c index 06324bf380790..8fed22a01396e 100644 --- a/ggml.c +++ b/ggml.c @@ -9598,35 +9598,6 @@ static void ggml_compute_forward_mul_mat( // ggml_compute_forward_out_prod -#if defined(GGML_USE_ACCELERATE) - // helper function to determine if it is better to use BLAS or not - // based on ggml_compute_forward_mul_mat_use_blas() - // However, testing suggested that BLAS was never slower than the existing code -static bool ggml_compute_forward_out_prod_use_blas( - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - - UNUSED(dst); -// const int64_t ne10 = src1->ne[0]; -// -// const int64_t ne0 = dst->ne[0]; -// const int64_t ne1 = dst->ne[1]; - - if (ggml_is_matrix(src0) && - ggml_is_matrix(src1) && - ggml_is_contiguous(src0) && - (ggml_is_contiguous(src1) || ggml_is_transposed(src1))){ //&& - //(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - return true; - } -// if (ne0 >= 32 && ne1 >= 32 && ne10 >= 32) { -// printf("Cannot use BLAS for large matrix at %s; ne0: %lld, ne1: %lld, ne10:, %lld", dst->name, ne0, ne1, ne10); -// } - return false; -} -#endif - static void ggml_compute_forward_out_prod_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -9660,25 +9631,31 @@ static void ggml_compute_forward_out_prod_f32( // compute by src0 rows // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod - // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) + // TODO: #if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) - if (params->type == GGML_TASK_INIT) { -#if !defined(GGML_USE_ACCELERATE) // gemm beta will do this - if (!ggml_compute_forward_out_prod_use_blas(src0, src1, dst)) { +#if defined(GGML_USE_ACCELERATE) + bool use_blas = ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1)); #endif - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); -#if !defined(GGML_USE_ACCELERATE) + + if (params->type == GGML_TASK_INIT) { +#if defined(GGML_USE_ACCELERATE) // gemm beta will zero dst + if (use_blas) { + return; } #endif + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); return; } if (params->type == GGML_TASK_FINALIZE) { return; } - + #if defined(GGML_USE_ACCELERATE) - if (ggml_compute_forward_out_prod_use_blas(src0, src1, dst)) { + if (use_blas) { if (params->ith != 0) { // All threads other than the first do no work. return; } @@ -9696,13 +9673,13 @@ static void ggml_compute_forward_out_prod_f32( // However, if ggml_is_transposed(src1) is true, then // src1->data already contains a transposed version, so sgemm mustn't // transpose it further. - + int n = src0->ne[0]; int k = src0->ne[1]; int m = src1->ne[0]; - + int transposeA, lda; - + if (!ggml_is_transposed(src1)) { transposeA = CblasTrans; lda = m; @@ -9710,13 +9687,13 @@ static void ggml_compute_forward_out_prod_f32( transposeA = CblasNoTrans; lda = k; } - + float * a = (float *) ((char *) src1->data); float * b = (float *) ((char *) src0->data); float * c = (float *) ((char *) dst->data); - + cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); - + return; } #endif From da122af024eb302995317e80a0e93035dc32575a Mon Sep 17 00:00:00 2001 From: gwjr <502526+gwjr@users.noreply.github.com> Date: Thu, 16 Nov 2023 18:45:33 +0000 Subject: [PATCH 4/4] Add openBLAS support for sgemm() in compute_forward_out_prod() --- ggml.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index 8fed22a01396e..f43962acac832 100644 --- a/ggml.c +++ b/ggml.c @@ -9631,9 +9631,9 @@ static void ggml_compute_forward_out_prod_f32( // compute by src0 rows // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod - // TODO: #if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) + // TODO: #if defined(GGML_USE_CLBLAST) -#if defined(GGML_USE_ACCELERATE) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) bool use_blas = ggml_is_matrix(src0) && ggml_is_matrix(src1) && ggml_is_contiguous(src0) && @@ -9641,7 +9641,7 @@ static void ggml_compute_forward_out_prod_f32( #endif if (params->type == GGML_TASK_INIT) { -#if defined(GGML_USE_ACCELERATE) // gemm beta will zero dst +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst if (use_blas) { return; } @@ -9654,7 +9654,7 @@ static void ggml_compute_forward_out_prod_f32( return; } -#if defined(GGML_USE_ACCELERATE) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (use_blas) { if (params->ith != 0) { // All threads other than the first do no work. return;