From 5b6655f3bd5f9effd073eff6012087f3953a10a2 Mon Sep 17 00:00:00 2001 From: Jean-Luc Fattebert Date: Mon, 15 Jun 2020 15:33:16 -0400 Subject: [PATCH] Add memory pool for ellblock --- CMakeLists.txt | 5 + build.sh | 3 + src/C-interface/bml_allocate.c | 5 +- .../ellblock/bml_add_ellblock_typed.c | 4 +- .../ellblock/bml_allocate_ellblock.c | 15 +- .../ellblock/bml_allocate_ellblock.h | 39 ++++ .../ellblock/bml_allocate_ellblock_typed.c | 166 ++++++++++++++++-- .../ellblock/bml_convert_ellblock_typed.c | 5 +- .../ellblock/bml_copy_ellblock_typed.c | 21 ++- .../ellblock/bml_import_ellblock_typed.c | 4 +- .../ellblock/bml_multiply_ellblock_typed.c | 11 +- .../ellblock/bml_setters_ellblock_typed.c | 21 ++- .../ellblock/bml_threshold_ellblock_typed.c | 20 +-- .../ellblock/bml_transpose_ellblock_typed.c | 11 +- src/C-interface/ellblock/bml_types_ellblock.h | 8 + .../ellblock/bml_utilities_ellblock_typed.c | 9 +- src/Fortran-interface/bml_allocate_m.F90 | 7 +- src/Fortran-interface/bml_c_interface_m.F90 | 5 +- 18 files changed, 292 insertions(+), 67 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 41c23d425..193cc819d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -519,6 +519,11 @@ if(NOT (NOBLAS)) endif() endif() +if(BML_ELLBLOCK_MEMPOOL) + message(STATUS "Use memory pool for elllock") + add_definitions(-DBML_ELLBLOCK_USE_MEMPOOL) +endif() + add_definitions(-D_POSIX_C_SOURCE=200112L) check_function_exists(posix_memalign HAVE_POSIX_MEMALIGN) diff --git a/build.sh b/build.sh index 9f4cb33d7..6eae0aa2c 100755 --- a/build.sh +++ b/build.sh @@ -71,6 +71,7 @@ EOF echo "BML_MAGMA Build with MAGMA (default is ${BML_MAGMA})" echo "BML_CUSOLVER Build with cuSOLVER (default is ${BML_CUSOLVER})" echo "BML_XSMM Build with XSMM (default is ${BML_XSMM})" + echo "BML_ELLBLOCK_MEMPOOL Use ellblock memory pool (default is ${BML_ELLBLOCK_MEMPOOL}" echo "CUDA_TOOLKIT_ROOT_DIR Path to CUDA dir (default is ${CUDA_TOOLKIT_ROOT_DIR})" echo "INTEL_OPT {yes, no} (default is ${INTEL_OPT})" } @@ -104,6 +105,7 @@ set_defaults() { : ${BML_MAGMA:=no} : ${BML_CUSOLVER:=no} : ${BML_XSMM:=no} + : ${BML_ELLBLOCK_MEMPOOL:=yes} : ${CUDA_TOOLKIT_ROOT_DIR:=} : ${INTEL_OPT:=no} } @@ -179,6 +181,7 @@ configure() { -DBML_MAGMA="${BML_MAGMA}" \ -DBML_CUSOLVER="${BML_CUSOLVER}" \ -DBML_XSMM="${BML_XSMM}" \ + -DBML_ELLBLOCK_MEMPOOL="${BML_ELLBLOCK_MEMPOOL}" \ -DCUDA_TOOLKIT_ROOT_DIR="${CUDA_TOOLKIT_ROOT_DIR}" \ -DINTEL_OPT="${INTEL_OPT:=no}" \ | tee --append "${LOG_FILE}" diff --git a/src/C-interface/bml_allocate.c b/src/C-interface/bml_allocate.c index e30f18425..21a36b20e 100644 --- a/src/C-interface/bml_allocate.c +++ b/src/C-interface/bml_allocate.c @@ -313,6 +313,7 @@ bml_block_matrix( bml_matrix_precision_t matrix_precision, int NB, int MB, + int M, int *bsizes, bml_distribution_mode_t distrib_mode) { @@ -321,11 +322,11 @@ bml_block_matrix( { case ellpack: return bml_block_matrix_ellblock(matrix_precision, - NB, MB, bsizes, distrib_mode); + NB, MB, M, bsizes, distrib_mode); break; case ellblock: return bml_block_matrix_ellblock(matrix_precision, - NB, MB, bsizes, distrib_mode); + NB, MB, M, bsizes, distrib_mode); break; default: LOG_ERROR("unsupported matrix type (type ID %d)\n", matrix_type); diff --git a/src/C-interface/ellblock/bml_add_ellblock_typed.c b/src/C-interface/ellblock/bml_add_ellblock_typed.c index e4bd7012d..dff04324c 100644 --- a/src/C-interface/ellblock/bml_add_ellblock_typed.c +++ b/src/C-interface/ellblock/bml_add_ellblock_typed.c @@ -157,8 +157,8 @@ void TYPED_FUNC( { A_ptr_value[kb] = - bml_noinit_allocate_memory(nelements * - sizeof(REAL_T)); + TYPED_FUNC(bml_allocate_block_ellblock) (A, ib, + nelements); } for (int kk = 0; kk < nelements; kk++) { diff --git a/src/C-interface/ellblock/bml_allocate_ellblock.c b/src/C-interface/ellblock/bml_allocate_ellblock.c index 39daaf57c..ca5138b4b 100644 --- a/src/C-interface/ellblock/bml_allocate_ellblock.c +++ b/src/C-interface/ellblock/bml_allocate_ellblock.c @@ -120,12 +120,18 @@ void bml_deallocate_ellblock( bml_matrix_ellblock_t * A) { +#ifdef BML_ELLBLOCK_USE_MEMPOOL + bml_free_memory(A->memory_pool_offsets); + bml_free_memory(A->memory_pool_ptr); + bml_free_memory(A->memory_pool); +#else for (int ib = 0; ib < A->NB; ib++) for (int jp = 0; jp < A->nnzb[ib]; jp++) { int ind = ROWMAJOR(ib, jp, A->NB, A->MB); bml_free_memory(A->ptr_value[ind]); } +#endif bml_free_memory(A->ptr_value); bml_free_memory(A->indexb); bml_free_memory(A->nnzb); @@ -246,6 +252,7 @@ bml_block_matrix_ellblock( bml_matrix_precision_t matrix_precision, int NB, int MB, + int M, int *bsizes, bml_distribution_mode_t distrib_mode) { @@ -254,19 +261,19 @@ bml_block_matrix_ellblock( switch (matrix_precision) { case single_real: - A = bml_block_matrix_ellblock_single_real(NB, MB, bsizes, + A = bml_block_matrix_ellblock_single_real(NB, MB, M, bsizes, distrib_mode); break; case double_real: - A = bml_block_matrix_ellblock_double_real(NB, MB, bsizes, + A = bml_block_matrix_ellblock_double_real(NB, MB, M, bsizes, distrib_mode); break; case single_complex: - A = bml_block_matrix_ellblock_single_complex(NB, MB, bsizes, + A = bml_block_matrix_ellblock_single_complex(NB, MB, M, bsizes, distrib_mode); break; case double_complex: - A = bml_block_matrix_ellblock_double_complex(NB, MB, bsizes, + A = bml_block_matrix_ellblock_double_complex(NB, MB, M, bsizes, distrib_mode); break; default: diff --git a/src/C-interface/ellblock/bml_allocate_ellblock.h b/src/C-interface/ellblock/bml_allocate_ellblock.h index 5ebdbcef7..955175d06 100644 --- a/src/C-interface/ellblock/bml_allocate_ellblock.h +++ b/src/C-interface/ellblock/bml_allocate_ellblock.h @@ -158,30 +158,35 @@ bml_matrix_ellblock_t *bml_block_matrix_ellblock( bml_matrix_precision_t matrix_precision, int NB, int MB, + int M, int *bsizes, bml_distribution_mode_t distrib_mode); bml_matrix_ellblock_t *bml_block_matrix_ellblock_single_real( int NB, int MB, + int M, int *bsizes, bml_distribution_mode_t distrib_mode); bml_matrix_ellblock_t *bml_block_matrix_ellblock_double_real( int NB, int MB, + int M, int *bsizes, bml_distribution_mode_t distrib_mode); bml_matrix_ellblock_t *bml_block_matrix_ellblock_single_complex( int NB, int MB, + int M, int *bsizes, bml_distribution_mode_t distrib_mode); bml_matrix_ellblock_t *bml_block_matrix_ellblock_double_complex( int NB, int MB, + int M, int *bsizes, bml_distribution_mode_t distrib_mode); @@ -204,4 +209,38 @@ int bml_get_nb( ); int count_nelements( bml_matrix_ellblock_t * A); + +void *bml_allocate_block_ellblock_single_real( + bml_matrix_ellblock_t * A, + const int ib, + const int nelements); +void *bml_allocate_block_ellblock_double_real( + bml_matrix_ellblock_t * A, + const int ib, + const int nelements); +void *bml_allocate_block_ellblock_single_complex( + bml_matrix_ellblock_t * A, + const int ib, + const int nelements); +void *bml_allocate_block_ellblock_double_complex( + bml_matrix_ellblock_t * A, + const int ib, + const int nelements); + +void bml_free_block_ellblock_single_real( + bml_matrix_ellblock_t * A, + const int ib, + const int jb); +void bml_free_block_ellblock_double_real( + bml_matrix_ellblock_t * A, + const int ib, + const int jb); +void bml_free_block_ellblock_single_complex( + bml_matrix_ellblock_t * A, + const int ib, + const int jb); +void bml_free_block_ellblock_double_complex( + bml_matrix_ellblock_t * A, + const int ib, + const int jb); #endif diff --git a/src/C-interface/ellblock/bml_allocate_ellblock_typed.c b/src/C-interface/ellblock/bml_allocate_ellblock_typed.c index 2316defd8..ea4c525c9 100644 --- a/src/C-interface/ellblock/bml_allocate_ellblock_typed.c +++ b/src/C-interface/ellblock/bml_allocate_ellblock_typed.c @@ -17,9 +17,105 @@ #include #endif +void *TYPED_FUNC( + bml_allocate_block_ellblock) ( + bml_matrix_ellblock_t * A, + const int ib, + const int nelements) +{ +#ifdef BML_ELLBLOCK_USE_MEMPOOL + void *allocation = A->memory_pool_ptr[ib]; + assert(allocation != NULL); + + // update block row ib pointer for next call + A->memory_pool_ptr[ib] = ((REAL_T **) A->memory_pool_ptr)[ib] + nelements; + + return allocation; +#else + //return calloc(nelements, sizeof(REAL_T)); + return bml_noinit_allocate_memory(nelements * sizeof(REAL_T)); +#endif +} + +void TYPED_FUNC( + bml_free_block_ellblock) ( + bml_matrix_ellblock_t * A, + const int ib, + const int jb) +{ +#ifdef BML_ELLBLOCK_USE_MEMPOOL + int shift = 0; + REAL_T *dst = NULL; + REAL_T *src = NULL; + int nelements = 0; + // loop over allocated blocks in row + for (int jp = 0; jp < A->nnzb[ib]; jp++) + { + int ind = ROWMAJOR(ib, jp, A->NB, A->MB); + int j = A->indexb[ind]; + // shift pointers when block to be freed is reached + if (j == jb) + { + shift = A->bsize[ib] * A->bsize[jb]; + //printf("Remove block %d %d %d\n", ib, jb, shift); + // simply set pointer to NULL if last allocated block in row + if (jp == A->nnzb[ib] - 1) + { + A->ptr_value[ind] = NULL; + break; + } + // save pointers to beginning and end of memory + // to be freed + dst = A->ptr_value[ind]; + src = A->ptr_value[ind + 1]; + + // shift pointers for subsequent blocks + for (int jpp = jp + 1; jpp < A->nnzb[ib]; jpp++) + { + int ind = ROWMAJOR(ib, jpp, A->NB, A->MB); + + // count elements to be shifted in memory + int jbb = A->indexb[ind]; + nelements += A->bsize[ib] * A->bsize[jbb]; + //printf("%d <- %d\n", ind-1, ind); + A->indexb[ind - 1] = A->indexb[ind]; + A->ptr_value[ind - 1] = (REAL_T *) A->ptr_value[ind] - shift; + } + A->ptr_value[A->nnzb[ib] - 1] = NULL; + break; + } + } + // make sure we found block to deallocate + assert(shift > 0); + + // move data in memory to reflect shift in pointers + if (nelements > 0) + memmove(dst, src, nelements * sizeof(REAL_T)); +#else + for (int jp = 0; jp < A->nnzb[ib]; jp++) + { + int ind = ROWMAJOR(ib, jp, A->NB, A->MB); + int j = A->indexb[ind]; + if (j == jb) + { + bml_free_memory(A->ptr_value[ind]); + for (int jpp = jp + 1; jpp < A->nnzb[ib]; jpp++) + { + int ind = ROWMAJOR(ib, jpp, A->NB, A->MB); + A->indexb[ind - 1] = A->indexb[ind]; + A->ptr_value[ind - 1] = A->ptr_value[ind]; + } + A->ptr_value[A->nnzb[ib] - 1] = NULL; + break; + } + } +#endif + A->nnzb[ib]--; +} + /** Clear a matrix. * - * Numbers of non-zeroes, indeces, and values are set to zero. + * Numbers of non-zeroes/row are set to zero. * * \ingroup allocate_group * @@ -29,12 +125,18 @@ void TYPED_FUNC( bml_clear_ellblock) ( bml_matrix_ellblock_t * A) { +#ifdef BML_ELLBLOCK_USE_MEMPOOL + for (int ib = 0; ib < A->NB; ib++) + A->memory_pool_ptr[ib] = + (REAL_T *) A->memory_pool + A->memory_pool_offsets[ib]; +#else for (int ib = 0; ib < A->NB; ib++) for (int jp = 0; jp < A->nnzb[ib]; jp++) { int ind = ROWMAJOR(ib, jp, A->NB, A->MB); bml_free_memory(A->ptr_value[ind]); } +#endif memset(A->nnzb, 0, A->NB * sizeof(int)); } @@ -45,9 +147,11 @@ bml_matrix_ellblock_t distrib_mode) { int N = matrix_dimension.N_rows; + int M = matrix_dimension.N_nz_max; return TYPED_FUNC(bml_block_matrix_ellblock) (bml_get_nb(), bml_get_mb(), - bml_get_block_sizes(N, 0), + M, bml_get_block_sizes(N, + 0), distrib_mode); } @@ -70,6 +174,7 @@ bml_matrix_ellblock_t *TYPED_FUNC( bml_block_matrix_ellblock) ( int NB, int MB, + int M, int *bsize, bml_distribution_mode_t distrib_mode) { @@ -78,6 +183,11 @@ bml_matrix_ellblock_t *TYPED_FUNC( for (int ib = 0; ib < NB; ib++) assert(bsize[ib] < 1e6); + int N = 0; + for (int ib = 0; ib < NB; ib++) + N += bsize[ib]; + assert(N >= M); + bml_matrix_ellblock_t *A = bml_allocate_memory(sizeof(bml_matrix_ellblock_t)); A->matrix_type = ellblock; @@ -86,15 +196,41 @@ bml_matrix_ellblock_t *TYPED_FUNC( A->MB = MB; A->bsize = bml_allocate_memory(sizeof(int) * NB); memcpy(A->bsize, bsize, NB * sizeof(int)); + A->distribution_mode = distrib_mode; A->indexb = bml_allocate_memory(sizeof(int) * NB * MB); + memset(A->indexb, -1, sizeof(int) * NB * MB); A->nnzb = bml_allocate_memory(sizeof(int) * NB); - A->ptr_value = bml_allocate_memory(sizeof(REAL_T *) * NB * MB); - int N = 0; + // allocate memory for matrix elements + // we make sure we have enough memory for at least 3 blocks/row + int maxbsize = 0; for (int ib = 0; ib < NB; ib++) - N += bsize[ib]; + maxbsize = MAX(maxbsize, bsize[ib]); + int ncols_storage = M; + if (ncols_storage < 3 * maxbsize) + ncols_storage = 3 * maxbsize; +#ifdef BML_ELLBLOCK_USE_MEMPOOL + A->memory_pool = + (REAL_T *) bml_allocate_memory(sizeof(REAL_T) * N * ncols_storage); + + // allocate memory for pointers to allocated blocks + A->memory_pool_offsets = (int *) bml_allocate_memory(sizeof(int) * NB); + A->memory_pool_offsets[0] = 0; + for (int ib = 1; ib < NB; ib++) + A->memory_pool_offsets[ib] = A->memory_pool_offsets[ib - 1] + + bsize[ib - 1] * ncols_storage; + A->memory_pool_ptr = bml_allocate_memory(sizeof(REAL_T *) * NB); + for (int ib = 0; ib < NB; ib++) + A->memory_pool_ptr[ib] = + (REAL_T *) (A->memory_pool) + A->memory_pool_offsets[ib]; +#endif + A->ptr_value = bml_allocate_memory(sizeof(REAL_T *) * NB * MB); + for (int i = 0; i < NB * MB; i++) + A->ptr_value[i] = NULL; + A->N = N; + A->M = M; //printf("bml_block_matrix_ellblock %d %d\n",NB,MB); if (bml_get_mb() == 0) { @@ -115,7 +251,8 @@ bml_matrix_ellblock_t *TYPED_FUNC( int nb = bml_get_nb(); bml_matrix_ellblock_t *A = - TYPED_FUNC(bml_block_matrix_ellblock) (nb, mb, bsize, distrib_mode); + TYPED_FUNC(bml_block_matrix_ellblock) (nb, mb, M, bsize, + distrib_mode); return A; } @@ -184,7 +321,8 @@ bml_matrix_ellblock_t *TYPED_FUNC( int nb = bml_get_nb(); bml_matrix_ellblock_t *A = - TYPED_FUNC(bml_block_matrix_ellblock) (nb, mb, bsize, distrib_mode); + TYPED_FUNC(bml_block_matrix_ellblock) (nb, mb, M, bsize, + distrib_mode); //now fill with random values int NB = A->NB; @@ -204,8 +342,8 @@ bml_matrix_ellblock_t *TYPED_FUNC( //allocate storage int nelements = bsize[ib] * bsize[jb]; - A_ptr_value[ind] - = bml_noinit_allocate_memory(nelements * sizeof(REAL_T)); + A_ptr_value[ind] = + TYPED_FUNC(bml_allocate_block_ellblock) (A, ib, nelements); REAL_T *A_value = A_ptr_value[ind]; assert(A_value != NULL); @@ -249,7 +387,8 @@ bml_matrix_ellblock_t *TYPED_FUNC( assert(mb > 0); bml_matrix_ellblock_t *A = - TYPED_FUNC(bml_block_matrix_ellblock) (nb, mb, bsize, distrib_mode); + TYPED_FUNC(bml_block_matrix_ellblock) (nb, mb, M, bsize, + distrib_mode); REAL_T **A_ptr_value = (REAL_T **) A->ptr_value; @@ -258,10 +397,13 @@ bml_matrix_ellblock_t *TYPED_FUNC( for (int ib = 0; ib < NB; ib++) { - int nelements = bsize[ib] * bsize[ib]; int ind = ROWMAJOR(ib, 0, NB, MB); - A_ptr_value[ind] = bml_allocate_memory(nelements * sizeof(REAL_T)); + int nelements = bsize[ib] * bsize[ib]; + A_ptr_value[ind] = + TYPED_FUNC(bml_allocate_block_ellblock) (A, ib, nelements); + REAL_T *A_value = A_ptr_value[ind]; + memset(A_value, 0, nelements * sizeof(REAL_T)); for (int ii = 0; ii < bsize[ib]; ii++) { A_value[ROWMAJOR(ii, ii, bsize[ib], bsize[ib])] = 1.0; diff --git a/src/C-interface/ellblock/bml_convert_ellblock_typed.c b/src/C-interface/ellblock/bml_convert_ellblock_typed.c index dda5b93e4..33de04bf6 100644 --- a/src/C-interface/ellblock/bml_convert_ellblock_typed.c +++ b/src/C-interface/ellblock/bml_convert_ellblock_typed.c @@ -32,7 +32,8 @@ bml_matrix_ellblock_t *TYPED_FUNC( int nb = bml_get_nb(); bml_matrix_ellblock_t *B = - TYPED_FUNC(bml_block_matrix_ellblock) (nb, nb, bsize, distrib_mode); + TYPED_FUNC(bml_block_matrix_ellblock) (nb, nb, M, bsize, + distrib_mode); int NB = B->NB; int *offset = malloc(NB * sizeof(int)); @@ -50,7 +51,7 @@ bml_matrix_ellblock_t *TYPED_FUNC( int nelements = bsize[ib] * bsize[jb]; int ind = ROWMAJOR(ib, jb, NB, NB); B_ptr_value[ind] - = bml_noinit_allocate_memory(nelements * sizeof(REAL_T));; + = TYPED_FUNC(bml_allocate_block_ellblock) (B, ib, nelements); B->indexb[ind] = jb; REAL_T *block = malloc(nelements * sizeof(REAL_T)); diff --git a/src/C-interface/ellblock/bml_copy_ellblock_typed.c b/src/C-interface/ellblock/bml_copy_ellblock_typed.c index b04162e3f..eff8bf1f8 100644 --- a/src/C-interface/ellblock/bml_copy_ellblock_typed.c +++ b/src/C-interface/ellblock/bml_copy_ellblock_typed.c @@ -23,7 +23,7 @@ bml_matrix_ellblock_t * TYPED_FUNC(bml_copy_ellblock_new) (bml_matrix_ellblock_t * A) { bml_matrix_ellblock_t *B = - TYPED_FUNC(bml_block_matrix_ellblock) (A->NB, A->MB, A->bsize, + TYPED_FUNC(bml_block_matrix_ellblock) (A->NB, A->MB, A->M, A->bsize, A->distribution_mode); int NB = A->NB; @@ -36,23 +36,25 @@ bml_matrix_ellblock_t REAL_T **B_ptr_value = (REAL_T **) B->ptr_value; memcpy(B->nnzb, A->nnzb, sizeof(int) * A->NB); - memcpy(B->bsize, A->bsize, sizeof(int) * A->NB); - memcpy(B_indexb, A_indexb, NB * MB * sizeof(int)); #pragma omp parallel for for (int ib = 0; ib < NB; ib++) { + assert(B->bsize[ib] > 0); + assert(B->bsize[ib] < 10); for (int jp = 0; jp < A->nnzb[ib]; jp++) { int ind = ROWMAJOR(ib, jp, NB, MB); int jb = B_indexb[ind]; + assert(jb < NB); + assert(B_ptr_value[ind] == NULL); + int nelements = A->bsize[ib] * A->bsize[jb]; B_ptr_value[ind] = - bml_noinit_allocate_memory(A->bsize[ib] * A->bsize[jb] * - sizeof(REAL_T)); - + TYPED_FUNC(bml_allocate_block_ellblock) (B, ib, nelements); + assert(B_ptr_value[ind] != NULL); memcpy(B_ptr_value[ind], A_ptr_value[ind], - A->bsize[ib] * A->bsize[jb] * sizeof(REAL_T)); + nelements * sizeof(REAL_T)); } } @@ -96,7 +98,10 @@ void TYPED_FUNC( int nelements = A->bsize[ib] * A->bsize[jb]; if (B_ptr_value[ind] == NULL) B_ptr_value[ind] - = bml_noinit_allocate_memory(nelements * sizeof(REAL_T)); + = + TYPED_FUNC(bml_allocate_block_ellblock) (B, ib, + nelements); + assert(B_ptr_value[ind] != NULL); memcpy(B_ptr_value[ind], A_ptr_value[ind], nelements * sizeof(REAL_T)); } diff --git a/src/C-interface/ellblock/bml_import_ellblock_typed.c b/src/C-interface/ellblock/bml_import_ellblock_typed.c index d1991c8fe..e75e5d578 100644 --- a/src/C-interface/ellblock/bml_import_ellblock_typed.c +++ b/src/C-interface/ellblock/bml_import_ellblock_typed.c @@ -85,7 +85,9 @@ bml_matrix_ellblock_t if (norminf > threshold) { int ind = ROWMAJOR(ib, A_nnzb[ib], NB, MB); - A_bml->ptr_value[ind] = malloc(nelements * sizeof(REAL_T)); + A_bml->ptr_value[ind] = + TYPED_FUNC(bml_allocate_block_ellblock) (A_bml, ib, + nelements); REAL_T *A_value = A_bml->ptr_value[ind]; assert(A_value != NULL); memcpy(A_value, A_ij, nelements * sizeof(REAL_T)); diff --git a/src/C-interface/ellblock/bml_multiply_ellblock_typed.c b/src/C-interface/ellblock/bml_multiply_ellblock_typed.c index cf94011e5..0980d0bb0 100644 --- a/src/C-interface/ellblock/bml_multiply_ellblock_typed.c +++ b/src/C-interface/ellblock/bml_multiply_ellblock_typed.c @@ -62,7 +62,8 @@ void TYPED_FUNC( else { bml_matrix_ellblock_t *A2 = - TYPED_FUNC(bml_block_matrix_ellblock) (C->NB, C->MB, C->bsize, + TYPED_FUNC(bml_block_matrix_ellblock) (C->NB, C->MB, C->M, + C->bsize, C->distribution_mode); if (A != NULL && A == B) @@ -238,7 +239,9 @@ void *TYPED_FUNC( int ind = ROWMAJOR(ib, ll, NB, MB); assert(ind < NB * MB); if (X2_ptr_value[ind] == NULL) - X2_ptr_value[ind] = malloc(nelements * sizeof(REAL_T)); + X2_ptr_value[ind] = + TYPED_FUNC(bml_allocate_block_ellblock) (X2, ib, + nelements); REAL_T *X2_value = X2_ptr_value[ind]; assert(X2_value != NULL); memcpy(X2_value, xtmp, nelements * sizeof(REAL_T)); @@ -394,7 +397,9 @@ void TYPED_FUNC( int nelements = bsize[ib] * bsize[jp]; int ind = ROWMAJOR(ib, ll, NB, C->MB); C_ptr_value[ind] - = bml_noinit_allocate_memory(nelements * sizeof(REAL_T)); + = + TYPED_FUNC(bml_allocate_block_ellblock) (C, ib, + nelements); memcpy(C_ptr_value[ind], xtmp, nelements * sizeof(REAL_T)); C_indexb[ind] = jp; ll++; diff --git a/src/C-interface/ellblock/bml_setters_ellblock_typed.c b/src/C-interface/ellblock/bml_setters_ellblock_typed.c index 3aec3a84f..9bcfdf916 100644 --- a/src/C-interface/ellblock/bml_setters_ellblock_typed.c +++ b/src/C-interface/ellblock/bml_setters_ellblock_typed.c @@ -5,6 +5,7 @@ #include "../bml_logger.h" #include "../bml_types.h" #include "bml_setters_ellblock.h" +#include "bml_allocate_ellblock.h" #include "bml_types_ellblock.h" #include "bml_utilities_ellblock.h" @@ -63,6 +64,7 @@ void TYPED_FUNC( if (A_indexb[ind] == jb) { REAL_T *A_value = A_ptr_value[ind]; + assert(A_value != NULL); A_value[ROWMAJOR(ii, jj, A_bsize[ib], A_bsize[jb])] = *((REAL_T *) element); block_found = 1; @@ -77,11 +79,15 @@ void TYPED_FUNC( int ind = ROWMAJOR(ib, A_nnzb[ib], A->NB, A->MB); assert(A_ptr_value[ind] == NULL); - //printf("allocate new block ib=%d, ind=%d\n", ib, ind); - A_ptr_value[ind] = calloc(A_bsize[ib] * A_bsize[jb], sizeof(REAL_T)); + // printf("allocate new block ib=%d, ind=%d\n", ib, ind); + int nelements = A_bsize[ib] * A_bsize[jb]; + A_ptr_value[ind] = + TYPED_FUNC(bml_allocate_block_ellblock) (A, ib, nelements); A_indexb[ind] = jb; REAL_T *A_value = A_ptr_value[ind]; + assert(A_value != NULL); + memset(A_value, 0, nelements * sizeof(REAL_T)); A_value[ROWMAJOR(ii, jj, A_bsize[ib], A_bsize[jb])] = *((REAL_T *) element); A_nnzb[ib]++; @@ -160,7 +166,6 @@ void TYPED_FUNC( jj -= A_bsize[jb]; jb++; } - printf("jb=%d\n", jb); int block_found = 0; for (int jp = 0; jp < A_nnzb[ib]; jp++) { @@ -178,8 +183,11 @@ void TYPED_FUNC( int ind = ROWMAJOR(ib, A_nnzb[ib], A->NB, A->MB); int nelements = A_bsize[ib] * A_bsize[jb]; A_ptr_value[ind] - = bml_allocate_memory(nelements * sizeof(REAL_T)); + = + TYPED_FUNC(bml_allocate_block_ellblock) (A, ib, + nelements); REAL_T *A_value = A_ptr_value[ind]; + memset(A_value, 0, nelements * sizeof(REAL_T)); assert(A_value != NULL); A_value[ROWMAJOR(ii, jj, A_bsize[ib], A_bsize[jb])] = row[k]; A_nnzb[ib]++; @@ -256,11 +264,13 @@ void TYPED_FUNC( if (normdiag > threshold) { int ind = ROWMAJOR(ib, A_nnzb[ib], A->NB, A->MB); + int nelemnts = A_bsize[ib] * A_bsize[ib]; A_ptr_value[ind] = - calloc(A_bsize[ib] * A_bsize[ib], sizeof(REAL_T)); + TYPED_FUNC(bml_allocate_block_ellblock) (A, ib, nelemnts); A_indexb[ind] = ib; REAL_T *A_value = A_ptr_value[ind]; + memset(A_value, 0, nelemnts * sizeof(REAL_T)); for (int ii = 0; ii < A_bsize[ib]; ii++) { A_value[ROWMAJOR(ii, ii, A_bsize[ib], A_bsize[ib])] @@ -296,7 +306,6 @@ void TYPED_FUNC( if (A->indexb[ind] == jb) { int n2 = A->bsize[ib] * A->bsize[jb]; - printf("n2=%d, ind=%d\n", n2, ind); REAL_T *A_value = A->ptr_value[ind]; assert(A_value != NULL); memcpy(A_value, elements, n2 * sizeof(REAL_T)); diff --git a/src/C-interface/ellblock/bml_threshold_ellblock_typed.c b/src/C-interface/ellblock/bml_threshold_ellblock_typed.c index 400b32d39..0199af695 100644 --- a/src/C-interface/ellblock/bml_threshold_ellblock_typed.c +++ b/src/C-interface/ellblock/bml_threshold_ellblock_typed.c @@ -35,7 +35,7 @@ bml_matrix_ellblock_t int *bsize = A->bsize; bml_matrix_ellblock_t *B = - TYPED_FUNC(bml_block_matrix_ellblock) (NB, MB, bsize, + TYPED_FUNC(bml_block_matrix_ellblock) (NB, MB, A->M, bsize, A->distribution_mode); REAL_T **A_ptr_value = (REAL_T **) A->ptr_value; @@ -61,7 +61,9 @@ bml_matrix_ellblock_t int nelements = bsize[ib] * bsize[jb]; int indB = ROWMAJOR(ib, B_nnzb[ib], NB, MB); B_ptr_value[indB] - = bml_noinit_allocate_memory(nelements * sizeof(REAL_T)); + = + TYPED_FUNC(bml_allocate_block_ellblock) (B, ib, + nelements); REAL_T *B_value = B_ptr_value[indB]; memcpy(B_value, A_value, nelements * sizeof(REAL_T)); for (int ii = 0; ii < bsize[ib]; ii++) @@ -107,7 +109,6 @@ void TYPED_FUNC( for (int ib = 0; ib < NB; ib++) { - rlen = 0; for (int jp = 0; jp < A_nnzb[ib]; jp++) { int ind = ROWMAJOR(ib, jp, NB, MB); @@ -118,13 +119,6 @@ void TYPED_FUNC( if (is_above_threshold(normA, threshold)) { - if (rlen < jp) - { - ind = ROWMAJOR(ib, rlen, NB, MB); - A_ptr_value[ind] = A_ptr_value[ROWMAJOR(ib, jp, NB, MB)]; - A_indexb[ind] = A_indexb[ROWMAJOR(ib, jp, NB, MB)]; - jb = A_indexb[ind]; - } //apply thresholding within block REAL_T *B_value = A_ptr_value[ind]; for (int ii = 0; ii < bsize[ib]; ii++) @@ -136,13 +130,13 @@ void TYPED_FUNC( B_value[index] = 0.; } } - rlen++; } else { - free(A_ptr_value[ind]); + //printf("remove block %d %d\n",ib,jb); + TYPED_FUNC(bml_free_block_ellblock) (A, ib, jb); + jp--; } } - A_nnzb[ib] = rlen; } } diff --git a/src/C-interface/ellblock/bml_transpose_ellblock_typed.c b/src/C-interface/ellblock/bml_transpose_ellblock_typed.c index 4bb4cf594..8675e9260 100644 --- a/src/C-interface/ellblock/bml_transpose_ellblock_typed.c +++ b/src/C-interface/ellblock/bml_transpose_ellblock_typed.c @@ -33,9 +33,10 @@ bml_matrix_ellblock_t int *bsize = A->bsize; bml_distribution_mode_t distmode = A->distribution_mode; - bml_matrix_ellblock_t *B = TYPED_FUNC(bml_block_matrix_ellblock) (NB, MB, - bsize, - distmode); + bml_matrix_ellblock_t *B = + TYPED_FUNC(bml_block_matrix_ellblock) (NB, MB, A->M, + bsize, + distmode); REAL_T **A_ptr_value = (REAL_T **) A->ptr_value; int *A_indexb = A->indexb; @@ -52,11 +53,11 @@ bml_matrix_ellblock_t int indA = ROWMAJOR(ib, jp, NB, MB); int jb = A_indexb[indA]; int indB = ROWMAJOR(jb, B_nnzb[jb], NB, MB); - int nelements = bsize[ib] * bsize[jb]; //add a new block in B B_indexb[indB] = ib; + int nelements = bsize[ib] * bsize[jb]; B_ptr_value[indB] = - bml_noinit_allocate_memory(nelements * sizeof(REAL_T)); + TYPED_FUNC(bml_allocate_block_ellblock) (B, ib, nelements); B_nnzb[jb]++; //transpose block REAL_T *B_value = B_ptr_value[indB]; diff --git a/src/C-interface/ellblock/bml_types_ellblock.h b/src/C-interface/ellblock/bml_types_ellblock.h index 32e409df5..c5de1e4bf 100644 --- a/src/C-interface/ellblock/bml_types_ellblock.h +++ b/src/C-interface/ellblock/bml_types_ellblock.h @@ -20,6 +20,14 @@ struct bml_matrix_ellblock_t int NB; /** The max. number of blocks per row. */ int MB; +#ifdef BML_ELLBLOCK_USE_MEMPOOL + /** Storage for matrix elements/blocks. */ + void *memory_pool; + /** offsets for storage of row blocks */ + int *memory_pool_offsets; + /** Ptr to last stored block in row block */ + void **memory_pool_ptr; +#endif /** Pointers to blocks of values */ void **ptr_value; /** The block index matrix. */ diff --git a/src/C-interface/ellblock/bml_utilities_ellblock_typed.c b/src/C-interface/ellblock/bml_utilities_ellblock_typed.c index 504ebd1b9..8880905cf 100644 --- a/src/C-interface/ellblock/bml_utilities_ellblock_typed.c +++ b/src/C-interface/ellblock/bml_utilities_ellblock_typed.c @@ -6,6 +6,7 @@ #include "../bml_utilities.h" #include "bml_types_ellblock.h" #include "bml_utilities_ellblock.h" +#include "bml_allocate_ellblock.h" #include #include @@ -30,7 +31,6 @@ void TYPED_FUNC( FILE *hFile; char header1[20], header2[20], header3[20], header4[20], header5[20]; int hdimx, nnz, irow, icol; - double real_part, imaginary_part; REAL_T value; int NB = A->NB; @@ -71,6 +71,7 @@ void TYPED_FUNC( LOG_ERROR("read error\n"); } #elif defined(SINGLE_COMPLEX) || defined(DOUBLE_COMPLEX) + double real_part, imaginary_part; if (fscanf (hFile, "%d %d %le %le\n", &irow, &icol, &real_part, &imaginary_part) != 4) @@ -118,12 +119,14 @@ void TYPED_FUNC( { //printf("Add new block %d, %d\n",ib, jb); assert(A_nnzb[ib] < MB); - int nelements = A_bsize[ib] * A_bsize[jb]; int ind = ROWMAJOR(ib, A_nnzb[ib], NB, MB); A_indexb[ind] = jb; A_nnzb[ib]++; - A_ptr_value[ind] = calloc(nelements, sizeof(REAL_T)); + int nelements = A_bsize[ib] * A_bsize[jb]; + A_ptr_value[ind] = + TYPED_FUNC(bml_allocate_block_ellblock) (A, ib, nelements); REAL_T *A_value = A_ptr_value[ind]; + memset(A_value, 0, nelements * sizeof(REAL_T)); assert(A_value != NULL); A_value[ROWMAJOR(irow, icol, A_bsize[ib], A_bsize[jb])] = value; } diff --git a/src/Fortran-interface/bml_allocate_m.F90 b/src/Fortran-interface/bml_allocate_m.F90 index 962a9f919..5f8c28cc3 100644 --- a/src/Fortran-interface/bml_allocate_m.F90 +++ b/src/Fortran-interface/bml_allocate_m.F90 @@ -103,12 +103,12 @@ subroutine bml_zero_matrix(matrix_type, element_type, element_precision, & end subroutine bml_zero_matrix subroutine bml_block_matrix(matrix_type, element_type, element_precision, & - & nb, mb, bsizes, a, distrib_mode) + & nb, mb, m, bsizes, a, distrib_mode) character(len=*), intent(in) :: matrix_type, element_type character(len=*), optional, intent(in) :: distrib_mode integer, intent(in) :: element_precision - integer(C_INT), intent(in) :: nb, mb + integer(C_INT), intent(in) :: nb, mb, m integer, allocatable, intent(in) :: bsizes(:) type(bml_matrix_t), intent(inout) :: a @@ -121,10 +121,9 @@ subroutine bml_block_matrix(matrix_type, element_type, element_precision, & endif call bml_deallocate(a) - print*,'bsizes=',bsizes(1) a%ptr = bml_block_matrix_C(get_matrix_id(matrix_type), & & get_element_id(element_type, element_precision), & - & nb, mb, bsizes, get_dmode_id(distrib_mode_)) + & nb, mb, m, bsizes, get_dmode_id(distrib_mode_)) end subroutine bml_block_matrix diff --git a/src/Fortran-interface/bml_c_interface_m.F90 b/src/Fortran-interface/bml_c_interface_m.F90 index 26e379f45..d662007d0 100644 --- a/src/Fortran-interface/bml_c_interface_m.F90 +++ b/src/Fortran-interface/bml_c_interface_m.F90 @@ -597,13 +597,14 @@ function bml_zero_matrix_C(matrix_type, matrix_precision, n, m, & type(C_PTR) :: bml_zero_matrix_C end function bml_zero_matrix_C - function bml_block_matrix_C(matrix_type, matrix_precision, nb, mb, bsizes, & - & distrib_mode) bind(C, name="bml_block_matrix") + function bml_block_matrix_C(matrix_type, matrix_precision, nb, mb, m, & + & bsizes, distrib_mode) bind(C, name="bml_block_matrix") import :: C_INT, C_PTR integer(C_INT), value, intent(in) :: matrix_type integer(C_INT), value, intent(in) :: matrix_precision integer(C_INT), value, intent(in) :: nb integer(C_INT), value, intent(in) :: mb + integer(C_INT), value, intent(in) :: m integer(C_INT), intent(in), dimension(*) :: bsizes integer(C_INT), value, intent(in) :: distrib_mode type(C_PTR) :: bml_block_matrix_C