From 7bfdada0a087d5382e07187bd7f3eaa70892f4f9 Mon Sep 17 00:00:00 2001 From: Jean-Luc Fattebert Date: Tue, 22 Aug 2023 21:17:07 -0400 Subject: [PATCH] Implement noinit allocation for dense and distributed2d --- src/C-interface/bml_allocate.c | 11 ++-- src/C-interface/dense/bml_allocate_dense.c | 43 +++++++++++++++ src/C-interface/dense/bml_allocate_dense.h | 25 +++++++++ .../dense/bml_allocate_dense_typed.c | 53 +++++++++++++++++++ src/C-interface/dense/bml_copy_dense_typed.c | 4 +- .../dense/bml_transpose_dense_typed.c | 2 +- .../bml_allocate_distributed2d.c | 30 +++++++++++ .../bml_allocate_distributed2d.h | 6 +++ .../distributed2d/bml_copy_distributed2d.c | 6 +-- .../distributed2d/bml_scale_distributed2d.c | 6 +-- .../bml_threshold_distributed2d.c | 6 +-- 11 files changed, 175 insertions(+), 17 deletions(-) diff --git a/src/C-interface/bml_allocate.c b/src/C-interface/bml_allocate.c index e31888a05..98921c189 100644 --- a/src/C-interface/bml_allocate.c +++ b/src/C-interface/bml_allocate.c @@ -276,16 +276,17 @@ bml_noinit_rectangular_matrix( matrix_dimension.N_rows); #ifdef BML_USE_MPI if (distrib_mode == distributed) - return bml_zero_matrix_distributed2d(matrix_type, matrix_precision, - matrix_dimension.N_rows, - matrix_dimension.N_nz_max); + return bml_noinit_matrix_distributed2d(matrix_type, matrix_precision, + matrix_dimension.N_rows, + matrix_dimension.N_nz_max); else #endif switch (matrix_type) { case dense: - return bml_zero_matrix_dense(matrix_precision, - matrix_dimension, distrib_mode); + return bml_noinit_matrix_dense(matrix_precision, + matrix_dimension, + distrib_mode); break; case ellpack: return bml_noinit_matrix_ellpack(matrix_precision, diff --git a/src/C-interface/dense/bml_allocate_dense.c b/src/C-interface/dense/bml_allocate_dense.c index b89a62f43..57631d561 100644 --- a/src/C-interface/dense/bml_allocate_dense.c +++ b/src/C-interface/dense/bml_allocate_dense.c @@ -99,6 +99,49 @@ bml_clear_dense( } } +/** Allocate an uninitialized dense matrix. 
+ * + * \ingroup allocate_group + * + * \param matrix_precision The precision of the matrix; it selects + * which typed allocator is called. + * \param matrix_dimension The matrix size. + * \param distrib_mode The distribution mode. + * \return The matrix, or NULL if no case matches the precision (LOG_ERROR is called first). + */ +bml_matrix_dense_t * +bml_noinit_matrix_dense( + bml_matrix_precision_t matrix_precision, + bml_matrix_dimension_t matrix_dimension, + bml_distribution_mode_t distrib_mode) +{ + switch (matrix_precision) + { + case single_real: + return bml_noinit_matrix_dense_single_real(matrix_dimension, + distrib_mode); + break; + case double_real: + return bml_noinit_matrix_dense_double_real(matrix_dimension, + distrib_mode); + break; +#ifdef BML_COMPLEX + case single_complex: + return bml_noinit_matrix_dense_single_complex(matrix_dimension, + distrib_mode); + break; + case double_complex: + return bml_noinit_matrix_dense_double_complex(matrix_dimension, + distrib_mode); + break; +#endif + default: + LOG_ERROR("unknown precision (%d)\n", matrix_precision); + break; + } + return NULL; +} + /** Allocate the zero matrix. * * Note that the matrix \f$ a \f$ will be newly allocated. 
If it is diff --git a/src/C-interface/dense/bml_allocate_dense.h b/src/C-interface/dense/bml_allocate_dense.h index 7798dc4dc..594bae45b 100644 --- a/src/C-interface/dense/bml_allocate_dense.h +++ b/src/C-interface/dense/bml_allocate_dense.h @@ -33,6 +33,31 @@ void bml_clear_dense_single_complex( void bml_clear_dense_double_complex( bml_matrix_dense_t * A); +bml_matrix_dense_t *bml_noinit_matrix_dense( + bml_matrix_precision_t matrix_precision, + bml_matrix_dimension_t matrix_dimension, + bml_distribution_mode_t distrib_mode); + +bml_matrix_dense_t *bml_noinit_matrix_dense_single_real( + bml_matrix_dimension_t matrix_dimension, + bml_distribution_mode_t distrib_mode); + +bml_matrix_dense_t *bml_noinit_matrix_dense_double_real( + bml_matrix_dimension_t matrix_dimension, + bml_distribution_mode_t distrib_mode); + +bml_matrix_dense_t + * bml_noinit_matrix_dense_single_complex(bml_matrix_dimension_t + matrix_dimension, + bml_distribution_mode_t + distrib_mode); + +bml_matrix_dense_t + * bml_noinit_matrix_dense_double_complex(bml_matrix_dimension_t + matrix_dimension, + bml_distribution_mode_t + distrib_mode); + bml_matrix_dense_t *bml_zero_matrix_dense( bml_matrix_precision_t matrix_precision, bml_matrix_dimension_t matrix_dimension, diff --git a/src/C-interface/dense/bml_allocate_dense_typed.c b/src/C-interface/dense/bml_allocate_dense_typed.c index 36fc5dcdf..5f0f6f15a 100644 --- a/src/C-interface/dense/bml_allocate_dense_typed.c +++ b/src/C-interface/dense/bml_allocate_dense_typed.c @@ -78,6 +78,59 @@ void TYPED_FUNC( #endif } +/** Allocate a dense matrix with uninitialized values. + * + * \ingroup allocate_group + * + * The precision is not a parameter here: A->matrix_precision is set + * from the MATRIX_PRECISION macro. + * \param matrix_dimension The matrix size; only N_rows is read. + * \param distrib_mode The distribution mode. + * \return The matrix. 
+ */ +bml_matrix_dense_t *TYPED_FUNC( + bml_noinit_matrix_dense) ( + bml_matrix_dimension_t matrix_dimension, + bml_distribution_mode_t distrib_mode) +{ + bml_matrix_dense_t *A = + bml_noinit_allocate_memory(sizeof(bml_matrix_dense_t)); + A->matrix_type = dense; + A->matrix_precision = MATRIX_PRECISION; + A->N = matrix_dimension.N_rows; + A->distribution_mode = distrib_mode; +#ifdef BML_USE_MAGMA + A->ld = magma_roundup(matrix_dimension.N_rows, 32); + int device; + magma_getdevice(&device); + bml_queue_create(device); + magma_int_t ret = MAGMA(malloc) ((MAGMA_T **) & A->matrix, + A->ld * matrix_dimension.N_rows); + assert(ret == MAGMA_SUCCESS); /* NOTE(review): compiled out under NDEBUG, leaving the allocation unchecked */ +#else + A->ld = matrix_dimension.N_rows; + A->matrix = + bml_noinit_allocate_memory(sizeof(REAL_T) * matrix_dimension.N_rows * + matrix_dimension.N_rows); +#ifdef MKL_GPU + int sizea = A->ld * A->ld; + int dnum = 0; /* NOTE(review): dnum is never read in this function */ + + REAL_T *A_matrix = (REAL_T *) A->matrix; + // allocate the matrix on the GPU; map(alloc:) copies no host data +#pragma omp target enter data map(alloc:A_matrix[0:sizea]) +#endif // end of MKL_GPU + +#endif + A->domain = + bml_default_domain(matrix_dimension.N_rows, matrix_dimension.N_rows, + distrib_mode); + A->domain2 = + bml_default_domain(matrix_dimension.N_rows, matrix_dimension.N_rows, + distrib_mode); + return A; +} + /** Allocate the zero matrix. * * Note that the matrix \f$ a \f$ will be newly allocated. 
If it is diff --git a/src/C-interface/dense/bml_copy_dense_typed.c b/src/C-interface/dense/bml_copy_dense_typed.c index e021f09f1..8130f7930 100644 --- a/src/C-interface/dense/bml_copy_dense_typed.c +++ b/src/C-interface/dense/bml_copy_dense_typed.c @@ -35,8 +35,8 @@ bml_matrix_dense_t *TYPED_FUNC( { bml_matrix_dimension_t matrix_dimension = { A->N, A->N, A->N }; bml_matrix_dense_t *B = - TYPED_FUNC(bml_zero_matrix_dense) (matrix_dimension, - A->distribution_mode); + TYPED_FUNC(bml_noinit_matrix_dense) (matrix_dimension, + A->distribution_mode); #ifdef BML_USE_MAGMA MAGMA(copymatrix) (A->N, A->N, A->matrix, A->ld, B->matrix, B->ld, bml_queue()); diff --git a/src/C-interface/dense/bml_transpose_dense_typed.c b/src/C-interface/dense/bml_transpose_dense_typed.c index 76b0b53a6..d3f99de47 100644 --- a/src/C-interface/dense/bml_transpose_dense_typed.c +++ b/src/C-interface/dense/bml_transpose_dense_typed.c @@ -37,7 +37,7 @@ bml_matrix_dense_t *TYPED_FUNC( bml_matrix_dimension_t matrix_dimension = { A->N, A->N, A->N }; bml_matrix_dense_t *B = - TYPED_FUNC(bml_zero_matrix_dense) (matrix_dimension, + TYPED_FUNC(bml_noinit_matrix_dense) (matrix_dimension, A->distribution_mode); REAL_T *A_matrix = A->matrix; REAL_T *B_matrix = B->matrix; diff --git a/src/C-interface/distributed2d/bml_allocate_distributed2d.c b/src/C-interface/distributed2d/bml_allocate_distributed2d.c index dde6f8212..04cd81859 100644 --- a/src/C-interface/distributed2d/bml_allocate_distributed2d.c +++ b/src/C-interface/distributed2d/bml_allocate_distributed2d.c @@ -105,6 +105,36 @@ bml_clear_distributed2d( bml_clear(A->matrix); } +/** Allocate an uninitialized distributed2d matrix. + * + * \ingroup allocate_group + * \param matrix_type The matrix type of the local block. + * \param matrix_precision The precision of the matrix. + * \param N Asserted > 0; passed to bml_setup_distributed2d. + * \param M Asserted > 0; stored in A->M, the local block gets M / bml_sqrtint(A->ntasks). + * \return The matrix. 
+ */ +bml_matrix_distributed2d_t * +bml_noinit_matrix_distributed2d( + bml_matrix_type_t matrix_type, + bml_matrix_precision_t matrix_precision, + int N, + int M) +{ + assert(N > 0); + assert(M > 0); + + bml_matrix_distributed2d_t *A = + bml_noinit_allocate_memory(sizeof(bml_matrix_distributed2d_t)); + bml_setup_distributed2d(N, A); + A->M = M; + A->matrix_precision = matrix_precision; + int m = M / bml_sqrtint(A->ntasks); + A->matrix = + bml_noinit_matrix(matrix_type, matrix_precision, A->n, m, sequential); + return A; +} + /** Allocate the zero matrix. * * Note that the matrix \f$ a \f$ will be newly allocated. If it is diff --git a/src/C-interface/distributed2d/bml_allocate_distributed2d.h b/src/C-interface/distributed2d/bml_allocate_distributed2d.h index 7f0ee7974..066a415eb 100644 --- a/src/C-interface/distributed2d/bml_allocate_distributed2d.h +++ b/src/C-interface/distributed2d/bml_allocate_distributed2d.h @@ -28,6 +28,12 @@ void bml_clear_distributed2d_single_complex( void bml_clear_distributed2d_double_complex( bml_matrix_distributed2d_t * A); +bml_matrix_distributed2d_t *bml_noinit_matrix_distributed2d( + bml_matrix_type_t matrix_type, + bml_matrix_precision_t matrix_precision, + int N, + int M); + bml_matrix_distributed2d_t *bml_zero_matrix_distributed2d( bml_matrix_type_t matrix_type, bml_matrix_precision_t matrix_precision, diff --git a/src/C-interface/distributed2d/bml_copy_distributed2d.c b/src/C-interface/distributed2d/bml_copy_distributed2d.c index f4ae01938..9a4ebb0c3 100644 --- a/src/C-interface/distributed2d/bml_copy_distributed2d.c +++ b/src/C-interface/distributed2d/bml_copy_distributed2d.c @@ -23,9 +23,9 @@ bml_copy_distributed2d_new( assert(A->M > 0); bml_matrix_distributed2d_t *B = - bml_zero_matrix_distributed2d(bml_get_type(A->matrix), - bml_get_precision(A->matrix), A->N, - A->M); + bml_noinit_matrix_distributed2d(bml_get_type(A->matrix), + bml_get_precision(A->matrix), A->N, + A->M); // copy local block bml_copy(A->matrix, B->matrix); 
diff --git a/src/C-interface/distributed2d/bml_scale_distributed2d.c b/src/C-interface/distributed2d/bml_scale_distributed2d.c index 307b4b136..5fb774757 100644 --- a/src/C-interface/distributed2d/bml_scale_distributed2d.c +++ b/src/C-interface/distributed2d/bml_scale_distributed2d.c @@ -23,9 +23,9 @@ bml_scale_distributed2d_new( assert(A->M > 0); bml_matrix_distributed2d_t *B = - bml_zero_matrix_distributed2d(bml_get_type(A->matrix), - bml_get_precision(A->matrix), A->N, - A->M); + bml_noinit_matrix_distributed2d(bml_get_type(A->matrix), + bml_get_precision(A->matrix), A->N, + A->M); bml_scale(scale_factor, A->matrix, B->matrix); diff --git a/src/C-interface/distributed2d/bml_threshold_distributed2d.c b/src/C-interface/distributed2d/bml_threshold_distributed2d.c index 7c48c8c11..68e6e45a8 100644 --- a/src/C-interface/distributed2d/bml_threshold_distributed2d.c +++ b/src/C-interface/distributed2d/bml_threshold_distributed2d.c @@ -23,9 +23,9 @@ bml_matrix_distributed2d_t assert(A->M > 0); bml_matrix_distributed2d_t *B = - bml_zero_matrix_distributed2d(bml_get_type(A->matrix), - bml_get_precision(A->matrix), A->N, - A->M); + bml_noinit_matrix_distributed2d(bml_get_type(A->matrix), + bml_get_precision(A->matrix), A->N, + A->M); // copy local block bml_copy(A->matrix, B->matrix);