From 5aa44ec2b53b77210bd930e0daf0390ce60bc078 Mon Sep 17 00:00:00 2001 From: Mike Wall Date: Thu, 15 Jun 2023 15:29:32 -0400 Subject: [PATCH] Fix OMP offload build memory leaks o Fix leaks seen in running progress benchmarks - bml_add_ellpack arrays allocated but not freed - bml_multiply_ellpack arrays allocated but not freed o Fix similar leaks in other subroutines o Also increase efficiency of a target region in bml_prune_rocsparse_ellpack --- .../ellpack/bml_add_ellpack_typed.c | 37 +++++++++++++------ .../ellpack/bml_allocate_ellpack_typed.c | 2 +- .../bml_element_multiply_ellpack_typed.c | 19 ++++++---- .../ellpack/bml_multiply_ellpack_typed.c | 29 +++++++++++---- .../ellpack/bml_scale_ellpack_typed.c | 11 ++---- 5 files changed, 65 insertions(+), 33 deletions(-) diff --git a/src/C-interface/ellpack/bml_add_ellpack_typed.c b/src/C-interface/ellpack/bml_add_ellpack_typed.c index 933cd4544..576c9a861 100644 --- a/src/C-interface/ellpack/bml_add_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_add_ellpack_typed.c @@ -95,11 +95,12 @@ void TYPED_FUNC( all_jx = calloc(N * num_chunks, sizeof(int)); all_x = calloc(N * num_chunks, sizeof(REAL_T)); -#pragma omp target map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks]) - +#pragma omp target enter data map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks]) + #endif #if defined (USE_OMP_OFFLOAD) +#pragma omp target #if BML_OFFLOAD_CHUNKS #pragma omp teams distribute parallel for \ shared(rowMin, rowMax) \ @@ -115,7 +116,7 @@ void TYPED_FUNC( x = &all_x[chunk * N]; #else -#pragma omp target teams distribute parallel for \ +#pragma omp teams distribute parallel for \ shared(rowMin, rowMax) \ shared(A_index, A_value, A_nnz) \ shared(B_index, B_value, B_nnz) \ @@ -197,6 +198,10 @@ void TYPED_FUNC( } #if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS } +#pragma omp target exit data map(delete:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks]) + free(all_ix); + free(all_jx); + free(all_x); #endif #endif @@ -269,11 +274,12 @@ double TYPED_FUNC( all_x = calloc(N * num_chunks, sizeof(REAL_T)); all_y = calloc(N * num_chunks, sizeof(REAL_T)); -#pragma omp target map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks],all_y[0:N*num_chunks]) map(tofrom:trnorm) +#pragma omp target enter data map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks],all_y[0:N*num_chunks]) #endif #if defined (USE_OMP_OFFLOAD) +#pragma omp target map(tofrom:trnorm) #if BML_OFFLOAD_CHUNKS #pragma omp teams distribute parallel for \ shared(rowMin, rowMax) \ @@ -292,7 +298,7 @@ double TYPED_FUNC( #else -#pragma omp target teams distribute parallel for \ +#pragma omp teams distribute parallel for \ shared(rowMin, rowMax) \ shared(A_index, A_value, A_nnz) \ shared(B_index, B_value, B_nnz) \ @@ -385,6 +391,11 @@ double TYPED_FUNC( } #if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS } +#pragma omp target exit data map(delete:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks],all_y[0:N*num_chunks]) + free(all_ix); + free(all_jx); + free(all_x); + free(all_y); #endif return trnorm; @@ -426,18 +437,19 @@ void TYPED_FUNC( #if BML_OFFLOAD_CHUNKS int num_chunks = MIN(BML_OFFLOAD_NUM_CHUNKS, N); - int all_jx[N * num_chunks]; - REAL_T all_x[N * num_chunks]; + int *all_jx; + REAL_T *all_x; - memset(all_jx, 0, N * num_chunks * sizeof(int)); - memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T)); + all_jx = calloc(N * num_chunks, sizeof(int)); + all_x = calloc(N * num_chunks, sizeof(REAL_T)); #if defined (USE_OMP_OFFLOAD) -#pragma omp target map(to:all_jx[0:N*num_chunks],all_x[0:N*num_chunks]) +#pragma omp target enter data map(to:all_jx[0:N*num_chunks],all_x[0:N*num_chunks]) #endif #endif #if defined (USE_OMP_OFFLOAD) +#pragma omp target #if BML_OFFLOAD_CHUNKS #pragma omp teams distribute parallel for \ shared(N, A_M) \ @@ -451,7 +463,7 @@ void TYPED_FUNC( x = &all_x[chunk * N]; #else -#pragma omp target teams distribute parallel for \ +#pragma omp teams distribute parallel for \ shared(N, A_M) \ shared(A_index, A_value, A_nnz) \ firstprivate(jx, x) @@ -524,6 +536,9 @@ void TYPED_FUNC( } #if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS } +#pragma omp target exit data map(delete:all_jx[0:N*num_chunks],all_x[0:N*num_chunks]) + free(all_jx); + free(all_x); #endif } diff --git a/src/C-interface/ellpack/bml_allocate_ellpack_typed.c b/src/C-interface/ellpack/bml_allocate_ellpack_typed.c index 6dccc4214..fe3cc5486 100644 --- a/src/C-interface/ellpack/bml_allocate_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_allocate_ellpack_typed.c @@ -540,7 +540,7 @@ void TYPED_FUNC( REAL_T threshold = (REAL_T)threshold_in; -#pragma omp target map(from:nnz) +#pragma omp target map(to:N) map(from:nnz) { nnz = csrRowPtr[N]; } diff --git a/src/C-interface/ellpack/bml_element_multiply_ellpack_typed.c b/src/C-interface/ellpack/bml_element_multiply_ellpack_typed.c index 64f7b57ff..975d96a4f 100644 --- a/src/C-interface/ellpack/bml_element_multiply_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_element_multiply_ellpack_typed.c @@ -78,18 +78,19 @@ void TYPED_FUNC( #if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS int num_chunks = MIN(BML_OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1); - int all_ix[C_N * num_chunks], all_jx[C_N * num_chunks]; - REAL_T all_x[C_N * num_chunks]; + int *all_ix, *all_jx; + REAL_T *all_x; - memset(all_ix, 0, C_N * num_chunks * sizeof(int)); - memset(all_jx, 0, C_N * num_chunks * sizeof(int)); - memset(all_x, 0.0, C_N * num_chunks * sizeof(REAL_T)); + all_ix = calloc(C_N * num_chunks, sizeof(int)); + all_jx = calloc(C_N * num_chunks, sizeof(int)); + all_x = calloc(C_N * num_chunks, sizeof(REAL_T)); -#pragma omp target map(to:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks]) +#pragma omp target enter data map(to:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks]) #endif #if defined (USE_OMP_OFFLOAD) +#pragma omp target #if BML_OFFLOAD_CHUNKS #pragma omp teams distribute parallel for \ shared(A_N, A_M, A_nnz, A_index, A_value) \ @@ -106,7 +107,7 @@ void TYPED_FUNC( x = &all_x[chunk * C_N]; #else -#pragma omp target teams distribute parallel for \ +#pragma omp teams distribute parallel for \ shared(A_N, A_M, A_nnz, A_index, A_value) \ shared(A_localRowMin, A_localRowMax) \ shared(B_N, B_M, B_nnz, B_index, B_value) \ @@ -201,5 +202,9 @@ void TYPED_FUNC( } #if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS } +#pragma omp target exit data map(delete:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks]) + free(all_ix); + free(all_jx); + free(all_x); #endif } diff --git a/src/C-interface/ellpack/bml_multiply_ellpack_typed.c b/src/C-interface/ellpack/bml_multiply_ellpack_typed.c index 909aee032..4efab1d6b 100644 --- a/src/C-interface/ellpack/bml_multiply_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_multiply_ellpack_typed.c @@ -187,11 +187,12 @@ void *TYPED_FUNC( all_jx = calloc(X_N * num_chunks, sizeof(int)); all_x = calloc(X_N * num_chunks, sizeof(REAL_T)); -#pragma omp target map(to:all_ix[0:X_N*num_chunks],all_jx[0:X_N*num_chunks],all_x[0:X_N*num_chunks]) +#pragma omp target enter data map(to:all_ix[0:X_N*num_chunks],all_jx[0:X_N*num_chunks],all_x[0:X_N*num_chunks]) #endif #if defined (USE_OMP_OFFLOAD) +#pragma omp target #if BML_OFFLOAD_CHUNKS #pragma omp teams distribute parallel for \ shared(X_N, X_M, X_index, X_nnz, X_value) \ @@ -208,7 +209,7 @@ void *TYPED_FUNC( x = &all_x[chunk * X_N]; #else -#pragma omp target teams distribute parallel for \ +#pragma omp teams distribute parallel for \ shared(X_N, X_M, X_index, X_nnz, X_value) \ shared(X2_N, X2_M, X2_index, X2_nnz, X2_value) \ shared(rowMin, rowMax) \ @@ -319,6 +320,10 @@ void *TYPED_FUNC( #if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS } +#pragma omp target exit data map(delete:all_ix[0:X_N*num_chunks],all_jx[0:X_N*num_chunks],all_x[0:X_N*num_chunks]) + free(all_ix); + free(all_jx); + free(all_x); #endif #endif // endif cusparse or rocsparse @@ -407,11 +412,12 @@ void TYPED_FUNC( all_jx = calloc(C_N * num_chunks, sizeof(int)); all_x = calloc(C_N * num_chunks, sizeof(REAL_T)); -#pragma omp target map(to:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks]) +#pragma omp target enter data map(to:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks]) #endif #if defined (USE_OMP_OFFLOAD) +#pragma omp target #if BML_OFFLOAD_CHUNKS #pragma omp teams distribute parallel for \ shared(A_N, A_M, A_nnz, A_index, A_value) \ @@ -428,7 +434,7 @@ void TYPED_FUNC( x = &all_x[chunk * C_N]; #else -#pragma omp target teams distribute parallel for \ +#pragma omp teams distribute parallel for \ shared(A_N, A_M, A_nnz, A_index, A_value) \ shared(A_localRowMin, A_localRowMax) \ shared(B_N, B_M, B_nnz, B_index, B_value) \ @@ -520,6 +526,10 @@ void TYPED_FUNC( } #if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS } +#pragma omp target exit data map(delete:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks]) + free(all_ix); + free(all_jx); + free(all_x); #endif #endif // endif cusparse or rocsparse @@ -1157,7 +1167,8 @@ void TYPED_FUNC( rocsparse_spgemm_alg_default, rocsparse_spgemm_stage_buffer_size, &bufferSize1, NULL)); - // hipDeviceSynchronize(); + // hipDeviceSynchronize(); // Ensure that the previous call is finished + // Allocate the spgemm working buffer dBuffer1 = (char *) malloc(sizeof(char) * bufferSize1); // Allocate the same array on the device @@ -1174,7 +1185,8 @@ void TYPED_FUNC( rocsparse_spgemm_stage_nnz, &bufferSize1, dBuffer1)); } - // hipDeviceSynchronize(); + // hipDeviceSynchronize(); // Ensure that the previous call is finished + // Get nnz value returned by spgemm() int64_t C_num_rows, C_num_cols, C_nnz_tmp; BML_CHECK_ROCSPARSE(rocsparse_spmat_get_size @@ -1217,6 +1229,9 @@ void TYPED_FUNC( rocsparse_spgemm_stage_compute, &bufferSize1, dBuffer1)); } + // hipDeviceSynchronize(); // Ensure that the previous call is finished + + // Delete the temporary work array #pragma omp target exit data map(delete:dBuffer1[:bufferSize1]) // Place the resulting matrix in C #pragma omp target teams distribute parallel for @@ -1237,7 +1252,7 @@ void TYPED_FUNC( TYPED_FUNC(bml_prune_rocsparse_ellpack) (handle,C,threshold); // Free the temporary arrays used on the device and host -#pragma omp target exit data map(delete:csrRowPtrC_tmp[:C_num_rows+1],csrColIndC_tmp[:C_nnz_tmp],csrValC_tmp[:C_nnz_tmp],dBuffer1[:bufferSize1]) +#pragma omp target exit data map(delete:csrRowPtrC_tmp[:C_num_rows+1],csrColIndC_tmp[:C_nnz_tmp],csrValC_tmp[:C_nnz_tmp]) free(csrRowPtrC_tmp); free(csrColIndC_tmp); diff --git a/src/C-interface/ellpack/bml_scale_ellpack_typed.c b/src/C-interface/ellpack/bml_scale_ellpack_typed.c index 1ed75140a..9ac4c0ae4 100644 --- a/src/C-interface/ellpack/bml_scale_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_scale_ellpack_typed.c @@ -146,14 +146,11 @@ void TYPED_FUNC( int *A_nnz = A->nnz; int *A_index = A->index; REAL_T scale = *scale_factor; -#pragma omp target teams distribute parallel for collapse(2) - for (int i = 0; i < N; i++) + size_t MbyN = N * M; +#pragma omp target teams distribute parallel for map(to:MbyN,scale) + for (size_t i = 0; i < MbyN; i++) { - for (int j = 0; j < M; j++) - { - A_value[ROWMAJOR(i, j, M, N)] = - scale * A_value[ROWMAJOR(i, j, M, N)]; - } + A_value[i] = scale * A_value[i]; } #else // offload conditional