pass reference not pointer of UA, overlap shared

Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
ginkgo-project · Aug 4, 2021 · e89a897 · e89a897
1 parent d7f61b4
commit e89a897
Show file tree

Hide file tree

Showing 8 changed files with 83 additions and 91 deletions.
diff --git a/common/solver/idr_kernels.hpp.inc b/common/solver/idr_kernels.hpp.inc
@@ -59,9 +59,10 @@ __global__
     const auto tidx = thread::get_thread_id_flat();
 
     __shared__ UninitializedArray<ValueType, block_size> reduction_helper_array;
-    ValueType *__restrict__ reduction_helper = reduction_helper_array;
-
-    __shared__ remove_complex<ValueType> reduction_helper_real[block_size];
+    // they are not be used in the same time.
+    ValueType *reduction_helper = reduction_helper_array;
+    auto reduction_helper_real =
+        reinterpret_cast<remove_complex<ValueType> *>(reduction_helper);
 
     for (size_type row = 0; row < num_rows; row++) {
         for (size_type i = 0; i < row; i++) {

diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp
@@ -129,13 +129,13 @@ template <std::uint32_t block_size, typename ValueType>
 void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
                       ValueType *__restrict__ block_sum,
                       sycl::nd_item<3> item_ct1,
-                      UninitializedArray<ValueType, block_size> *prefix_helper)
+                      UninitializedArray<ValueType, block_size> &prefix_helper)
 {
     const auto tidx = thread::get_thread_id_flat(item_ct1);
     const auto element_id = item_ct1.get_local_id(2);
 
     // do not need to access the last element when exclusive prefix sum
-    (*prefix_helper)[element_id] =
+    prefix_helper[element_id] =
         (tidx + 1 < num_elements) ? elements[tidx] : zero<ValueType>();
     auto this_block = group::this_thread_block(item_ct1);
     this_block.sync();
@@ -146,17 +146,17 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
         const auto ai = i * (2 * element_id + 1) - 1;
         const auto bi = i * (2 * element_id + 2) - 1;
         if (bi < block_size) {
-            (*prefix_helper)[bi] += (*prefix_helper)[ai];
+            prefix_helper[bi] += prefix_helper[ai];
         }
         this_block.sync();
     }
 
     if (element_id == 0) {
         // Store the total sum except the last block
         if (item_ct1.get_group(2) + 1 < item_ct1.get_group_range(2)) {
-            block_sum[item_ct1.get_group(2)] = (*prefix_helper)[block_size - 1];
+            block_sum[item_ct1.get_group(2)] = prefix_helper[block_size - 1];
         }
-        (*prefix_helper)[block_size - 1] = zero<ValueType>();
+        prefix_helper[block_size - 1] = zero<ValueType>();
     }
 
     this_block.sync();
@@ -167,14 +167,14 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
         const auto ai = i * (2 * element_id + 1) - 1;
         const auto bi = i * (2 * element_id + 2) - 1;
         if (bi < block_size) {
-            auto tmp = (*prefix_helper)[ai];
-            (*prefix_helper)[ai] = (*prefix_helper)[bi];
-            (*prefix_helper)[bi] += tmp;
+            auto tmp = prefix_helper[ai];
+            prefix_helper[ai] = prefix_helper[bi];
+            prefix_helper[bi] += tmp;
         }
         this_block.sync();
     }
     if (tidx < num_elements) {
-        elements[tidx] = (*prefix_helper)[element_id];
+        elements[tidx] = prefix_helper[element_id];
     }
 }
 
@@ -193,7 +193,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory,
                          [=](sycl::nd_item<3> item_ct1) {
                              start_prefix_sum<block_size>(
                                  num_elements, elements, block_sum, item_ct1,
-                                 prefix_helper_acc_ct1.get_pointer().get());
+                                 *prefix_helper_acc_ct1.get_pointer());
                          });
     });
 }

diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
@@ -205,14 +205,14 @@ template <std::uint32_t cfg, typename ValueType>
 void reduce_add_array(
     size_type size, const ValueType *__restrict__ source,
     ValueType *__restrict__ result, sycl::nd_item<3> item_ct1,
-    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *block_sum)
+    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &block_sum)
 {
     reduce_array<KCFG_1D::decode<1>(cfg)>(
-        size, source, static_cast<ValueType *>((*block_sum)), item_ct1,
+        size, source, static_cast<ValueType *>(block_sum), item_ct1,
         [](const ValueType &x, const ValueType &y) { return x + y; });
 
     if (item_ct1.get_local_id(2) == 0) {
-        result[item_ct1.get_group(2)] = (*block_sum)[0];
+        result[item_ct1.get_group(2)] = block_sum[0];
     }
 }
 
@@ -230,7 +230,7 @@ void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory,
         cgh.parallel_for(
             sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
                 reduce_add_array<cfg>(size, source, result, item_ct1,
-                                      block_sum_acc_ct1.get_pointer().get());
+                                      *block_sum_acc_ct1.get_pointer());
             });
     });
 }

diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -89,7 +89,7 @@ template <std::uint32_t cfg = KCFG_1D::encode(256, 16), typename OutType,
 void compute_partial_reduce(
     size_type num_rows, OutType *__restrict__ work, CallableGetValue get_value,
     CallableReduce reduce_op, sycl::nd_item<3> item_ct1,
-    UninitializedArray<OutType, KCFG_1D::decode<0>(cfg)> *tmp_work)
+    UninitializedArray<OutType, KCFG_1D::decode<0>(cfg)> &tmp_work)
 {
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
     constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
@@ -101,7 +101,7 @@ void compute_partial_reduce(
     const auto global_id =
         thread::get_thread_id<sg_size, warps_per_block>(item_ct1);
 
-    OutType *tmp_work_array = *tmp_work;
+    OutType *tmp_work_array = tmp_work;
     auto tmp = zero<OutType>();
     for (auto i = global_id; i < num_rows; i += wg_size * num_blocks) {
         tmp = reduce_op(tmp, get_value(i));
@@ -124,7 +124,7 @@ void finalize_reduce_computation(
     size_type size, const ValueType *work, ValueType *result,
     CallableReduce reduce_op, CallableFinalize finalize_op,
     sycl::nd_item<3> item_ct1,
-    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
+    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &tmp_work)
 {
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
     constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
@@ -135,7 +135,7 @@ void finalize_reduce_computation(
     for (auto i = local_id; i < size; i += wg_size) {
         tmp = reduce_op(tmp, work[i]);
     }
-    ValueType *tmp_work_array = *tmp_work;
+    ValueType *tmp_work_array = tmp_work;
     tmp_work_array[local_id] = tmp;
 
     ::gko::kernels::dpcpp::reduce<sg_size>(group::this_thread_block(item_ct1),
@@ -152,7 +152,7 @@ void compute_partial_dot(
     size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
     const ValueType *__restrict__ y, size_type stride_y,
     ValueType *__restrict__ work, sycl::nd_item<3> item_ct1,
-    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
+    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &tmp_work)
 {
     compute_partial_reduce<cfg>(
         num_rows, work,
@@ -181,7 +181,7 @@ void compute_partial_dot(dim3 grid, dim3 block, size_type dynamic_shared_memory,
             sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
                 compute_partial_dot<cfg>(num_rows, x, stride_x, y, stride_y,
                                          work, item_ct1,
-                                         tmp_work_acc_ct1.get_pointer().get());
+                                         *tmp_work_acc_ct1.get_pointer());
             });
     });
 }
@@ -197,7 +197,7 @@ void compute_partial_conj_dot(
     size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
     const ValueType *__restrict__ y, size_type stride_y,
     ValueType *__restrict__ work, sycl::nd_item<3> item_ct1,
-    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
+    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &tmp_work)
 {
     compute_partial_reduce<cfg>(
         num_rows, work,
@@ -225,9 +225,9 @@ void compute_partial_conj_dot(dim3 grid, dim3 block,
 
         cgh.parallel_for(
             sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
-                compute_partial_conj_dot<cfg>(
-                    num_rows, x, stride_x, y, stride_y, work, item_ct1,
-                    tmp_work_acc_ct1.get_pointer().get());
+                compute_partial_conj_dot<cfg>(num_rows, x, stride_x, y,
+                                              stride_y, work, item_ct1,
+                                              *tmp_work_acc_ct1.get_pointer());
             });
     });
 }
@@ -242,7 +242,7 @@ template <std::uint32_t cfg = KCFG_1D::encode(256, 16), typename ValueType>
 void finalize_sum_reduce_computation(
     size_type size, const ValueType *work, ValueType *result,
     sycl::nd_item<3> item_ct1,
-    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
+    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &tmp_work)
 {
     finalize_reduce_computation<cfg>(
         size, work, result,
@@ -267,7 +267,7 @@ void finalize_sum_reduce_computation(dim3 grid, dim3 block,
                          [=](sycl::nd_item<3> item_ct1) {
                              finalize_sum_reduce_computation<cfg>(
                                  size, work, result, item_ct1,
-                                 tmp_work_acc_ct1.get_pointer().get());
+                                 *tmp_work_acc_ct1.get_pointer());
                          });
     });
 }
@@ -283,7 +283,7 @@ void compute_partial_norm2(
     size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
     remove_complex<ValueType> *__restrict__ work, sycl::nd_item<3> item_ct1,
     UninitializedArray<remove_complex<ValueType>, KCFG_1D::decode<0>(cfg)>
-        *tmp_work)
+        &tmp_work)
 {
     using norm_type = remove_complex<ValueType>;
     compute_partial_reduce<cfg>(
@@ -306,12 +306,12 @@ void compute_partial_norm2(dim3 grid, dim3 block,
                        sycl::access::target::local>
             tmp_work_acc_ct1(cgh);
 
-        cgh.parallel_for(sycl_nd_range(grid, block),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             compute_partial_norm2<cfg>(
-                                 num_rows, x, stride_x, work, item_ct1,
-                                 tmp_work_acc_ct1.get_pointer().get());
-                         });
+        cgh.parallel_for(
+            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+                compute_partial_norm2<cfg>(num_rows, x, stride_x, work,
+                                           item_ct1,
+                                           *tmp_work_acc_ct1.get_pointer());
+            });
     });
 }
 
@@ -325,7 +325,7 @@ template <std::uint32_t cfg = KCFG_1D::encode(256, 16), typename ValueType>
 void finalize_sqrt_reduce_computation(
     size_type size, const ValueType *work, ValueType *result,
     sycl::nd_item<3> item_ct1,
-    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
+    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &tmp_work)
 {
     finalize_reduce_computation<cfg>(
         size, work, result,
@@ -351,7 +351,7 @@ void finalize_sqrt_reduce_computation(dim3 grid, dim3 block,
                          [=](sycl::nd_item<3> item_ct1) {
                              finalize_sqrt_reduce_computation<cfg>(
                                  size, work, result, item_ct1,
-                                 tmp_work_acc_ct1.get_pointer().get());
+                                 *tmp_work_acc_ct1.get_pointer());
                          });
     });
 }
@@ -677,21 +677,21 @@ void transpose(const size_type nrows, const size_type ncols,
                const ValueType *__restrict__ in, const size_type in_stride,
                ValueType *__restrict__ out, const size_type out_stride,
                Closure op, sycl::nd_item<3> item_ct1,
-               UninitializedArray<ValueType, sg_size *(sg_size + 1)> *space)
+               UninitializedArray<ValueType, sg_size *(sg_size + 1)> &space)
 {
     auto local_x = item_ct1.get_local_id(2);
     auto local_y = item_ct1.get_local_id(1);
     auto x = item_ct1.get_group(2) * sg_size + local_x;
     auto y = item_ct1.get_group(1) * sg_size + local_y;
     if (y < nrows && x < ncols) {
-        (*space)[local_y * (sg_size + 1) + local_x] = op(in[y * in_stride + x]);
+        space[local_y * (sg_size + 1) + local_x] = op(in[y * in_stride + x]);
     }
 
     item_ct1.barrier(sycl::access::fence_space::local_space);
     x = item_ct1.get_group(1) * sg_size + local_x;
     y = item_ct1.get_group(2) * sg_size + local_y;
     if (y < ncols && x < nrows) {
-        out[y * out_stride + x] = (*space)[local_x * (sg_size + 1) + local_y];
+        out[y * out_stride + x] = space[local_x * (sg_size + 1) + local_y];
     }
 }
 
@@ -701,7 +701,7 @@ void transpose(const size_type nrows, const size_type ncols,
                const ValueType *__restrict__ in, const size_type in_stride,
                ValueType *__restrict__ out, const size_type out_stride,
                sycl::nd_item<3> item_ct1,
-               UninitializedArray<ValueType, sg_size *(sg_size + 1)> *space)
+               UninitializedArray<ValueType, sg_size *(sg_size + 1)> &space)
 {
     transpose<sg_size>(
         nrows, ncols, in, in_stride, out, out_stride,
@@ -723,7 +723,7 @@ void transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory,
         cgh.parallel_for(
             sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
                 transpose<sg_size>(nrows, ncols, in, in_stride, out, out_stride,
-                                   item_ct1, space_acc_ct1.get_pointer().get());
+                                   item_ct1, *space_acc_ct1.get_pointer());
             });
     });
 }
@@ -739,7 +739,7 @@ void conj_transpose(
     const ValueType *__restrict__ in, const size_type in_stride,
     ValueType *__restrict__ out, const size_type out_stride,
     sycl::nd_item<3> item_ct1,
-    UninitializedArray<ValueType, sg_size *(sg_size + 1)> *space)
+    UninitializedArray<ValueType, sg_size *(sg_size + 1)> &space)
 {
     transpose<sg_size>(
         nrows, ncols, in, in_stride, out, out_stride,
@@ -763,7 +763,7 @@ void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory,
             sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
                 conj_transpose<sg_size>(nrows, ncols, in, in_stride, out,
                                         out_stride, item_ct1,
-                                        space_acc_ct1.get_pointer().get());
+                                        *space_acc_ct1.get_pointer());
             });
     });
 }

diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp
@@ -470,7 +470,7 @@ void multidot_kernel(
     size_type stride_next_krylov, const Accessor3d krylov_bases,
     ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
     const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1,
-    UninitializedArray<ValueType, dot_dim * dot_dim> *reduction_helper_array)
+    UninitializedArray<ValueType, dot_dim * dot_dim> &reduction_helper_array)
 {
     /*
      * In general in this kernel:
@@ -497,8 +497,7 @@ void multidot_kernel(
     const size_type k = item_ct1.get_group(0);
     // Used that way to get around dynamic initialization warning and
     // template error when using `reduction_helper_array` directly in `reduce`
-
-    ValueType *__restrict__ reduction_helper = (*reduction_helper_array);
+    ValueType *__restrict__ reduction_helper = reduction_helper_array;
 
     ValueType local_res = zero<ValueType>();
     if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) {
@@ -549,8 +548,7 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory,
                     num_rows, num_cols, next_krylov_basis, stride_next_krylov,
                     krylov_bases, hessenberg_iter, stride_hessenberg,
                     stop_status, item_ct1,
-                    (UninitializedArray<ValueType, dot_dim * dot_dim> *)
-                        reduction_helper_array_acc_ct1.get_pointer());
+                    *reduction_helper_array_acc_ct1.get_pointer());
             });
     });
 }
@@ -562,7 +560,7 @@ void singledot_kernel(
     size_type stride_next_krylov, const Accessor3d krylov_bases,
     ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
     const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1,
-    UninitializedArray<ValueType, block_size> *reduction_helper_array)
+    UninitializedArray<ValueType, block_size> &reduction_helper_array)
 {
     /*
      * In general in this kernel:
@@ -585,7 +583,7 @@ void singledot_kernel(
     // Used that way to get around dynamic initialization warning and
     // template error when using `reduction_helper_array` directly in `reduce`
 
-    ValueType *__restrict__ reduction_helper = (*reduction_helper_array);
+    ValueType *__restrict__ reduction_helper = reduction_helper_array;
 
     ValueType local_res = zero<ValueType>();
     if (!stop_status[col_idx].has_stopped()) {
@@ -630,8 +628,7 @@ void singledot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory,
                     num_rows, next_krylov_basis, stride_next_krylov,
                     krylov_bases, hessenberg_iter, stride_hessenberg,
                     stop_status, item_ct1,
-                    (UninitializedArray<ValueType, block_size> *)
-                        reduction_helper_array_acc_ct1.get_pointer());
+                    *reduction_helper_array_acc_ct1.get_pointer());
             });
     });
 }