ginkgo-project · yhmtsai · Aug 6, 2021 · Jul 19, 2021 · Jul 20, 2021 · Jul 28, 2021
diff --git a/common/solver/idr_kernels.hpp.inc b/common/solver/idr_kernels.hpp.inc
@@ -59,9 +59,10 @@ __global__
     const auto tidx = thread::get_thread_id_flat();
 
     __shared__ UninitializedArray<ValueType, block_size> reduction_helper_array;
-    ValueType *__restrict__ reduction_helper = reduction_helper_array;
-
-    __shared__ remove_complex<ValueType> reduction_helper_real[block_size];
+    // they are not be used in the same time.
+    ValueType *reduction_helper = reduction_helper_array;
+    auto reduction_helper_real =
+        reinterpret_cast<remove_complex<ValueType> *>(reduction_helper);
 
     for (size_type row = 0; row < num_rows; row++) {
         for (size_type i = 0; i < row; i++) {
@@ -70,9 +71,9 @@ __global__
                 dot += values[row * stride + j] * conj(values[i * stride + j]);
             }
 
-            reduction_helper[tidx] = dot;
-
+            // Ensure already finish reading this shared memory
             __syncthreads();
+            reduction_helper[tidx] = dot;
             reduce(
                 group::this_thread_block(), reduction_helper,
                 [](const ValueType &a, const ValueType &b) { return a + b; });
@@ -88,10 +89,9 @@ __global__
         for (size_type j = tidx; j < num_cols; j += block_size) {
             norm += squared_norm(values[row * stride + j]);
         }
-
-        reduction_helper_real[tidx] = norm;
-
+        // Ensure already finish reading this shared memory
         __syncthreads();
+        reduction_helper_real[tidx] = norm;
         reduce(group::this_thread_block(), reduction_helper_real,
                [](const remove_complex<ValueType> &a,
                   const remove_complex<ValueType> &b) { return a + b; });

diff --git a/cuda/test/solver/idr_kernels.cpp b/cuda/test/solver/idr_kernels.cpp
@@ -67,9 +67,6 @@ class Idr : public ::testing::Test {
         ref = gko::ReferenceExecutor::create();
         cuda = gko::CudaExecutor::create(0, ref);
 
-        mtx = gen_mtx(123, 123);
-        d_mtx = Mtx::create(cuda);
-        d_mtx->copy_from(mtx.get());
         cuda_idr_factory =
             Solver::build()
                 .with_deterministic(true)
@@ -100,11 +97,11 @@ class Idr : public ::testing::Test {
             std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
     }
 
-    void initialize_data()
+    void initialize_data(int size = 597, int input_nrhs = 17)
     {
-        int size = 597;
-        nrhs = 17;
+        nrhs = input_nrhs;
         int s = 4;
+        mtx = gen_mtx(size, size);
         x = gen_mtx(size, nrhs);
         b = gen_mtx(size, nrhs);
         r = gen_mtx(size, nrhs);
@@ -125,6 +122,7 @@ class Idr : public ::testing::Test {
             stop_status->get_data()[i].reset();
         }
 
+        d_mtx = Mtx::create(cuda);
         d_x = Mtx::create(cuda);
         d_b = Mtx::create(cuda);
         d_r = Mtx::create(cuda);
@@ -142,6 +140,7 @@ class Idr : public ::testing::Test {
         d_stop_status = std::unique_ptr<gko::Array<gko::stopping_status>>(
             new gko::Array<gko::stopping_status>(cuda));
 
+        d_mtx->copy_from(mtx.get());
         d_x->copy_from(x.get());
         d_b->copy_from(b.get());
         d_r->copy_from(r.get());
@@ -291,16 +290,9 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef)
 
 TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef)
 {
-    int m = 123;
-    int n = 1;
+    initialize_data(123, 1);
     auto ref_solver = ref_idr_factory->generate(mtx);
     auto cuda_solver = cuda_idr_factory->generate(d_mtx);
-    auto b = gen_mtx(m, n);
-    auto x = gen_mtx(m, n);
-    auto d_b = Mtx::create(cuda);
-    auto d_x = Mtx::create(cuda);
-    d_b->copy_from(b.get());
-    d_x->copy_from(x.get());
 
     ref_solver->apply(b.get(), x.get());
     cuda_solver->apply(d_b.get(), d_x.get());
@@ -312,8 +304,7 @@ TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef)
 
 TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)
 {
-    int m = 123;
-    int n = 1;
+    initialize_data(123, 1);
     cuda_idr_factory =
         Solver::build()
             .with_deterministic(true)
@@ -330,12 +321,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)
             .on(ref);
     auto ref_solver = ref_idr_factory->generate(mtx);
     auto cuda_solver = cuda_idr_factory->generate(d_mtx);
-    auto b = gen_mtx(m, n);
-    auto x = gen_mtx(m, n);
-    auto d_b = Mtx::create(cuda);
-    auto d_x = Mtx::create(cuda);
-    d_b->copy_from(b.get());
-    d_x->copy_from(x.get());
 
     ref_solver->apply(b.get(), x.get());
     cuda_solver->apply(d_b.get(), d_x.get());
@@ -347,16 +332,9 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)
 
 TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef)
 {
-    int m = 123;
-    int n = 16;
+    initialize_data(123, 16);
     auto cuda_solver = cuda_idr_factory->generate(d_mtx);
     auto ref_solver = ref_idr_factory->generate(mtx);
-    auto b = gen_mtx(m, n);
-    auto x = gen_mtx(m, n);
-    auto d_b = Mtx::create(cuda);
-    auto d_x = Mtx::create(cuda);
-    d_b->copy_from(b.get());
-    d_x->copy_from(x.get());
 
     ref_solver->apply(b.get(), x.get());
     cuda_solver->apply(d_b.get(), d_x.get());
@@ -368,8 +346,7 @@ TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef)
 
 TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef)
 {
-    int m = 123;
-    int n = 16;
+    initialize_data(123, 16);
     cuda_idr_factory =
         Solver::build()
             .with_deterministic(true)
@@ -386,12 +363,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef)
             .on(ref);
     auto cuda_solver = cuda_idr_factory->generate(d_mtx);
     auto ref_solver = ref_idr_factory->generate(mtx);
-    auto b = gen_mtx(m, n);
-    auto x = gen_mtx(m, n);
-    auto d_b = Mtx::create(cuda);
-    auto d_x = Mtx::create(cuda);
-    d_b->copy_from(b.get());
-    d_x->copy_from(x.get());
 
     ref_solver->apply(b.get(), x.get());
     cuda_solver->apply(d_b.get(), d_x.get());

diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
@@ -8,6 +8,8 @@ set(GINKGO_DPCPP_VERSION ${GINKGO_DPCPP_VERSION} PARENT_SCOPE)
 
 find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}")
 set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE)
+find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}")
-find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}")
+find_package(oneDPL CONFIG REQUIRED HINTS "$ENV{DPL_ROOT}")
-find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}")
+find_package(oneDPL CONFIG REQUIRED HINTS "$ENV{DPL_ROOT}")
+set(GINKGO_DPL_ROOT "${DPL_ROOT}" PARENT_SCOPE)
 
 add_library(ginkgo_dpcpp $<TARGET_OBJECTS:ginkgo_dpcpp_device> "")
 target_sources(ginkgo_dpcpp
@@ -75,7 +77,7 @@ else ()
     target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel)
 endif()
 target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device)
-target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP)
+target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL)
 if (GINKGO_DPCPP_SINGLE_MODE)
     target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1)
 endif()

diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp
@@ -129,13 +129,13 @@ template <std::uint32_t block_size, typename ValueType>
 void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
                       ValueType *__restrict__ block_sum,
                       sycl::nd_item<3> item_ct1,
-                      UninitializedArray<ValueType, block_size> *prefix_helper)
+                      UninitializedArray<ValueType, block_size> &prefix_helper)
 {
     const auto tidx = thread::get_thread_id_flat(item_ct1);
     const auto element_id = item_ct1.get_local_id(2);
 
     // do not need to access the last element when exclusive prefix sum
-    (*prefix_helper)[element_id] =
+    prefix_helper[element_id] =
         (tidx + 1 < num_elements) ? elements[tidx] : zero<ValueType>();
     auto this_block = group::this_thread_block(item_ct1);
     this_block.sync();
@@ -146,17 +146,17 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
         const auto ai = i * (2 * element_id + 1) - 1;
         const auto bi = i * (2 * element_id + 2) - 1;
         if (bi < block_size) {
-            (*prefix_helper)[bi] += (*prefix_helper)[ai];
+            prefix_helper[bi] += prefix_helper[ai];
         }
         this_block.sync();
     }
 
     if (element_id == 0) {
         // Store the total sum except the last block
         if (item_ct1.get_group(2) + 1 < item_ct1.get_group_range(2)) {
-            block_sum[item_ct1.get_group(2)] = (*prefix_helper)[block_size - 1];
+            block_sum[item_ct1.get_group(2)] = prefix_helper[block_size - 1];
         }
-        (*prefix_helper)[block_size - 1] = zero<ValueType>();
+        prefix_helper[block_size - 1] = zero<ValueType>();
     }
 
     this_block.sync();
@@ -167,14 +167,14 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
         const auto ai = i * (2 * element_id + 1) - 1;
         const auto bi = i * (2 * element_id + 2) - 1;
         if (bi < block_size) {
-            auto tmp = (*prefix_helper)[ai];
-            (*prefix_helper)[ai] = (*prefix_helper)[bi];
-            (*prefix_helper)[bi] += tmp;
+            auto tmp = prefix_helper[ai];
+            prefix_helper[ai] = prefix_helper[bi];
+            prefix_helper[bi] += tmp;
         }
         this_block.sync();
     }
     if (tidx < num_elements) {
-        elements[tidx] = (*prefix_helper)[element_id];
+        elements[tidx] = prefix_helper[element_id];
     }
 }
 
@@ -193,7 +193,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory,
                          [=](sycl::nd_item<3> item_ct1) {
                              start_prefix_sum<block_size>(
                                  num_elements, elements, block_sum, item_ct1,
-                                 prefix_helper_acc_ct1.get_pointer().get());
+                                 *prefix_helper_acc_ct1.get_pointer());
                          });
     });
 }

diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
@@ -205,14 +205,14 @@ template <std::uint32_t cfg, typename ValueType>
 void reduce_add_array(
     size_type size, const ValueType *__restrict__ source,
     ValueType *__restrict__ result, sycl::nd_item<3> item_ct1,
-    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *block_sum)
+    UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &block_sum)
 {
     reduce_array<KCFG_1D::decode<1>(cfg)>(
-        size, source, static_cast<ValueType *>((*block_sum)), item_ct1,
+        size, source, static_cast<ValueType *>(block_sum), item_ct1,
         [](const ValueType &x, const ValueType &y) { return x + y; });
 
     if (item_ct1.get_local_id(2) == 0) {
-        result[item_ct1.get_group(2)] = (*block_sum)[0];
+        result[item_ct1.get_group(2)] = block_sum[0];
     }
 }
 
@@ -230,7 +230,7 @@ void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory,
         cgh.parallel_for(
             sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
                 reduce_add_array<cfg>(size, source, result, item_ct1,
-                                      block_sum_acc_ct1.get_pointer().get());
+                                      *block_sum_acc_ct1.get_pointer());
             });
     });
 }