From b66b62454f8d5b59b27fd1f8e36aa0212b121ac8 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 19 Jul 2021 17:05:42 +0200 Subject: [PATCH 1/8] porting and formating --- dpcpp/solver/idr_kernels.dp.cpp | 804 +++++++++++++++++++++++++++++- dpcpp/test/solver/idr_kernels.cpp | 404 +++++++++++++++ 2 files changed, 1197 insertions(+), 11 deletions(-) create mode 100644 dpcpp/test/solver/idr_kernels.cpp diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 9bb956f3807..7618c8b70d1 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -33,15 +33,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/solver/idr_kernels.hpp" -#include +#include +#include +#include +#include + + +#include -#include #include -#include #include -#include -#include + + +#include "core/components/fill_array.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/onemkl_bindings.hpp" +#include "dpcpp/components/atomic.dp.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" namespace gko { @@ -55,11 +67,744 @@ namespace dpcpp { namespace idr { +constexpr int default_block_size = 512; +constexpr int default_dot_dim = 32; +constexpr int default_dot_size = default_dot_dim * default_dot_dim; + + +template +void initialize_m_kernel(size_type subspace_dim, size_type nrhs, + ValueType *__restrict__ m_values, size_type m_stride, + stopping_status *__restrict__ stop_status, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row = global_id / m_stride; + const auto col = global_id % m_stride; + + if (global_id < nrhs) { + stop_status[global_id].reset(); + } + + if (row < subspace_dim && col < nrhs * subspace_dim) { + m_values[row * m_stride + col] = + (row == col / nrhs) ? one() : zero(); + } +} + +template +void initialize_m_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type subspace_dim, + size_type nrhs, ValueType *m_values, + size_type m_stride, stopping_status *stop_status) +{ + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + initialize_m_kernel(subspace_dim, nrhs, m_values, m_stride, + stop_status, item_ct1); + }); + }); +} + + +template +void orthonormalize_subspace_vectors_kernel( + size_type num_rows, size_type num_cols, ValueType *__restrict__ values, + size_type stride, sycl::nd_item<3> item_ct1, + UninitializedArray *reduction_helper_array, + remove_complex *reduction_helper_real) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + + ValueType *__restrict__ reduction_helper = (*reduction_helper_array); + + + for (size_type row = 0; row < num_rows; row++) { + for (size_type i = 0; i < row; i++) { + auto dot = zero(); + for (size_type j = tidx; j < num_cols; j += block_size) { + /* + DPCT1007:5: Migration of this CUDA API is not supported by the + Intel(R) DPC++ Compatibility Tool. + */ + dot += values[row * stride + j] * conj(values[i * stride + j]); + } + + reduction_helper[tidx] = dot; + + /* + DPCT1065:3: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance, if there is no access to global memory. + */ + item_ct1.barrier(); + reduce( + group::this_thread_block(item_ct1), reduction_helper, + [](const ValueType &a, const ValueType &b) { return a + b; }); + /* + DPCT1065:4: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance, if there is no access to global memory. + */ + item_ct1.barrier(); + + dot = reduction_helper[0]; + for (size_type j = tidx; j < num_cols; j += block_size) { + values[row * stride + j] -= dot * values[i * stride + j]; + } + } + + auto norm = zero>(); + for (size_type j = tidx; j < num_cols; j += block_size) { + norm += squared_norm(values[row * stride + j]); + } + + reduction_helper_real[tidx] = norm; + + /* + DPCT1065:1: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance, if there is no access to global memory. + */ + item_ct1.barrier(); + reduce(group::this_thread_block(item_ct1), reduction_helper_real, + [](const remove_complex &a, + const remove_complex &b) { return a + b; }); + /* + DPCT1065:2: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance, if there is no access to global memory. + */ + item_ct1.barrier(); + + norm = sycl::sqrt((float)(reduction_helper_real[0])); + for (size_type j = tidx; j < num_cols; j += block_size) { + values[row * stride + j] /= norm; + } + } +} + +template +void orthonormalize_subspace_vectors_kernel( + dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, size_type num_cols, ValueType *values, size_type stride) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access_mode::read_write, + sycl::access::target::local> + reduction_helper_array_acc_ct1(cgh); + sycl::accessor, 1, + sycl::access_mode::read_write, + sycl::access::target::local> + reduction_helper_real_acc_ct1(sycl::range<1>(block_size), cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + orthonormalize_subspace_vectors_kernel( + num_rows, num_cols, values, stride, item_ct1, + (UninitializedArray *) + reduction_helper_array_acc_ct1.get_pointer(), + (remove_complex *) + reduction_helper_real_acc_ct1.get_pointer()); + }); + }); +} + + +template +void solve_lower_triangular_kernel( + size_type subspace_dim, size_type nrhs, + const ValueType *__restrict__ m_values, size_type m_stride, + const ValueType *__restrict__ f_values, size_type f_stride, + ValueType *__restrict__ c_values, size_type c_stride, + const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + + if (global_id >= nrhs) { + return; + } + + if (!stop_status[global_id].has_stopped()) { + for (size_type row = 0; row < subspace_dim; row++) { + auto temp = f_values[row * f_stride + global_id]; + for (size_type col = 0; col < row; col++) { + temp -= m_values[row * m_stride + col * nrhs + global_id] * + c_values[col * c_stride + global_id]; + } + c_values[row * c_stride + global_id] = + temp / m_values[row * m_stride + row * nrhs + global_id]; + } + } +} + +template +void solve_lower_triangular_kernel( + dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, + size_type subspace_dim, size_type nrhs, const ValueType *m_values, + size_type m_stride, const ValueType *f_values, size_type f_stride, + ValueType *c_values, size_type c_stride, const stopping_status *stop_status) +{ + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + solve_lower_triangular_kernel( + subspace_dim, nrhs, m_values, m_stride, f_values, f_stride, + c_values, c_stride, stop_status, item_ct1); + }); + }); +} + + +template +void step_1_kernel(size_type k, size_type num_rows, size_type subspace_dim, + size_type nrhs, + const ValueType *__restrict__ residual_values, + size_type residual_stride, + const ValueType *__restrict__ c_values, size_type c_stride, + const ValueType *__restrict__ g_values, size_type g_stride, + ValueType *__restrict__ v_values, size_type v_stride, + const stopping_status *__restrict__ stop_status, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row = global_id / nrhs; + const auto col = global_id % nrhs; + + if (row >= num_rows) { + return; + } + + if (!stop_status[col].has_stopped()) { + auto temp = residual_values[row * residual_stride + col]; + for (size_type j = k; j < subspace_dim; j++) { + temp -= c_values[j * c_stride + col] * + g_values[row * g_stride + j * nrhs + col]; + } + v_values[row * v_stride + col] = temp; + } +} + +template +void step_1_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type k, size_type num_rows, + size_type subspace_dim, size_type nrhs, + const ValueType *residual_values, size_type residual_stride, + const ValueType *c_values, size_type c_stride, + const ValueType *g_values, size_type g_stride, + ValueType *v_values, size_type v_stride, + const stopping_status *stop_status) +{ + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + step_1_kernel(k, num_rows, subspace_dim, nrhs, residual_values, + residual_stride, c_values, c_stride, g_values, + g_stride, v_values, v_stride, stop_status, + item_ct1); + }); + }); +} + + +template +void step_2_kernel(size_type k, size_type num_rows, size_type subspace_dim, + size_type nrhs, const ValueType *__restrict__ omega_values, + const ValueType *__restrict__ v_values, size_type v_stride, + const ValueType *__restrict__ c_values, size_type c_stride, + ValueType *__restrict__ u_values, size_type u_stride, + const stopping_status *__restrict__ stop_status, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row = global_id / nrhs; + const auto col = global_id % nrhs; + + if (row >= num_rows) { + return; + } + + if (!stop_status[col].has_stopped()) { + auto temp = omega_values[col] * v_values[row * v_stride + col]; + for (size_type j = k; j < subspace_dim; j++) { + temp += c_values[j * c_stride + col] * + u_values[row * u_stride + j * nrhs + col]; + } + u_values[row * u_stride + k * nrhs + col] = temp; + } +} + +template +void step_2_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type k, size_type num_rows, + size_type subspace_dim, size_type nrhs, + const ValueType *omega_values, const ValueType *v_values, + size_type v_stride, const ValueType *c_values, + size_type c_stride, ValueType *u_values, size_type u_stride, + const stopping_status *stop_status) +{ + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + step_2_kernel(k, num_rows, subspace_dim, nrhs, omega_values, + v_values, v_stride, c_values, c_stride, u_values, + u_stride, stop_status, item_ct1); + }); + }); +} + + +template +void multidot_kernel( + size_type num_rows, size_type nrhs, const ValueType *__restrict__ p_i, + const ValueType *__restrict__ g_k, size_type g_k_stride, + ValueType *__restrict__ alpha, + const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1, + UninitializedArray + *reduction_helper_array) +{ + const auto tidx = item_ct1.get_local_id(2); + const auto tidy = item_ct1.get_local_id(1); + const auto rhs = item_ct1.get_group(2) * default_dot_dim + tidx; + const auto num = ceildiv(num_rows, item_ct1.get_group_range(1)); + const auto start_row = item_ct1.get_group(1) * num; + const auto end_row = ((item_ct1.get_group(1) + 1) * num > num_rows) + ? num_rows + : (item_ct1.get_group(1) + 1) * num; + // Used that way to get around dynamic initialization warning and + // template error when using `reduction_helper_array` directly in `reduce` + + ValueType *__restrict__ reduction_helper = (*reduction_helper_array); + + ValueType local_res = zero(); + if (rhs < nrhs && !stop_status[rhs].has_stopped()) { + for (size_type i = start_row + tidy; i < end_row; + i += default_dot_dim) { + const auto g_idx = i * g_k_stride + rhs; + local_res += p_i[i] * g_k[g_idx]; + } + } + reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res; + /* + DPCT1065:10: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance, if there is no access to global memory. + */ + item_ct1.barrier(); + local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; + const auto tile_block = group::tiled_partition( + group::this_thread_block(item_ct1)); + const auto sum = + reduce(tile_block, local_res, + [](const ValueType &a, const ValueType &b) { return a + b; }); + const auto new_rhs = item_ct1.get_group(2) * default_dot_dim + tidy; + if (tidx == 0 && new_rhs < nrhs && !stop_status[new_rhs].has_stopped()) { + atomic_add(alpha + new_rhs, sum); + } +} + +template +void multidot_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type nrhs, + const ValueType *p_i, const ValueType *g_k, + size_type g_k_stride, ValueType *alpha, + const stopping_status *stop_status) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, + 0, sycl::access_mode::read_write, + sycl::access::target::local> + reduction_helper_array_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + multidot_kernel( + num_rows, nrhs, p_i, g_k, g_k_stride, alpha, stop_status, + item_ct1, + (UninitializedArray *) + reduction_helper_array_acc_ct1.get_pointer()); + }); + }); +} + + +template +void update_g_k_and_u_kernel( + size_type k, size_type i, size_type size, size_type nrhs, + const ValueType *__restrict__ alpha, const ValueType *__restrict__ m_values, + size_type m_stride, const ValueType *__restrict__ g_values, + size_type g_stride, ValueType *__restrict__ g_k_values, + size_type g_k_stride, ValueType *__restrict__ u_values, size_type u_stride, + const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + const auto row = tidx / g_k_stride; + const auto rhs = tidx % g_k_stride; + + if (row >= size || rhs >= nrhs) { + return; + } + + if (!stop_status[rhs].has_stopped()) { + const auto fact = alpha[rhs] / m_values[i * m_stride + i * nrhs + rhs]; + g_k_values[row * g_k_stride + rhs] -= + fact * g_values[row * g_stride + i * nrhs + rhs]; + u_values[row * u_stride + k * nrhs + rhs] -= + fact * u_values[row * u_stride + i * nrhs + rhs]; + } +} + +template +void update_g_k_and_u_kernel(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type k, size_type i, size_type size, + size_type nrhs, const ValueType *alpha, + const ValueType *m_values, size_type m_stride, + const ValueType *g_values, size_type g_stride, + ValueType *g_k_values, size_type g_k_stride, + ValueType *u_values, size_type u_stride, + const stopping_status *stop_status) +{ + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + update_g_k_and_u_kernel( + k, i, size, nrhs, alpha, m_values, m_stride, + g_values, g_stride, g_k_values, g_k_stride, + u_values, u_stride, stop_status, item_ct1); + }); + }); +} + + +template +void update_g_kernel(size_type k, size_type size, size_type nrhs, + const ValueType *__restrict__ g_k_values, + size_type g_k_stride, ValueType *__restrict__ g_values, + size_type g_stride, + const stopping_status *__restrict__ stop_status, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + const auto row = tidx / g_k_stride; + const auto rhs = tidx % nrhs; + + if (row >= size || rhs >= nrhs) { + return; + } + + if (!stop_status[rhs].has_stopped()) { + g_values[row * g_stride + k * nrhs + rhs] = + g_k_values[row * g_k_stride + rhs]; + } +} + +template +void update_g_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type k, size_type size, + size_type nrhs, const ValueType *g_k_values, + size_type g_k_stride, ValueType *g_values, + size_type g_stride, const stopping_status *stop_status) +{ + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + update_g_kernel(k, size, nrhs, g_k_values, + g_k_stride, g_values, g_stride, + stop_status, item_ct1); + }); + }); +} + + +template +void update_x_r_and_f_kernel( + size_type k, size_type size, size_type subspace_dim, size_type nrhs, + const ValueType *__restrict__ m_values, size_type m_stride, + const ValueType *__restrict__ g_values, size_type g_stride, + const ValueType *__restrict__ u_values, size_type u_stride, + ValueType *__restrict__ f_values, size_type f_stride, + ValueType *__restrict__ r_values, size_type r_stride, + ValueType *__restrict__ x_values, size_type x_stride, + const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row = global_id / x_stride; + const auto col = global_id % x_stride; + + if (row >= size || col >= nrhs) { + return; + } + + if (!stop_status[col].has_stopped()) { + const auto beta = f_values[k * f_stride + col] / + m_values[k * m_stride + k * nrhs + col]; + r_values[row * r_stride + col] -= + beta * g_values[row * g_stride + k * nrhs + col]; + x_values[row * x_stride + col] += + beta * u_values[row * u_stride + k * nrhs + col]; + + if (k < row && k + 1 < subspace_dim && row < subspace_dim) { + f_values[row * f_stride + col] -= + beta * m_values[row * m_stride + k * nrhs + col]; + } + } +} + +template +void update_x_r_and_f_kernel( + dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, + size_type k, size_type size, size_type subspace_dim, size_type nrhs, + const ValueType *m_values, size_type m_stride, const ValueType *g_values, + size_type g_stride, const ValueType *u_values, size_type u_stride, + ValueType *f_values, size_type f_stride, ValueType *r_values, + size_type r_stride, ValueType *x_values, size_type x_stride, + const stopping_status *stop_status) +{ + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + update_x_r_and_f_kernel( + k, size, subspace_dim, nrhs, m_values, m_stride, g_values, + g_stride, u_values, u_stride, f_values, f_stride, r_values, + r_stride, x_values, x_stride, stop_status, item_ct1); + }); + }); +} + + +template +void compute_omega_kernel( + size_type nrhs, const remove_complex kappa, + const ValueType *__restrict__ tht, + const remove_complex *__restrict__ residual_norm, + ValueType *__restrict__ omega, + const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + + if (global_id >= nrhs) { + return; + } + + if (!stop_status[global_id].has_stopped()) { + auto thr = omega[global_id]; + omega[global_id] /= tht[global_id]; + auto absrho = + std::abs(thr / (sycl::sqrt((float)(real(tht[global_id]))) * + residual_norm[global_id])); + + if (absrho < kappa) { + omega[global_id] *= kappa / absrho; + } + } +} + +template +void compute_omega_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type nrhs, + const remove_complex kappa, + const ValueType *tht, + const remove_complex *residual_norm, + ValueType *omega, const stopping_status *stop_status) +{ + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compute_omega_kernel(nrhs, kappa, tht, residual_norm, omega, + stop_status, item_ct1); + }); + }); +} + + +namespace { + + +template +void initialize_m(const size_type nrhs, matrix::Dense *m, + Array *stop_status) +{ + const auto subspace_dim = m->get_size()[0]; + const auto m_stride = m->get_stride(); + + const auto grid_dim = ceildiv(m_stride * subspace_dim, default_block_size); + initialize_m_kernel(grid_dim, default_block_size, 0, exec->get_queue(), + subspace_dim, nrhs, m->get_values(), m_stride, + stop_status->get_data()); +} + + +template +void initialize_subspace_vectors(matrix::Dense *subspace_vectors, + bool deterministic) +{ + if (deterministic) { + auto subspace_vectors_data = matrix_data( + subspace_vectors->get_size(), std::normal_distribution<>(0.0, 1.0), + std::ranlux48(15)); + subspace_vectors->read(subspace_vectors_data); + } else { + auto gen = + curand::rand_generator(time(NULL), CURAND_RNG_PSEUDO_DEFAULT); + curand::rand_vector( + gen, + subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), + 0.0, 1.0, subspace_vectors->get_values()); + } +} + + +template +void orthonormalize_subspace_vectors(matrix::Dense *subspace_vectors) +{ + orthonormalize_subspace_vectors_kernel( + 1, default_block_size, 0, exec->get_queue(), + subspace_vectors->get_size()[0], subspace_vectors->get_size()[1], + subspace_vectors->get_values(), subspace_vectors->get_stride()); +} + + +template +void solve_lower_triangular(const size_type nrhs, + const matrix::Dense *m, + const matrix::Dense *f, + matrix::Dense *c, + const Array *stop_status) +{ + const auto subspace_dim = m->get_size()[0]; + + const auto grid_dim = ceildiv(nrhs, default_block_size); + solve_lower_triangular_kernel( + grid_dim, default_block_size, 0, exec->get_queue(), subspace_dim, nrhs, + m->get_const_values(), m->get_stride(), f->get_const_values(), + f->get_stride(), c->get_values(), c->get_stride(), + stop_status->get_const_data()); +} + + +template +void update_g_and_u(std::shared_ptr exec, + const size_type nrhs, const size_type k, + const matrix::Dense *p, + const matrix::Dense *m, + matrix::Dense *alpha, + matrix::Dense *g, matrix::Dense *g_k, + matrix::Dense *u, + const Array *stop_status) +{ + const auto size = g->get_size()[0]; + const auto p_stride = p->get_stride(); + + const dim3 grid_dim(ceildiv(nrhs, default_dot_dim), + exec->get_num_multiprocessor() * 2); + const dim3 block_dim(default_dot_dim, default_dot_dim); + + for (size_type i = 0; i < k; i++) { + const auto p_i = p->get_const_values() + i * p_stride; + if (nrhs > 1 || is_complex()) { + components::fill_array(exec, alpha->get_values(), nrhs, + zero()); + multidot_kernel(grid_dim, block_dim, 0, exec->get_queue(), size, + nrhs, p_i, g_k->get_values(), g_k->get_stride(), + alpha->get_values(), stop_status->get_const_data()); + } else { + cublas::dot(exec->get_cublas_handle(), size, p_i, 1, + g_k->get_values(), g_k->get_stride(), + alpha->get_values()); + } + update_g_k_and_u_kernel( + ceildiv(size * g_k->get_stride(), default_block_size), + default_block_size, 0, exec->get_queue(), k, i, size, nrhs, + alpha->get_const_values(), m->get_const_values(), m->get_stride(), + g->get_const_values(), g->get_stride(), g_k->get_values(), + g_k->get_stride(), u->get_values(), u->get_stride(), + stop_status->get_const_data()); + } + update_g_kernel( + ceildiv(size * g_k->get_stride(), default_block_size), + default_block_size, 0, exec->get_queue(), k, size, nrhs, + g_k->get_const_values(), g_k->get_stride(), g->get_values(), + g->get_stride(), stop_status->get_const_data()); +} + + +template +void update_m(std::shared_ptr exec, const size_type nrhs, + const size_type k, const matrix::Dense *p, + const matrix::Dense *g_k, matrix::Dense *m, + const Array *stop_status) +{ + const auto size = g_k->get_size()[0]; + const auto subspace_dim = m->get_size()[0]; + const auto p_stride = p->get_stride(); + const auto m_stride = m->get_stride(); + + const dim3 grid_dim(ceildiv(nrhs, default_dot_dim), + exec->get_num_multiprocessor() * 2); + const dim3 block_dim(default_dot_dim, default_dot_dim); + + for (size_type i = k; i < subspace_dim; i++) { + const auto p_i = p->get_const_values() + i * p_stride; + auto m_i = m->get_values() + i * m_stride + k * nrhs; + if (nrhs > 1 || is_complex()) { + components::fill_array(exec, m_i, nrhs, zero()); + multidot_kernel(grid_dim, block_dim, 0, exec->get_queue(), size, + nrhs, p_i, g_k->get_const_values(), + g_k->get_stride(), m_i, + stop_status->get_const_data()); + } else { + cublas::dot(exec->get_cublas_handle(), size, p_i, 1, + g_k->get_const_values(), g_k->get_stride(), m_i); + } + } +} + + +template +void update_x_r_and_f(std::shared_ptr exec, + const size_type nrhs, const size_type k, + const matrix::Dense *m, + const matrix::Dense *g, + const matrix::Dense *u, + matrix::Dense *f, matrix::Dense *r, + matrix::Dense *x, + const Array *stop_status) +{ + const auto size = x->get_size()[0]; + const auto subspace_dim = m->get_size()[0]; + + const auto grid_dim = ceildiv(size * x->get_stride(), default_block_size); + update_x_r_and_f_kernel(grid_dim, default_block_size, 0, exec->get_queue(), + k, size, subspace_dim, nrhs, m->get_const_values(), + m->get_stride(), g->get_const_values(), + g->get_stride(), u->get_const_values(), + u->get_stride(), f->get_values(), f->get_stride(), + r->get_values(), r->get_stride(), x->get_values(), + x->get_stride(), stop_status->get_const_data()); + components::fill_array(exec, f->get_values() + k * f->get_stride(), nrhs, + zero()); +} + + +} // namespace + + template void initialize(std::shared_ptr exec, const size_type nrhs, matrix::Dense *m, matrix::Dense *subspace_vectors, bool deterministic, - Array *stop_status) GKO_NOT_IMPLEMENTED; + Array *stop_status) +{ + initialize_m(nrhs, m, stop_status); + initialize_subspace_vectors(subspace_vectors, deterministic); + orthonormalize_subspace_vectors(subspace_vectors); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL); @@ -71,7 +816,21 @@ void step_1(std::shared_ptr exec, const size_type nrhs, const matrix::Dense *residual, const matrix::Dense *g, matrix::Dense *c, matrix::Dense *v, - const Array *stop_status) GKO_NOT_IMPLEMENTED; + const Array *stop_status) +{ + solve_lower_triangular(nrhs, m, f, c, stop_status); + + const auto num_rows = v->get_size()[0]; + const auto subspace_dim = m->get_size()[0]; + + const auto grid_dim = ceildiv(nrhs * num_rows, default_block_size); + step_1_kernel(grid_dim, default_block_size, 0, exec->get_queue(), k, + num_rows, subspace_dim, nrhs, residual->get_const_values(), + residual->get_stride(), c->get_const_values(), + c->get_stride(), g->get_const_values(), g->get_stride(), + v->get_values(), v->get_stride(), + stop_status->get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL); @@ -81,7 +840,19 @@ void step_2(std::shared_ptr exec, const size_type nrhs, const size_type k, const matrix::Dense *omega, const matrix::Dense *preconditioned_vector, const matrix::Dense *c, matrix::Dense *u, - const Array *stop_status) GKO_NOT_IMPLEMENTED; + const Array *stop_status) +{ + const auto num_rows = preconditioned_vector->get_size()[0]; + const auto subspace_dim = u->get_size()[1] / nrhs; + + const auto grid_dim = ceildiv(nrhs * num_rows, default_block_size); + step_2_kernel(grid_dim, default_block_size, 0, exec->get_queue(), k, + num_rows, subspace_dim, nrhs, omega->get_const_values(), + preconditioned_vector->get_const_values(), + preconditioned_vector->get_stride(), c->get_const_values(), + c->get_stride(), u->get_values(), u->get_stride(), + stop_status->get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL); @@ -93,7 +864,12 @@ void step_3(std::shared_ptr exec, const size_type nrhs, matrix::Dense *u, matrix::Dense *m, matrix::Dense *f, matrix::Dense *alpha, matrix::Dense *residual, matrix::Dense *x, - const Array *stop_status) GKO_NOT_IMPLEMENTED; + const Array *stop_status) +{ + update_g_and_u(exec, nrhs, k, p, m, alpha, g, g_k, u, stop_status); + update_m(exec, nrhs, k, p, g_k, m, stop_status); + update_x_r_and_f(exec, nrhs, k, m, g, u, f, residual, x, stop_status); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL); @@ -103,8 +879,14 @@ void compute_omega( std::shared_ptr exec, const size_type nrhs, const remove_complex kappa, const matrix::Dense *tht, const matrix::Dense> *residual_norm, - matrix::Dense *omega, - const Array *stop_status) GKO_NOT_IMPLEMENTED; + matrix::Dense *omega, const Array *stop_status) +{ + const auto grid_dim = ceildiv(nrhs, config::warp_size); + compute_omega_kernel(grid_dim, config::warp_size, 0, exec->get_queue(), + nrhs, kappa, tht->get_const_values(), + residual_norm->get_const_values(), omega->get_values(), + stop_status->get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); diff --git a/dpcpp/test/solver/idr_kernels.cpp b/dpcpp/test/solver/idr_kernels.cpp new file mode 100644 index 00000000000..9316ec02e7d --- /dev/null +++ b/dpcpp/test/solver/idr_kernels.cpp @@ -0,0 +1,404 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include + + +#include "core/solver/idr_kernels.hpp" +#include "core/test/utils.hpp" + + +namespace { + + +class Idr : public ::testing::Test { +protected: + using Mtx = gko::matrix::Dense<>; + using Solver = gko::solver::Idr<>; + + Idr() : rand_engine(30) {} + + void SetUp() + { + ref = gko::ReferenceExecutor::create(); + dpcpp = gko::DpcppExecutor::create(0, ref); + + mtx = gen_mtx(123, 123); + d_mtx = Mtx::create(dpcpp); + d_mtx->copy_from(mtx.get()); + dpcpp_idr_factory = + Solver::build() + .with_deterministic(true) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(dpcpp)) + .on(dpcpp); + + ref_idr_factory = + Solver::build() + .with_deterministic(true) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(ref)) + .on(ref); + } + + void TearDown() + { + if (dpcpp != nullptr) { + ASSERT_NO_THROW(dpcpp->synchronize()); + } + } + + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); + } + + void initialize_data() + { + int size = 597; + nrhs = 17; + int s = 4; + x = gen_mtx(size, nrhs); + b = gen_mtx(size, nrhs); + r = gen_mtx(size, nrhs); + m = gen_mtx(s, nrhs * s); + f = gen_mtx(s, nrhs); + g = gen_mtx(size, nrhs * s); + u = gen_mtx(size, nrhs * s); + c = gen_mtx(s, nrhs); + v = gen_mtx(size, nrhs); + p = gen_mtx(s, size); + alpha = gen_mtx(1, nrhs); + omega = gen_mtx(1, nrhs); + tht = gen_mtx(1, nrhs); + residual_norm = gen_mtx(1, nrhs); + stop_status = std::unique_ptr>( + new gko::Array(ref, nrhs)); + for (size_t i = 0; i < nrhs; ++i) { + stop_status->get_data()[i].reset(); + } + + d_x = Mtx::create(dpcpp); + d_b = Mtx::create(dpcpp); + d_r = Mtx::create(dpcpp); + d_m = Mtx::create(dpcpp); + d_f = Mtx::create(dpcpp); + d_g = Mtx::create(dpcpp); + d_u = Mtx::create(dpcpp); + d_c = Mtx::create(dpcpp); + d_v = Mtx::create(dpcpp); + d_p = Mtx::create(dpcpp); + d_alpha = Mtx::create(dpcpp); + d_omega = Mtx::create(dpcpp); + d_tht = Mtx::create(dpcpp); + d_residual_norm = Mtx::create(dpcpp); + d_stop_status = std::unique_ptr>( + new gko::Array(dpcpp)); + + d_x->copy_from(x.get()); + d_b->copy_from(b.get()); + d_r->copy_from(r.get()); + d_m->copy_from(m.get()); + d_f->copy_from(f.get()); + d_g->copy_from(g.get()); + d_u->copy_from(u.get()); + d_c->copy_from(c.get()); + d_v->copy_from(v.get()); + d_p->copy_from(p.get()); + d_alpha->copy_from(alpha.get()); + d_omega->copy_from(omega.get()); + d_tht->copy_from(tht.get()); + d_residual_norm->copy_from(residual_norm.get()); + *d_stop_status = + *stop_status; // copy_from is not a public member function of Array + } + + std::shared_ptr ref; + std::shared_ptr dpcpp; + + std::ranlux48 rand_engine; + + std::shared_ptr mtx; + std::shared_ptr d_mtx; + std::unique_ptr dpcpp_idr_factory; + std::unique_ptr ref_idr_factory; + + gko::size_type nrhs; + + std::unique_ptr x; + std::unique_ptr b; + std::unique_ptr r; + std::unique_ptr m; + std::unique_ptr f; + std::unique_ptr g; + std::unique_ptr u; + std::unique_ptr c; + std::unique_ptr v; + std::unique_ptr p; + std::unique_ptr alpha; + std::unique_ptr omega; + std::unique_ptr tht; + std::unique_ptr residual_norm; + std::unique_ptr> stop_status; + + std::unique_ptr d_x; + std::unique_ptr d_b; + std::unique_ptr d_r; + std::unique_ptr d_m; + std::unique_ptr d_f; + std::unique_ptr d_g; + std::unique_ptr d_u; + std::unique_ptr d_c; + std::unique_ptr d_v; + std::unique_ptr d_p; + std::unique_ptr d_alpha; + std::unique_ptr d_omega; + std::unique_ptr d_tht; + std::unique_ptr d_residual_norm; + std::unique_ptr> d_stop_status; +}; + + +TEST_F(Idr, IdrInitializeIsEquivalentToRef) +{ + initialize_data(); + + gko::kernels::reference::idr::initialize(ref, nrhs, m.get(), p.get(), true, + stop_status.get()); + gko::kernels::dpcpp::idr::initialize(dpcpp, nrhs, d_m.get(), d_p.get(), + true, d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(m, d_m, 1e-14); + GKO_ASSERT_MTX_NEAR(p, d_p, 1e-14); +} + + +TEST_F(Idr, IdrStep1IsEquivalentToRef) +{ + initialize_data(); + + gko::size_type k = 2; + gko::kernels::reference::idr::step_1(ref, nrhs, k, m.get(), f.get(), + r.get(), g.get(), c.get(), v.get(), + stop_status.get()); + gko::kernels::dpcpp::idr::step_1(dpcpp, nrhs, k, d_m.get(), d_f.get(), + d_r.get(), d_g.get(), d_c.get(), d_v.get(), + d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(c, d_c, 1e-14); + GKO_ASSERT_MTX_NEAR(v, d_v, 1e-14); +} + + +TEST_F(Idr, IdrStep2IsEquivalentToRef) +{ + initialize_data(); + + gko::size_type k = 2; + gko::kernels::reference::idr::step_2(ref, nrhs, k, omega.get(), v.get(), + c.get(), u.get(), stop_status.get()); + gko::kernels::dpcpp::idr::step_2(dpcpp, nrhs, k, d_omega.get(), d_v.get(), + d_c.get(), d_u.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(u, d_u, 1e-14); +} + + +TEST_F(Idr, IdrStep3IsEquivalentToRef) +{ + initialize_data(); + + gko::size_type k = 2; + gko::kernels::reference::idr::step_3( + ref, nrhs, k, p.get(), g.get(), v.get(), u.get(), m.get(), f.get(), + alpha.get(), r.get(), x.get(), stop_status.get()); + gko::kernels::dpcpp::idr::step_3( + dpcpp, nrhs, k, d_p.get(), d_g.get(), d_v.get(), d_u.get(), d_m.get(), + d_f.get(), d_alpha.get(), d_r.get(), d_x.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(g, d_g, 1e-14); + GKO_ASSERT_MTX_NEAR(v, d_v, 1e-14); + GKO_ASSERT_MTX_NEAR(u, d_u, 1e-14); + GKO_ASSERT_MTX_NEAR(m, d_m, 1e-14); + GKO_ASSERT_MTX_NEAR(f, d_f, 1e-14); + GKO_ASSERT_MTX_NEAR(r, d_r, 1e-14); + GKO_ASSERT_MTX_NEAR(x, d_x, 1e-14); +} + + +TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef) +{ + initialize_data(); + + double kappa = 0.7; + gko::kernels::reference::idr::compute_omega(ref, nrhs, kappa, tht.get(), + residual_norm.get(), + omega.get(), stop_status.get()); + gko::kernels::dpcpp::idr::compute_omega(dpcpp, nrhs, kappa, d_tht.get(), + d_residual_norm.get(), + d_omega.get(), d_stop_status.get()); + + GKO_ASSERT_MTX_NEAR(omega, d_omega, 1e-14); +} + + +TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef) +{ + int m = 123; + int n = 1; + auto ref_solver = ref_idr_factory->generate(mtx); + auto dpcpp_solver = dpcpp_idr_factory->generate(d_mtx); + auto b = gen_mtx(m, n); + auto x = gen_mtx(m, n); + auto d_b = Mtx::create(dpcpp); + auto d_x = Mtx::create(dpcpp); + d_b->copy_from(b.get()); + d_x->copy_from(x.get()); + + ref_solver->apply(b.get(), x.get()); + dpcpp_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_b, b, 1e-13); + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-13); +} + + +TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) +{ + int m = 123; + int n = 1; + dpcpp_idr_factory = + Solver::build() + .with_deterministic(true) + .with_complex_subspace(true) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(dpcpp)) + .on(dpcpp); + ref_idr_factory = + Solver::build() + .with_deterministic(true) + .with_complex_subspace(true) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(ref)) + .on(ref); + auto ref_solver = ref_idr_factory->generate(mtx); + auto dpcpp_solver = dpcpp_idr_factory->generate(d_mtx); + auto b = gen_mtx(m, n); + auto x = gen_mtx(m, n); + auto d_b = Mtx::create(dpcpp); + auto d_x = Mtx::create(dpcpp); + d_b->copy_from(b.get()); + d_x->copy_from(x.get()); + + ref_solver->apply(b.get(), x.get()); + dpcpp_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_b, b, 1e-13); + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-13); +} + + +TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef) +{ + int m = 123; + int n = 16; + auto dpcpp_solver = dpcpp_idr_factory->generate(d_mtx); + auto ref_solver = ref_idr_factory->generate(mtx); + auto b = gen_mtx(m, n); + auto x = gen_mtx(m, n); + auto d_b = Mtx::create(dpcpp); + auto d_x = Mtx::create(dpcpp); + d_b->copy_from(b.get()); + d_x->copy_from(x.get()); + + ref_solver->apply(b.get(), x.get()); + dpcpp_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_b, b, 1e-12); + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); +} + + +TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef) +{ + int m = 123; + int n = 16; + dpcpp_idr_factory = + Solver::build() + .with_deterministic(true) + .with_complex_subspace(true) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(dpcpp)) + .on(dpcpp); + ref_idr_factory = + Solver::build() + .with_deterministic(true) + .with_complex_subspace(true) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(1u).on(ref)) + .on(ref); + auto dpcpp_solver = dpcpp_idr_factory->generate(d_mtx); + auto ref_solver = ref_idr_factory->generate(mtx); + auto b = gen_mtx(m, n); + auto x = gen_mtx(m, n); + auto d_b = Mtx::create(dpcpp); + auto d_x = Mtx::create(dpcpp); + d_b->copy_from(b.get()); + d_x->copy_from(x.get()); + + ref_solver->apply(b.get(), x.get()); + dpcpp_solver->apply(d_b.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_b, b, 1e-13); + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-13); +} + + +} // namespace From e1970dda5e655da86a726ffb65e5c0c869180e2b Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 20 Jul 2021 08:19:23 +0200 Subject: [PATCH 2/8] compile but idr still has issue on initial --- dpcpp/CMakeLists.txt | 4 +- dpcpp/solver/idr_kernels.dp.cpp | 92 +++++++++++++++++++------------ dpcpp/test/solver/CMakeLists.txt | 1 + dpcpp/test/solver/idr_kernels.cpp | 54 ++++++++++-------- 4 files changed, 92 insertions(+), 59 deletions(-) diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index e2d476164e8..401be197820 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -8,6 +8,8 @@ set(GINKGO_DPCPP_VERSION ${GINKGO_DPCPP_VERSION} PARENT_SCOPE) find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}") set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE) +find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}") +set(GINKGO_DPL_ROOT "${DPL_ROOT}" PARENT_SCOPE) add_library(ginkgo_dpcpp $ "") target_sources(ginkgo_dpcpp @@ -75,7 +77,7 @@ else () target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel) endif() target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) -target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP) +target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 7618c8b70d1..2acfdc953c1 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -34,12 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include #include #include +#include #include @@ -67,8 +66,8 @@ namespace dpcpp { namespace idr { -constexpr int default_block_size = 512; -constexpr int default_dot_dim = 32; +constexpr int default_block_size = 256; +constexpr int default_dot_dim = 16; constexpr int default_dot_size = default_dot_dim * default_dot_dim; @@ -140,7 +139,7 @@ void orthonormalize_subspace_vectors_kernel( better performance, if there is no access to global memory. */ item_ct1.barrier(); - reduce( + ::gko::kernels::dpcpp::reduce( group::this_thread_block(item_ct1), reduction_helper, [](const ValueType &a, const ValueType &b) { return a + b; }); /* @@ -169,9 +168,10 @@ void orthonormalize_subspace_vectors_kernel( better performance, if there is no access to global memory. */ item_ct1.barrier(); - reduce(group::this_thread_block(item_ct1), reduction_helper_real, - [](const remove_complex &a, - const remove_complex &b) { return a + b; }); + ::gko::kernels::dpcpp::reduce( + group::this_thread_block(item_ct1), reduction_helper_real, + [](const remove_complex &a, + const remove_complex &b) { return a + b; }); /* DPCT1065:2: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for @@ -179,7 +179,7 @@ void orthonormalize_subspace_vectors_kernel( */ item_ct1.barrier(); - norm = sycl::sqrt((float)(reduction_helper_real[0])); + norm = std::sqrt(reduction_helper_real[0]); for (size_type j = tidx; j < num_cols; j += block_size) { values[row * stride + j] /= norm; } @@ -397,9 +397,9 @@ void multidot_kernel( local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; const auto tile_block = group::tiled_partition( group::this_thread_block(item_ct1)); - const auto sum = - reduce(tile_block, local_res, - [](const ValueType &a, const ValueType &b) { return a + b; }); + const auto sum = ::gko::kernels::dpcpp::reduce( + tile_block, local_res, + [](const ValueType &a, const ValueType &b) { return a + b; }); const auto new_rhs = item_ct1.get_group(2) * default_dot_dim + tidy; if (tidx == 0 && new_rhs < nrhs && !stop_status[new_rhs].has_stopped()) { atomic_add(alpha + new_rhs, sum); @@ -595,9 +595,8 @@ void compute_omega_kernel( if (!stop_status[global_id].has_stopped()) { auto thr = omega[global_id]; omega[global_id] /= tht[global_id]; - auto absrho = - std::abs(thr / (sycl::sqrt((float)(real(tht[global_id]))) * - residual_norm[global_id])); + auto absrho = std::abs(thr / (std::sqrt((float)(real(tht[global_id]))) * + residual_norm[global_id])); if (absrho < kappa) { omega[global_id] *= kappa / absrho; @@ -627,7 +626,8 @@ namespace { template -void initialize_m(const size_type nrhs, matrix::Dense *m, +void initialize_m(std::shared_ptr exec, + const size_type nrhs, matrix::Dense *m, Array *stop_status) { const auto subspace_dim = m->get_size()[0]; @@ -641,27 +641,47 @@ void initialize_m(const size_type nrhs, matrix::Dense *m, template -void initialize_subspace_vectors(matrix::Dense *subspace_vectors, +void initialize_subspace_vectors(std::shared_ptr exec, + matrix::Dense *subspace_vectors, bool deterministic) { if (deterministic) { auto subspace_vectors_data = matrix_data( - subspace_vectors->get_size(), std::normal_distribution<>(0.0, 1.0), + subspace_vectors->get_size(), + std::normal_distribution>(0.0, 1.0), std::ranlux48(15)); subspace_vectors->read(subspace_vectors_data); } else { - auto gen = - curand::rand_generator(time(NULL), CURAND_RNG_PSEUDO_DEFAULT); - curand::rand_vector( - gen, - subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), - 0.0, 1.0, subspace_vectors->get_values()); + auto size = subspace_vectors->get_size(); + auto stride = subspace_vectors->get_stride(); + auto values = subspace_vectors->get_values(); + exec->get_queue()->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<1>(size[0] * size[1]), [=](sycl::item<1> idx) { + std::uint64_t offset = idx.get_linear_id(); + + // Create minstd_rand engine + oneapi::dpl::minstd_rand engine(132, offset); + + // Create uniform_real_distribution distribution + oneapi::dpl::uniform_real_distribution< + remove_complex> + distr; + + // Generate random number + auto res = distr(engine); + + // Store results to x_acc + values[idx / size[1] * stride + idx % size[1]] = res; + }); + }); } } template -void orthonormalize_subspace_vectors(matrix::Dense *subspace_vectors) +void orthonormalize_subspace_vectors(std::shared_ptr exec, + matrix::Dense *subspace_vectors) { orthonormalize_subspace_vectors_kernel( 1, default_block_size, 0, exec->get_queue(), @@ -671,7 +691,8 @@ void orthonormalize_subspace_vectors(matrix::Dense *subspace_vectors) template -void solve_lower_triangular(const size_type nrhs, +void solve_lower_triangular(std::shared_ptr exec, + const size_type nrhs, const matrix::Dense *m, const matrix::Dense *f, matrix::Dense *c, @@ -702,7 +723,7 @@ void update_g_and_u(std::shared_ptr exec, const auto p_stride = p->get_stride(); const dim3 grid_dim(ceildiv(nrhs, default_dot_dim), - exec->get_num_multiprocessor() * 2); + exec->get_num_computing_units() * 2); const dim3 block_dim(default_dot_dim, default_dot_dim); for (size_type i = 0; i < k; i++) { @@ -714,9 +735,8 @@ void update_g_and_u(std::shared_ptr exec, nrhs, p_i, g_k->get_values(), g_k->get_stride(), alpha->get_values(), stop_status->get_const_data()); } else { - cublas::dot(exec->get_cublas_handle(), size, p_i, 1, - g_k->get_values(), g_k->get_stride(), - alpha->get_values()); + onemkl::dot(*exec->get_queue(), size, p_i, 1, g_k->get_values(), + g_k->get_stride(), alpha->get_values()); } update_g_k_and_u_kernel( ceildiv(size * g_k->get_stride(), default_block_size), @@ -746,7 +766,7 @@ void update_m(std::shared_ptr exec, const size_type nrhs, const auto m_stride = m->get_stride(); const dim3 grid_dim(ceildiv(nrhs, default_dot_dim), - exec->get_num_multiprocessor() * 2); + exec->get_num_computing_units() * 2); const dim3 block_dim(default_dot_dim, default_dot_dim); for (size_type i = k; i < subspace_dim; i++) { @@ -759,7 +779,7 @@ void update_m(std::shared_ptr exec, const size_type nrhs, g_k->get_stride(), m_i, stop_status->get_const_data()); } else { - cublas::dot(exec->get_cublas_handle(), size, p_i, 1, + onemkl::dot(*exec->get_queue(), size, p_i, 1, g_k->get_const_values(), g_k->get_stride(), m_i); } } @@ -801,9 +821,9 @@ void initialize(std::shared_ptr exec, const size_type nrhs, matrix::Dense *subspace_vectors, bool deterministic, Array *stop_status) { - initialize_m(nrhs, m, stop_status); - initialize_subspace_vectors(subspace_vectors, deterministic); - orthonormalize_subspace_vectors(subspace_vectors); + initialize_m(exec, nrhs, m, stop_status); + initialize_subspace_vectors(exec, subspace_vectors, deterministic); + orthonormalize_subspace_vectors(exec, subspace_vectors); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL); @@ -818,7 +838,7 @@ void step_1(std::shared_ptr exec, const size_type nrhs, matrix::Dense *v, const Array *stop_status) { - solve_lower_triangular(nrhs, m, f, c, stop_status); + solve_lower_triangular(exec, nrhs, m, f, c, stop_status); const auto num_rows = v->get_size()[0]; const auto subspace_dim = m->get_size()[0]; diff --git a/dpcpp/test/solver/CMakeLists.txt b/dpcpp/test/solver/CMakeLists.txt index 44046002504..b4f0e4059ca 100644 --- a/dpcpp/test/solver/CMakeLists.txt +++ b/dpcpp/test/solver/CMakeLists.txt @@ -1,2 +1,3 @@ ginkgo_create_test(gmres_kernels) ginkgo_create_test(cb_gmres_kernels) +ginkgo_create_test(idr_kernels) diff --git a/dpcpp/test/solver/idr_kernels.cpp b/dpcpp/test/solver/idr_kernels.cpp index 9316ec02e7d..e7202ea5a27 100644 --- a/dpcpp/test/solver/idr_kernels.cpp +++ b/dpcpp/test/solver/idr_kernels.cpp @@ -55,10 +55,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace { +// use another alias to avoid conflict name in the Idr +template +using rr = typename gko::test::reduction_factor; + class Idr : public ::testing::Test { protected: - using Mtx = gko::matrix::Dense<>; - using Solver = gko::solver::Idr<>; +#if GINKGO_DPCPP_SINGLE_MODE + using value_type = float; +#else + using value_type = double; +#endif + using Mtx = gko::matrix::Dense; + using Solver = gko::solver::Idr; Idr() : rand_engine(30) {} @@ -97,7 +106,8 @@ class Idr : public ::testing::Test { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution<>(num_cols, num_cols), - std::normal_distribution<>(0.0, 1.0), rand_engine, ref); + std::normal_distribution>(0.0, 1.0), + rand_engine, ref); } void initialize_data() @@ -215,8 +225,8 @@ TEST_F(Idr, IdrInitializeIsEquivalentToRef) gko::kernels::dpcpp::idr::initialize(dpcpp, nrhs, d_m.get(), d_p.get(), true, d_stop_status.get()); - GKO_ASSERT_MTX_NEAR(m, d_m, 1e-14); - GKO_ASSERT_MTX_NEAR(p, d_p, 1e-14); + GKO_ASSERT_MTX_NEAR(m, d_m, rr::value); + GKO_ASSERT_MTX_NEAR(p, d_p, rr::value); } @@ -232,8 +242,8 @@ TEST_F(Idr, IdrStep1IsEquivalentToRef) d_r.get(), d_g.get(), d_c.get(), d_v.get(), d_stop_status.get()); - GKO_ASSERT_MTX_NEAR(c, d_c, 1e-14); - GKO_ASSERT_MTX_NEAR(v, d_v, 1e-14); + GKO_ASSERT_MTX_NEAR(c, d_c, rr::value); + GKO_ASSERT_MTX_NEAR(v, d_v, rr::value); } @@ -247,7 +257,7 @@ TEST_F(Idr, IdrStep2IsEquivalentToRef) gko::kernels::dpcpp::idr::step_2(dpcpp, nrhs, k, d_omega.get(), d_v.get(), d_c.get(), d_u.get(), d_stop_status.get()); - GKO_ASSERT_MTX_NEAR(u, d_u, 1e-14); + GKO_ASSERT_MTX_NEAR(u, d_u, rr::value); } @@ -263,13 +273,13 @@ TEST_F(Idr, IdrStep3IsEquivalentToRef) dpcpp, nrhs, k, d_p.get(), d_g.get(), d_v.get(), d_u.get(), d_m.get(), d_f.get(), d_alpha.get(), d_r.get(), d_x.get(), d_stop_status.get()); - GKO_ASSERT_MTX_NEAR(g, d_g, 1e-14); - GKO_ASSERT_MTX_NEAR(v, d_v, 1e-14); - GKO_ASSERT_MTX_NEAR(u, d_u, 1e-14); - GKO_ASSERT_MTX_NEAR(m, d_m, 1e-14); - GKO_ASSERT_MTX_NEAR(f, d_f, 1e-14); - GKO_ASSERT_MTX_NEAR(r, d_r, 1e-14); - GKO_ASSERT_MTX_NEAR(x, d_x, 1e-14); + GKO_ASSERT_MTX_NEAR(g, d_g, rr::value); + GKO_ASSERT_MTX_NEAR(v, d_v, rr::value); + GKO_ASSERT_MTX_NEAR(u, d_u, rr::value); + GKO_ASSERT_MTX_NEAR(m, d_m, rr::value); + GKO_ASSERT_MTX_NEAR(f, d_f, rr::value); + GKO_ASSERT_MTX_NEAR(r, d_r, rr::value); + GKO_ASSERT_MTX_NEAR(x, d_x, rr::value); } @@ -285,7 +295,7 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef) d_residual_norm.get(), d_omega.get(), d_stop_status.get()); - GKO_ASSERT_MTX_NEAR(omega, d_omega, 1e-14); + GKO_ASSERT_MTX_NEAR(omega, d_omega, rr::value); } @@ -305,8 +315,8 @@ TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef) ref_solver->apply(b.get(), x.get()); dpcpp_solver->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_b, b, 1e-13); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-13); + GKO_ASSERT_MTX_NEAR(d_b, b, rr::value * 10); + GKO_ASSERT_MTX_NEAR(d_x, x, rr::value * 10); } @@ -340,8 +350,8 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) ref_solver->apply(b.get(), x.get()); dpcpp_solver->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_b, b, 1e-13); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-13); + GKO_ASSERT_MTX_NEAR(d_b, b, rr::value * 10); + GKO_ASSERT_MTX_NEAR(d_x, x, rr::value * 10); } @@ -396,8 +406,8 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef) ref_solver->apply(b.get(), x.get()); dpcpp_solver->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_b, b, 1e-13); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-13); + GKO_ASSERT_MTX_NEAR(d_b, b, rr::value * 10); + GKO_ASSERT_MTX_NEAR(d_x, x, rr::value * 10); } From 084398af3d2c0691e52c8ed2ed56f9ffe9e258e8 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 28 Jul 2021 22:18:34 +0200 Subject: [PATCH 3/8] add weird barrier in idr and fix precision --- dpcpp/solver/idr_kernels.dp.cpp | 7 ++++--- dpcpp/test/solver/idr_kernels.cpp | 14 +++++++------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 2acfdc953c1..dfd8ffd9af4 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -130,7 +130,8 @@ void orthonormalize_subspace_vectors_kernel( */ dot += values[row * stride + j] * conj(values[i * stride + j]); } - + // TODO: check with intel why we need this here. + item_ct1.barrier(); reduction_helper[tidx] = dot; /* @@ -595,8 +596,8 @@ void compute_omega_kernel( if (!stop_status[global_id].has_stopped()) { auto thr = omega[global_id]; omega[global_id] /= tht[global_id]; - auto absrho = std::abs(thr / (std::sqrt((float)(real(tht[global_id]))) * - residual_norm[global_id])); + auto absrho = std::abs( + thr / (std::sqrt(real(tht[global_id])) * residual_norm[global_id])); if (absrho < kappa) { omega[global_id] *= kappa / absrho; diff --git a/dpcpp/test/solver/idr_kernels.cpp b/dpcpp/test/solver/idr_kernels.cpp index e7202ea5a27..8fb08303ed8 100644 --- a/dpcpp/test/solver/idr_kernels.cpp +++ b/dpcpp/test/solver/idr_kernels.cpp @@ -273,11 +273,11 @@ TEST_F(Idr, IdrStep3IsEquivalentToRef) dpcpp, nrhs, k, d_p.get(), d_g.get(), d_v.get(), d_u.get(), d_m.get(), d_f.get(), d_alpha.get(), d_r.get(), d_x.get(), d_stop_status.get()); - GKO_ASSERT_MTX_NEAR(g, d_g, rr::value); + GKO_ASSERT_MTX_NEAR(g, d_g, 2 * rr::value); GKO_ASSERT_MTX_NEAR(v, d_v, rr::value); GKO_ASSERT_MTX_NEAR(u, d_u, rr::value); GKO_ASSERT_MTX_NEAR(m, d_m, rr::value); - GKO_ASSERT_MTX_NEAR(f, d_f, rr::value); + GKO_ASSERT_MTX_NEAR(f, d_f, 2 * rr::value); GKO_ASSERT_MTX_NEAR(r, d_r, rr::value); GKO_ASSERT_MTX_NEAR(x, d_x, rr::value); } @@ -287,7 +287,7 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef) { initialize_data(); - double kappa = 0.7; + value_type kappa = 0.7; gko::kernels::reference::idr::compute_omega(ref, nrhs, kappa, tht.get(), residual_norm.get(), omega.get(), stop_status.get()); @@ -350,8 +350,8 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) ref_solver->apply(b.get(), x.get()); dpcpp_solver->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_b, b, rr::value * 10); - GKO_ASSERT_MTX_NEAR(d_x, x, rr::value * 10); + GKO_ASSERT_MTX_NEAR(d_b, b, rr::value * 100); + GKO_ASSERT_MTX_NEAR(d_x, x, rr::value * 100); } @@ -371,8 +371,8 @@ TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef) ref_solver->apply(b.get(), x.get()); dpcpp_solver->apply(d_b.get(), d_x.get()); - GKO_ASSERT_MTX_NEAR(d_b, b, 1e-12); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12); + GKO_ASSERT_MTX_NEAR(d_b, b, rr::value * 500); + GKO_ASSERT_MTX_NEAR(d_x, x, rr::value * 500); } From b1ddb40b907faf49b845f8d279efef2f3698198e Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 29 Jul 2021 15:09:05 +0200 Subject: [PATCH 4/8] oneDPL for complex generation --- dpcpp/solver/idr_kernels.dp.cpp | 36 ++++++++++++++------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index dfd8ffd9af4..b9034842ceb 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -653,28 +653,22 @@ void initialize_subspace_vectors(std::shared_ptr exec, std::ranlux48(15)); subspace_vectors->read(subspace_vectors_data); } else { - auto size = subspace_vectors->get_size(); - auto stride = subspace_vectors->get_stride(); - auto values = subspace_vectors->get_values(); + auto seed = time(NULL); + auto work = reinterpret_cast *>( + subspace_vectors->get_values()); + auto n = + subspace_vectors->get_size()[0] * subspace_vectors->get_stride(); + n = is_complex() ? 2 * n : n; exec->get_queue()->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::range<1>(size[0] * size[1]), [=](sycl::item<1> idx) { - std::uint64_t offset = idx.get_linear_id(); - - // Create minstd_rand engine - oneapi::dpl::minstd_rand engine(132, offset); - - // Create uniform_real_distribution distribution - oneapi::dpl::uniform_real_distribution< - remove_complex> - distr; - - // Generate random number - auto res = distr(engine); - - // Store results to x_acc - values[idx / size[1] * stride + idx % size[1]] = res; - }); + cgh.parallel_for(sycl::range<1>(n), [=](sycl::item<1> idx) { + std::uint64_t offset = idx.get_linear_id(); + oneapi::dpl::minstd_rand engine(seed, offset); + oneapi::dpl::normal_distribution> + distr(0, 1); + auto res = distr(engine); + + work[idx] = res; + }); }); } } From d9f2b985946aa50635ffbdf6d5264b8cf9f1839f Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 3 Aug 2021 12:01:43 +0200 Subject: [PATCH 5/8] relax the error bound of hessenberg --- dpcpp/test/solver/gmres_kernels.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpcpp/test/solver/gmres_kernels.cpp b/dpcpp/test/solver/gmres_kernels.cpp index 0cefc2e2d3a..72f7ba0ae06 100644 --- a/dpcpp/test/solver/gmres_kernels.cpp +++ b/dpcpp/test/solver/gmres_kernels.cpp @@ -282,7 +282,7 @@ TEST_F(Gmres, DpcppGmresStep1OnSingleRHSIsEquivalentToRef) GKO_ASSERT_MTX_NEAR(d_residual_norm_collection, residual_norm_collection, r::value); GKO_ASSERT_MTX_NEAR(d_hessenberg_iter, hessenberg_iter, - r::value); + 2 * r::value); GKO_ASSERT_MTX_NEAR(d_krylov_bases, krylov_bases, r::value); GKO_ASSERT_ARRAY_EQ(*d_final_iter_nums, *final_iter_nums); } From bfbf040de1b73140989f17671527f48475ea7e8d Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 3 Aug 2021 20:41:33 +0200 Subject: [PATCH 6/8] rm dpct warning, use ref of uninitialized_array, reduce test code dup Co-authored-by: Terry Cojean Co-authored-by: Tobias Ribizel --- cuda/test/solver/idr_kernels.cpp | 47 +++++------------------ dpcpp/solver/idr_kernels.dp.cpp | 63 ++++++++----------------------- dpcpp/test/solver/idr_kernels.cpp | 47 +++++------------------ hip/test/solver/idr_kernels.cpp | 47 +++++------------------ omp/test/solver/idr_kernels.cpp | 47 +++++------------------ 5 files changed, 51 insertions(+), 200 deletions(-) diff --git a/cuda/test/solver/idr_kernels.cpp b/cuda/test/solver/idr_kernels.cpp index 9aa925187a4..cc075f9191d 100644 --- a/cuda/test/solver/idr_kernels.cpp +++ b/cuda/test/solver/idr_kernels.cpp @@ -67,9 +67,6 @@ class Idr : public ::testing::Test { ref = gko::ReferenceExecutor::create(); cuda = gko::CudaExecutor::create(0, ref); - mtx = gen_mtx(123, 123); - d_mtx = Mtx::create(cuda); - d_mtx->copy_from(mtx.get()); cuda_idr_factory = Solver::build() .with_deterministic(true) @@ -100,11 +97,11 @@ class Idr : public ::testing::Test { std::normal_distribution<>(0.0, 1.0), rand_engine, ref); } - void initialize_data() + void initialize_data(int size = 597, int input_nrhs = 17) { - int size = 597; - nrhs = 17; + nrhs = input_nrhs; int s = 4; + mtx = gen_mtx(size, size); x = gen_mtx(size, nrhs); b = gen_mtx(size, nrhs); r = gen_mtx(size, nrhs); @@ -125,6 +122,7 @@ class Idr : public ::testing::Test { stop_status->get_data()[i].reset(); } + d_mtx = Mtx::create(cuda); d_x = Mtx::create(cuda); d_b = Mtx::create(cuda); d_r = Mtx::create(cuda); @@ -142,6 +140,7 @@ class Idr : public ::testing::Test { d_stop_status = std::unique_ptr>( new gko::Array(cuda)); + d_mtx->copy_from(mtx.get()); d_x->copy_from(x.get()); d_b->copy_from(b.get()); d_r->copy_from(r.get()); @@ -291,16 +290,9 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef) TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef) { - int m = 123; - int n = 1; + initialize_data(123, 1); auto ref_solver = ref_idr_factory->generate(mtx); auto cuda_solver = cuda_idr_factory->generate(d_mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(cuda); - auto d_x = Mtx::create(cuda); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); cuda_solver->apply(d_b.get(), d_x.get()); @@ -312,8 +304,7 @@ TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef) TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) { - int m = 123; - int n = 1; + initialize_data(123, 1); cuda_idr_factory = Solver::build() .with_deterministic(true) @@ -330,12 +321,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) .on(ref); auto ref_solver = ref_idr_factory->generate(mtx); auto cuda_solver = cuda_idr_factory->generate(d_mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(cuda); - auto d_x = Mtx::create(cuda); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); cuda_solver->apply(d_b.get(), d_x.get()); @@ -347,16 +332,9 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef) { - int m = 123; - int n = 16; + initialize_data(123, 16); auto cuda_solver = cuda_idr_factory->generate(d_mtx); auto ref_solver = ref_idr_factory->generate(mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(cuda); - auto d_x = Mtx::create(cuda); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); cuda_solver->apply(d_b.get(), d_x.get()); @@ -368,8 +346,7 @@ TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef) TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef) { - int m = 123; - int n = 16; + initialize_data(123, 16); cuda_idr_factory = Solver::build() .with_deterministic(true) @@ -386,12 +363,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef) .on(ref); auto cuda_solver = cuda_idr_factory->generate(d_mtx); auto ref_solver = ref_idr_factory->generate(mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(cuda); - auto d_x = Mtx::create(cuda); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); cuda_solver->apply(d_b.get(), d_x.get()); diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index b9034842ceb..05d63a840ff 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -111,44 +111,31 @@ template void orthonormalize_subspace_vectors_kernel( size_type num_rows, size_type num_cols, ValueType *__restrict__ values, size_type stride, sycl::nd_item<3> item_ct1, - UninitializedArray *reduction_helper_array, + UninitializedArray &reduction_helper_array, remove_complex *reduction_helper_real) { const auto tidx = thread::get_thread_id_flat(item_ct1); - - ValueType *__restrict__ reduction_helper = (*reduction_helper_array); - + ValueType *__restrict__ reduction_helper = reduction_helper_array; for (size_type row = 0; row < num_rows; row++) { for (size_type i = 0; i < row; i++) { auto dot = zero(); + // TODO: check with intel why we need this here. + // Is it from we use updated the value even if it is on the same + // thread? + item_ct1.barrier(); for (size_type j = tidx; j < num_cols; j += block_size) { - /* - DPCT1007:5: Migration of this CUDA API is not supported by the - Intel(R) DPC++ Compatibility Tool. - */ dot += values[row * stride + j] * conj(values[i * stride + j]); } - // TODO: check with intel why we need this here. - item_ct1.barrier(); + reduction_helper[tidx] = dot; - /* - DPCT1065:3: Consider replacing sycl::nd_item::barrier() with - sycl::nd_item::barrier(sycl::access::fence_space::local_space) for - better performance, if there is no access to global memory. - */ - item_ct1.barrier(); + item_ct1.barrier(sycl::access::fence_space::local_space); ::gko::kernels::dpcpp::reduce( group::this_thread_block(item_ct1), reduction_helper, [](const ValueType &a, const ValueType &b) { return a + b; }); - /* - DPCT1065:4: Consider replacing sycl::nd_item::barrier() with - sycl::nd_item::barrier(sycl::access::fence_space::local_space) for - better performance, if there is no access to global memory. - */ - item_ct1.barrier(); + item_ct1.barrier(sycl::access::fence_space::local_space); dot = reduction_helper[0]; for (size_type j = tidx; j < num_cols; j += block_size) { @@ -163,22 +150,12 @@ void orthonormalize_subspace_vectors_kernel( reduction_helper_real[tidx] = norm; - /* - DPCT1065:1: Consider replacing sycl::nd_item::barrier() with - sycl::nd_item::barrier(sycl::access::fence_space::local_space) for - better performance, if there is no access to global memory. - */ - item_ct1.barrier(); + item_ct1.barrier(sycl::access::fence_space::local_space); ::gko::kernels::dpcpp::reduce( group::this_thread_block(item_ct1), reduction_helper_real, [](const remove_complex &a, const remove_complex &b) { return a + b; }); - /* - DPCT1065:2: Consider replacing sycl::nd_item::barrier() with - sycl::nd_item::barrier(sycl::access::fence_space::local_space) for - better performance, if there is no access to global memory. - */ - item_ct1.barrier(); + item_ct1.barrier(sycl::access::fence_space::local_space); norm = std::sqrt(reduction_helper_real[0]); for (size_type j = tidx; j < num_cols; j += block_size) { @@ -206,10 +183,8 @@ void orthonormalize_subspace_vectors_kernel( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { orthonormalize_subspace_vectors_kernel( num_rows, num_cols, values, stride, item_ct1, - (UninitializedArray *) - reduction_helper_array_acc_ct1.get_pointer(), - (remove_complex *) - reduction_helper_real_acc_ct1.get_pointer()); + *reduction_helper_array_acc_ct1.get_pointer(), + reduction_helper_real_acc_ct1.get_pointer().get()); }); }); } @@ -377,7 +352,6 @@ void multidot_kernel( : (item_ct1.get_group(1) + 1) * num; // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - ValueType *__restrict__ reduction_helper = (*reduction_helper_array); ValueType local_res = zero(); @@ -389,12 +363,7 @@ void multidot_kernel( } } reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res; - /* - DPCT1065:10: Consider replacing sycl::nd_item::barrier() with - sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better - performance, if there is no access to global memory. - */ - item_ct1.barrier(); + item_ct1.barrier(sycl::access::fence_space::local_space); local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; const auto tile_block = group::tiled_partition( group::this_thread_block(item_ct1)); @@ -426,9 +395,7 @@ void multidot_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, multidot_kernel( num_rows, nrhs, p_i, g_k, g_k_stride, alpha, stop_status, item_ct1, - (UninitializedArray *) - reduction_helper_array_acc_ct1.get_pointer()); + reduction_helper_array_acc_ct1.get_pointer().get()); }); }); } diff --git a/dpcpp/test/solver/idr_kernels.cpp b/dpcpp/test/solver/idr_kernels.cpp index 8fb08303ed8..a9a86b941e6 100644 --- a/dpcpp/test/solver/idr_kernels.cpp +++ b/dpcpp/test/solver/idr_kernels.cpp @@ -76,9 +76,6 @@ class Idr : public ::testing::Test { ref = gko::ReferenceExecutor::create(); dpcpp = gko::DpcppExecutor::create(0, ref); - mtx = gen_mtx(123, 123); - d_mtx = Mtx::create(dpcpp); - d_mtx->copy_from(mtx.get()); dpcpp_idr_factory = Solver::build() .with_deterministic(true) @@ -110,11 +107,11 @@ class Idr : public ::testing::Test { rand_engine, ref); } - void initialize_data() + void initialize_data(int size = 597, int input_nrhs = 17) { - int size = 597; - nrhs = 17; + nrhs = input_nrhs; int s = 4; + mtx = gen_mtx(size, size); x = gen_mtx(size, nrhs); b = gen_mtx(size, nrhs); r = gen_mtx(size, nrhs); @@ -135,6 +132,7 @@ class Idr : public ::testing::Test { stop_status->get_data()[i].reset(); } + d_mtx = Mtx::create(dpcpp); d_x = Mtx::create(dpcpp); d_b = Mtx::create(dpcpp); d_r = Mtx::create(dpcpp); @@ -152,6 +150,7 @@ class Idr : public ::testing::Test { d_stop_status = std::unique_ptr>( new gko::Array(dpcpp)); + d_mtx->copy_from(mtx.get()); d_x->copy_from(x.get()); d_b->copy_from(b.get()); d_r->copy_from(r.get()); @@ -301,16 +300,9 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef) TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef) { - int m = 123; - int n = 1; + initialize_data(123, 1); auto ref_solver = ref_idr_factory->generate(mtx); auto dpcpp_solver = dpcpp_idr_factory->generate(d_mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(dpcpp); - auto d_x = Mtx::create(dpcpp); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); dpcpp_solver->apply(d_b.get(), d_x.get()); @@ -322,8 +314,7 @@ TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef) TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) { - int m = 123; - int n = 1; + initialize_data(123, 1); dpcpp_idr_factory = Solver::build() .with_deterministic(true) @@ -340,12 +331,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) .on(ref); auto ref_solver = ref_idr_factory->generate(mtx); auto dpcpp_solver = dpcpp_idr_factory->generate(d_mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(dpcpp); - auto d_x = Mtx::create(dpcpp); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); dpcpp_solver->apply(d_b.get(), d_x.get()); @@ -357,16 +342,9 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef) { - int m = 123; - int n = 16; + initialize_data(123, 16); auto dpcpp_solver = dpcpp_idr_factory->generate(d_mtx); auto ref_solver = ref_idr_factory->generate(mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(dpcpp); - auto d_x = Mtx::create(dpcpp); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); dpcpp_solver->apply(d_b.get(), d_x.get()); @@ -378,8 +356,7 @@ TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef) TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef) { - int m = 123; - int n = 16; + initialize_data(123, 6); dpcpp_idr_factory = Solver::build() .with_deterministic(true) @@ -396,12 +373,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef) .on(ref); auto dpcpp_solver = dpcpp_idr_factory->generate(d_mtx); auto ref_solver = ref_idr_factory->generate(mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(dpcpp); - auto d_x = Mtx::create(dpcpp); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); dpcpp_solver->apply(d_b.get(), d_x.get()); diff --git a/hip/test/solver/idr_kernels.cpp b/hip/test/solver/idr_kernels.cpp index de7f5a14125..4f9cebeefa0 100644 --- a/hip/test/solver/idr_kernels.cpp +++ b/hip/test/solver/idr_kernels.cpp @@ -67,9 +67,6 @@ class Idr : public ::testing::Test { ref = gko::ReferenceExecutor::create(); hip = gko::HipExecutor::create(0, ref); - mtx = gen_mtx(123, 123); - d_mtx = Mtx::create(hip); - d_mtx->copy_from(mtx.get()); hip_idr_factory = Solver::build() .with_deterministic(true) @@ -100,11 +97,11 @@ class Idr : public ::testing::Test { std::normal_distribution<>(0.0, 1.0), rand_engine, ref); } - void initialize_data() + void initialize_data(int size = 597, int input_nrhs = 17) { - int size = 597; - nrhs = 17; + nrhs = input_nrhs; int s = 4; + mtx = gen_mtx(size, size); x = gen_mtx(size, nrhs); b = gen_mtx(size, nrhs); r = gen_mtx(size, nrhs); @@ -125,6 +122,7 @@ class Idr : public ::testing::Test { stop_status->get_data()[i].reset(); } + d_mtx = Mtx::create(hip); d_x = Mtx::create(hip); d_b = Mtx::create(hip); d_r = Mtx::create(hip); @@ -142,6 +140,7 @@ class Idr : public ::testing::Test { d_stop_status = std::unique_ptr>( new gko::Array(hip)); + d_mtx->copy_from(mtx.get()); d_x->copy_from(x.get()); d_b->copy_from(b.get()); d_r->copy_from(r.get()); @@ -291,16 +290,9 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef) TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef) { - int m = 123; - int n = 1; + initialize_data(123, 1); auto ref_solver = ref_idr_factory->generate(mtx); auto hip_solver = hip_idr_factory->generate(d_mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(hip); - auto d_x = Mtx::create(hip); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); hip_solver->apply(d_b.get(), d_x.get()); @@ -312,8 +304,7 @@ TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef) TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) { - int m = 123; - int n = 1; + initialize_data(123, 1); hip_idr_factory = Solver::build() .with_deterministic(true) @@ -330,12 +321,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) .on(ref); auto ref_solver = ref_idr_factory->generate(mtx); auto hip_solver = hip_idr_factory->generate(d_mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(hip); - auto d_x = Mtx::create(hip); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); hip_solver->apply(d_b.get(), d_x.get()); @@ -347,16 +332,9 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef) { - int m = 123; - int n = 16; + initialize_data(123, 16); auto hip_solver = hip_idr_factory->generate(d_mtx); auto ref_solver = ref_idr_factory->generate(mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(hip); - auto d_x = Mtx::create(hip); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); hip_solver->apply(d_b.get(), d_x.get()); @@ -368,8 +346,7 @@ TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef) TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef) { - int m = 123; - int n = 16; + initialize_data(123, 16); hip_idr_factory = Solver::build() .with_deterministic(true) @@ -386,12 +363,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef) .on(ref); auto hip_solver = hip_idr_factory->generate(d_mtx); auto ref_solver = ref_idr_factory->generate(mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(hip); - auto d_x = Mtx::create(hip); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); hip_solver->apply(d_b.get(), d_x.get()); diff --git a/omp/test/solver/idr_kernels.cpp b/omp/test/solver/idr_kernels.cpp index 794d68cf3f2..bf2ec5d8e65 100644 --- a/omp/test/solver/idr_kernels.cpp +++ b/omp/test/solver/idr_kernels.cpp @@ -73,9 +73,6 @@ class Idr : public ::testing::Test { ref = gko::ReferenceExecutor::create(); omp = gko::OmpExecutor::create(); - mtx = gen_mtx(123, 123); - d_mtx = Mtx::create(omp); - d_mtx->copy_from(mtx.get()); omp_idr_factory = Solver::build() .with_deterministic(true) @@ -106,11 +103,11 @@ class Idr : public ::testing::Test { std::normal_distribution<>(0.0, 1.0), rand_engine, ref); } - void initialize_data() + void initialize_data(int size = 597, int input_nrhs = 17) { - int size = 597; - nrhs = 17; + nrhs = input_nrhs; int s = 4; + mtx = gen_mtx(size, size); x = gen_mtx(size, nrhs); b = gen_mtx(size, nrhs); r = gen_mtx(size, nrhs); @@ -131,6 +128,7 @@ class Idr : public ::testing::Test { stop_status->get_data()[i].reset(); } + d_mtx = Mtx::create(omp); d_x = Mtx::create(omp); d_b = Mtx::create(omp); d_r = Mtx::create(omp); @@ -148,6 +146,7 @@ class Idr : public ::testing::Test { d_stop_status = std::unique_ptr>( new gko::Array(omp)); + d_mtx->copy_from(mtx.get()); d_x->copy_from(x.get()); d_b->copy_from(b.get()); d_r->copy_from(r.get()); @@ -297,16 +296,9 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef) TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef) { - int m = 123; - int n = 1; + initialize_data(123, 1); auto ref_solver = ref_idr_factory->generate(mtx); auto omp_solver = omp_idr_factory->generate(d_mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(omp); - auto d_x = Mtx::create(omp); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); omp_solver->apply(d_b.get(), d_x.get()); @@ -318,8 +310,7 @@ TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef) TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) { - int m = 123; - int n = 1; + initialize_data(123, 1); omp_idr_factory = Solver::build() .with_deterministic(true) @@ -336,12 +327,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) .on(ref); auto ref_solver = ref_idr_factory->generate(mtx); auto omp_solver = omp_idr_factory->generate(d_mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(omp); - auto d_x = Mtx::create(omp); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); omp_solver->apply(d_b.get(), d_x.get()); @@ -353,16 +338,9 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef) TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef) { - int m = 123; - int n = 16; + initialize_data(123, 16); auto omp_solver = omp_idr_factory->generate(d_mtx); auto ref_solver = ref_idr_factory->generate(mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(omp); - auto d_x = Mtx::create(omp); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); omp_solver->apply(d_b.get(), d_x.get()); @@ -374,8 +352,7 @@ TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef) TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef) { - int m = 123; - int n = 16; + initialize_data(123, 16); omp_idr_factory = Solver::build() .with_deterministic(true) @@ -392,12 +369,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef) .on(ref); auto omp_solver = omp_idr_factory->generate(d_mtx); auto ref_solver = ref_idr_factory->generate(mtx); - auto b = gen_mtx(m, n); - auto x = gen_mtx(m, n); - auto d_b = Mtx::create(omp); - auto d_x = Mtx::create(omp); - d_b->copy_from(b.get()); - d_x->copy_from(x.get()); ref_solver->apply(b.get(), x.get()); omp_solver->apply(d_b.get(), d_x.get()); From 1c2276f38ed8e733e427ba96e8b6b867dd5b1dc0 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 4 Aug 2021 19:19:17 +0200 Subject: [PATCH 7/8] pass reference not pointer of UA, overlap shared Co-authored-by: Tobias Ribizel --- common/solver/idr_kernels.hpp.inc | 7 ++-- dpcpp/components/prefix_sum.dp.hpp | 20 +++++----- dpcpp/components/reduction.dp.hpp | 8 ++-- dpcpp/matrix/dense_kernels.dp.cpp | 56 ++++++++++++++-------------- dpcpp/solver/cb_gmres_kernels.dp.cpp | 15 +++----- dpcpp/solver/gmres_kernels.dp.cpp | 21 +++++------ dpcpp/solver/idr_kernels.dp.cpp | 43 ++++++++++----------- dpcpp/test/solver/idr_kernels.cpp | 4 +- 8 files changed, 83 insertions(+), 91 deletions(-) diff --git a/common/solver/idr_kernels.hpp.inc b/common/solver/idr_kernels.hpp.inc index fd57a31afc7..ee6f0e58d67 100644 --- a/common/solver/idr_kernels.hpp.inc +++ b/common/solver/idr_kernels.hpp.inc @@ -59,9 +59,10 @@ __global__ const auto tidx = thread::get_thread_id_flat(); __shared__ UninitializedArray reduction_helper_array; - ValueType *__restrict__ reduction_helper = reduction_helper_array; - - __shared__ remove_complex reduction_helper_real[block_size]; + // they are not be used in the same time. + ValueType *reduction_helper = reduction_helper_array; + auto reduction_helper_real = + reinterpret_cast *>(reduction_helper); for (size_type row = 0; row < num_rows; row++) { for (size_type i = 0; i < row; i++) { diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index 7eaea344641..0eaf5da3f76 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -129,13 +129,13 @@ template void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, ValueType *__restrict__ block_sum, sycl::nd_item<3> item_ct1, - UninitializedArray *prefix_helper) + UninitializedArray &prefix_helper) { const auto tidx = thread::get_thread_id_flat(item_ct1); const auto element_id = item_ct1.get_local_id(2); // do not need to access the last element when exclusive prefix sum - (*prefix_helper)[element_id] = + prefix_helper[element_id] = (tidx + 1 < num_elements) ? elements[tidx] : zero(); auto this_block = group::this_thread_block(item_ct1); this_block.sync(); @@ -146,7 +146,7 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, const auto ai = i * (2 * element_id + 1) - 1; const auto bi = i * (2 * element_id + 2) - 1; if (bi < block_size) { - (*prefix_helper)[bi] += (*prefix_helper)[ai]; + prefix_helper[bi] += prefix_helper[ai]; } this_block.sync(); } @@ -154,9 +154,9 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, if (element_id == 0) { // Store the total sum except the last block if (item_ct1.get_group(2) + 1 < item_ct1.get_group_range(2)) { - block_sum[item_ct1.get_group(2)] = (*prefix_helper)[block_size - 1]; + block_sum[item_ct1.get_group(2)] = prefix_helper[block_size - 1]; } - (*prefix_helper)[block_size - 1] = zero(); + prefix_helper[block_size - 1] = zero(); } this_block.sync(); @@ -167,14 +167,14 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, const auto ai = i * (2 * element_id + 1) - 1; const auto bi = i * (2 * element_id + 2) - 1; if (bi < block_size) { - auto tmp = (*prefix_helper)[ai]; - (*prefix_helper)[ai] = (*prefix_helper)[bi]; - (*prefix_helper)[bi] += tmp; + auto tmp = prefix_helper[ai]; + prefix_helper[ai] = prefix_helper[bi]; + prefix_helper[bi] += tmp; } this_block.sync(); } if (tidx < num_elements) { - elements[tidx] = (*prefix_helper)[element_id]; + elements[tidx] = prefix_helper[element_id]; } } @@ -193,7 +193,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, [=](sycl::nd_item<3> item_ct1) { start_prefix_sum( num_elements, elements, block_sum, item_ct1, - prefix_helper_acc_ct1.get_pointer().get()); + *prefix_helper_acc_ct1.get_pointer()); }); }); } diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index 048dfa4f862..0d47da1490b 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -205,14 +205,14 @@ template void reduce_add_array( size_type size, const ValueType *__restrict__ source, ValueType *__restrict__ result, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)> *block_sum) + UninitializedArray(cfg)> &block_sum) { reduce_array(cfg)>( - size, source, static_cast((*block_sum)), item_ct1, + size, source, static_cast(block_sum), item_ct1, [](const ValueType &x, const ValueType &y) { return x + y; }); if (item_ct1.get_local_id(2) == 0) { - result[item_ct1.get_group(2)] = (*block_sum)[0]; + result[item_ct1.get_group(2)] = block_sum[0]; } } @@ -230,7 +230,7 @@ void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory, cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { reduce_add_array(size, source, result, item_ct1, - block_sum_acc_ct1.get_pointer().get()); + *block_sum_acc_ct1.get_pointer()); }); }); } diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index fd14041b645..ea2f24c990e 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -89,7 +89,7 @@ template item_ct1, - UninitializedArray(cfg)> *tmp_work) + UninitializedArray(cfg)> &tmp_work) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -101,7 +101,7 @@ void compute_partial_reduce( const auto global_id = thread::get_thread_id(item_ct1); - OutType *tmp_work_array = *tmp_work; + OutType *tmp_work_array = tmp_work; auto tmp = zero(); for (auto i = global_id; i < num_rows; i += wg_size * num_blocks) { tmp = reduce_op(tmp, get_value(i)); @@ -124,7 +124,7 @@ void finalize_reduce_computation( size_type size, const ValueType *work, ValueType *result, CallableReduce reduce_op, CallableFinalize finalize_op, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)> *tmp_work) + UninitializedArray(cfg)> &tmp_work) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -135,7 +135,7 @@ void finalize_reduce_computation( for (auto i = local_id; i < size; i += wg_size) { tmp = reduce_op(tmp, work[i]); } - ValueType *tmp_work_array = *tmp_work; + ValueType *tmp_work_array = tmp_work; tmp_work_array[local_id] = tmp; ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), @@ -152,7 +152,7 @@ void compute_partial_dot( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, const ValueType *__restrict__ y, size_type stride_y, ValueType *__restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)> *tmp_work) + UninitializedArray(cfg)> &tmp_work) { compute_partial_reduce( num_rows, work, @@ -181,7 +181,7 @@ void compute_partial_dot(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { compute_partial_dot(num_rows, x, stride_x, y, stride_y, work, item_ct1, - tmp_work_acc_ct1.get_pointer().get()); + *tmp_work_acc_ct1.get_pointer()); }); }); } @@ -197,7 +197,7 @@ void compute_partial_conj_dot( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, const ValueType *__restrict__ y, size_type stride_y, ValueType *__restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)> *tmp_work) + UninitializedArray(cfg)> &tmp_work) { compute_partial_reduce( num_rows, work, @@ -225,9 +225,9 @@ void compute_partial_conj_dot(dim3 grid, dim3 block, cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_conj_dot( - num_rows, x, stride_x, y, stride_y, work, item_ct1, - tmp_work_acc_ct1.get_pointer().get()); + compute_partial_conj_dot(num_rows, x, stride_x, y, + stride_y, work, item_ct1, + *tmp_work_acc_ct1.get_pointer()); }); }); } @@ -242,7 +242,7 @@ template void finalize_sum_reduce_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)> *tmp_work) + UninitializedArray(cfg)> &tmp_work) { finalize_reduce_computation( size, work, result, @@ -267,7 +267,7 @@ void finalize_sum_reduce_computation(dim3 grid, dim3 block, [=](sycl::nd_item<3> item_ct1) { finalize_sum_reduce_computation( size, work, result, item_ct1, - tmp_work_acc_ct1.get_pointer().get()); + *tmp_work_acc_ct1.get_pointer()); }); }); } @@ -283,7 +283,7 @@ void compute_partial_norm2( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, remove_complex *__restrict__ work, sycl::nd_item<3> item_ct1, UninitializedArray, KCFG_1D::decode<0>(cfg)> - *tmp_work) + &tmp_work) { using norm_type = remove_complex; compute_partial_reduce( @@ -306,12 +306,12 @@ void compute_partial_norm2(dim3 grid, dim3 block, sycl::access::target::local> tmp_work_acc_ct1(cgh); - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - compute_partial_norm2( - num_rows, x, stride_x, work, item_ct1, - tmp_work_acc_ct1.get_pointer().get()); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compute_partial_norm2(num_rows, x, stride_x, work, + item_ct1, + *tmp_work_acc_ct1.get_pointer()); + }); }); } @@ -325,7 +325,7 @@ template void finalize_sqrt_reduce_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)> *tmp_work) + UninitializedArray(cfg)> &tmp_work) { finalize_reduce_computation( size, work, result, @@ -351,7 +351,7 @@ void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, [=](sycl::nd_item<3> item_ct1) { finalize_sqrt_reduce_computation( size, work, result, item_ct1, - tmp_work_acc_ct1.get_pointer().get()); + *tmp_work_acc_ct1.get_pointer()); }); }); } @@ -677,21 +677,21 @@ void transpose(const size_type nrows, const size_type ncols, const ValueType *__restrict__ in, const size_type in_stride, ValueType *__restrict__ out, const size_type out_stride, Closure op, sycl::nd_item<3> item_ct1, - UninitializedArray *space) + UninitializedArray &space) { auto local_x = item_ct1.get_local_id(2); auto local_y = item_ct1.get_local_id(1); auto x = item_ct1.get_group(2) * sg_size + local_x; auto y = item_ct1.get_group(1) * sg_size + local_y; if (y < nrows && x < ncols) { - (*space)[local_y * (sg_size + 1) + local_x] = op(in[y * in_stride + x]); + space[local_y * (sg_size + 1) + local_x] = op(in[y * in_stride + x]); } item_ct1.barrier(sycl::access::fence_space::local_space); x = item_ct1.get_group(1) * sg_size + local_x; y = item_ct1.get_group(2) * sg_size + local_y; if (y < ncols && x < nrows) { - out[y * out_stride + x] = (*space)[local_x * (sg_size + 1) + local_y]; + out[y * out_stride + x] = space[local_x * (sg_size + 1) + local_y]; } } @@ -701,7 +701,7 @@ void transpose(const size_type nrows, const size_type ncols, const ValueType *__restrict__ in, const size_type in_stride, ValueType *__restrict__ out, const size_type out_stride, sycl::nd_item<3> item_ct1, - UninitializedArray *space) + UninitializedArray &space) { transpose( nrows, ncols, in, in_stride, out, out_stride, @@ -723,7 +723,7 @@ void transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { transpose(nrows, ncols, in, in_stride, out, out_stride, - item_ct1, space_acc_ct1.get_pointer().get()); + item_ct1, *space_acc_ct1.get_pointer()); }); }); } @@ -739,7 +739,7 @@ void conj_transpose( const ValueType *__restrict__ in, const size_type in_stride, ValueType *__restrict__ out, const size_type out_stride, sycl::nd_item<3> item_ct1, - UninitializedArray *space) + UninitializedArray &space) { transpose( nrows, ncols, in, in_stride, out, out_stride, @@ -763,7 +763,7 @@ void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { conj_transpose(nrows, ncols, in, in_stride, out, out_stride, item_ct1, - space_acc_ct1.get_pointer().get()); + *space_acc_ct1.get_pointer()); }); }); } diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index 0b8359402e6..2dc9d8b84b5 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -470,7 +470,7 @@ void multidot_kernel( size_type stride_next_krylov, const Accessor3d krylov_bases, ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg, const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray *reduction_helper_array) + UninitializedArray &reduction_helper_array) { /* * In general in this kernel: @@ -497,8 +497,7 @@ void multidot_kernel( const size_type k = item_ct1.get_group(0); // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - - ValueType *__restrict__ reduction_helper = (*reduction_helper_array); + ValueType *__restrict__ reduction_helper = reduction_helper_array; ValueType local_res = zero(); if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { @@ -549,8 +548,7 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, num_rows, num_cols, next_krylov_basis, stride_next_krylov, krylov_bases, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - (UninitializedArray *) - reduction_helper_array_acc_ct1.get_pointer()); + *reduction_helper_array_acc_ct1.get_pointer()); }); }); } @@ -562,7 +560,7 @@ void singledot_kernel( size_type stride_next_krylov, const Accessor3d krylov_bases, ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg, const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray *reduction_helper_array) + UninitializedArray &reduction_helper_array) { /* * In general in this kernel: @@ -585,7 +583,7 @@ void singledot_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - ValueType *__restrict__ reduction_helper = (*reduction_helper_array); + ValueType *__restrict__ reduction_helper = reduction_helper_array; ValueType local_res = zero(); if (!stop_status[col_idx].has_stopped()) { @@ -630,8 +628,7 @@ void singledot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, num_rows, next_krylov_basis, stride_next_krylov, krylov_bases, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - (UninitializedArray *) - reduction_helper_array_acc_ct1.get_pointer()); + *reduction_helper_array_acc_ct1.get_pointer()); }); }); } diff --git a/dpcpp/solver/gmres_kernels.dp.cpp b/dpcpp/solver/gmres_kernels.dp.cpp index e27a7ef9f65..55369b09e9a 100644 --- a/dpcpp/solver/gmres_kernels.dp.cpp +++ b/dpcpp/solver/gmres_kernels.dp.cpp @@ -271,7 +271,7 @@ void update_hessenberg_2_kernel( size_type stride_next_krylov, ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg, const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray *reduction_helper_array) + UninitializedArray &reduction_helper_array) { const auto tidx = item_ct1.get_local_id(2); const auto col_idx = item_ct1.get_group(2); @@ -279,7 +279,7 @@ void update_hessenberg_2_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - ValueType *__restrict__ reduction_helper = (*reduction_helper_array); + ValueType *__restrict__ reduction_helper = reduction_helper_array; if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { ValueType local_res{}; @@ -317,15 +317,14 @@ void update_hessenberg_2_kernel( sycl::access::target::local> reduction_helper_array_acc_ct1(cgh); - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - update_hessenberg_2_kernel( - iter, num_rows, num_cols, next_krylov_basis, - stride_next_krylov, hessenberg_iter, stride_hessenberg, - stop_status, item_ct1, - (UninitializedArray *) - reduction_helper_array_acc_ct1.get_pointer()); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + update_hessenberg_2_kernel( + iter, num_rows, num_cols, next_krylov_basis, + stride_next_krylov, hessenberg_iter, + stride_hessenberg, stop_status, item_ct1, + *reduction_helper_array_acc_ct1.get_pointer()); + }); }); } diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 05d63a840ff..782fb04a8f9 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -111,24 +111,25 @@ template void orthonormalize_subspace_vectors_kernel( size_type num_rows, size_type num_cols, ValueType *__restrict__ values, size_type stride, sycl::nd_item<3> item_ct1, - UninitializedArray &reduction_helper_array, - remove_complex *reduction_helper_real) + UninitializedArray &reduction_helper_array) { const auto tidx = thread::get_thread_id_flat(item_ct1); - ValueType *__restrict__ reduction_helper = reduction_helper_array; + // they are not be used in the same time. + ValueType *reduction_helper = reduction_helper_array; + auto reduction_helper_real = + reinterpret_cast *>(reduction_helper); for (size_type row = 0; row < num_rows; row++) { for (size_type i = 0; i < row; i++) { auto dot = zero(); + for (size_type j = tidx; j < num_cols; j += block_size) { + dot += values[row * stride + j] * conj(values[i * stride + j]); + } // TODO: check with intel why we need this here. // Is it from we use updated the value even if it is on the same // thread? item_ct1.barrier(); - for (size_type j = tidx; j < num_cols; j += block_size) { - dot += values[row * stride + j] * conj(values[i * stride + j]); - } - reduction_helper[tidx] = dot; item_ct1.barrier(sycl::access::fence_space::local_space); @@ -174,18 +175,13 @@ void orthonormalize_subspace_vectors_kernel( sycl::access_mode::read_write, sycl::access::target::local> reduction_helper_array_acc_ct1(cgh); - sycl::accessor, 1, - sycl::access_mode::read_write, - sycl::access::target::local> - reduction_helper_real_acc_ct1(sycl::range<1>(block_size), cgh); - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - orthonormalize_subspace_vectors_kernel( - num_rows, num_cols, values, stride, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer(), - reduction_helper_real_acc_ct1.get_pointer().get()); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + orthonormalize_subspace_vectors_kernel( + num_rows, num_cols, values, stride, item_ct1, + *reduction_helper_array_acc_ct1.get_pointer()); + }); }); } @@ -340,7 +336,7 @@ void multidot_kernel( ValueType *__restrict__ alpha, const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1, UninitializedArray - *reduction_helper_array) + &reduction_helper_array) { const auto tidx = item_ct1.get_local_id(2); const auto tidy = item_ct1.get_local_id(1); @@ -352,7 +348,7 @@ void multidot_kernel( : (item_ct1.get_group(1) + 1) * num; // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - ValueType *__restrict__ reduction_helper = (*reduction_helper_array); + ValueType *__restrict__ reduction_helper = reduction_helper_array; ValueType local_res = zero(); if (rhs < nrhs && !stop_status[rhs].has_stopped()) { @@ -392,10 +388,9 @@ void multidot_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - multidot_kernel( - num_rows, nrhs, p_i, g_k, g_k_stride, alpha, stop_status, - item_ct1, - reduction_helper_array_acc_ct1.get_pointer().get()); + multidot_kernel(num_rows, nrhs, p_i, g_k, g_k_stride, alpha, + stop_status, item_ct1, + *reduction_helper_array_acc_ct1.get_pointer()); }); }); } diff --git a/dpcpp/test/solver/idr_kernels.cpp b/dpcpp/test/solver/idr_kernels.cpp index a9a86b941e6..ef5afb132b1 100644 --- a/dpcpp/test/solver/idr_kernels.cpp +++ b/dpcpp/test/solver/idr_kernels.cpp @@ -276,8 +276,8 @@ TEST_F(Idr, IdrStep3IsEquivalentToRef) GKO_ASSERT_MTX_NEAR(v, d_v, rr::value); GKO_ASSERT_MTX_NEAR(u, d_u, rr::value); GKO_ASSERT_MTX_NEAR(m, d_m, rr::value); - GKO_ASSERT_MTX_NEAR(f, d_f, 2 * rr::value); - GKO_ASSERT_MTX_NEAR(r, d_r, rr::value); + GKO_ASSERT_MTX_NEAR(f, d_f, 13 * rr::value); + GKO_ASSERT_MTX_NEAR(r, d_r, 2 * rr::value); GKO_ASSERT_MTX_NEAR(x, d_x, rr::value); } From aad7412336fca54bae5d9f1c4de96f7a024df616 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 5 Aug 2021 20:06:27 +0200 Subject: [PATCH 8/8] fix idr race condition Co-authored-by: Tobias Ribizel --- common/solver/idr_kernels.hpp.inc | 9 ++++----- dpcpp/solver/idr_kernels.dp.cpp | 11 ++++------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/common/solver/idr_kernels.hpp.inc b/common/solver/idr_kernels.hpp.inc index ee6f0e58d67..52e9a3313a7 100644 --- a/common/solver/idr_kernels.hpp.inc +++ b/common/solver/idr_kernels.hpp.inc @@ -71,9 +71,9 @@ __global__ dot += values[row * stride + j] * conj(values[i * stride + j]); } - reduction_helper[tidx] = dot; - + // Ensure already finish reading this shared memory __syncthreads(); + reduction_helper[tidx] = dot; reduce( group::this_thread_block(), reduction_helper, [](const ValueType &a, const ValueType &b) { return a + b; }); @@ -89,10 +89,9 @@ __global__ for (size_type j = tidx; j < num_cols; j += block_size) { norm += squared_norm(values[row * stride + j]); } - - reduction_helper_real[tidx] = norm; - + // Ensure already finish reading this shared memory __syncthreads(); + reduction_helper_real[tidx] = norm; reduce(group::this_thread_block(), reduction_helper_real, [](const remove_complex &a, const remove_complex &b) { return a + b; }); diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 782fb04a8f9..a4f18019128 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -126,13 +126,10 @@ void orthonormalize_subspace_vectors_kernel( for (size_type j = tidx; j < num_cols; j += block_size) { dot += values[row * stride + j] * conj(values[i * stride + j]); } - // TODO: check with intel why we need this here. - // Is it from we use updated the value even if it is on the same - // thread? - item_ct1.barrier(); - reduction_helper[tidx] = dot; + // Ensure already finish reading this shared memory item_ct1.barrier(sycl::access::fence_space::local_space); + reduction_helper[tidx] = dot; ::gko::kernels::dpcpp::reduce( group::this_thread_block(item_ct1), reduction_helper, [](const ValueType &a, const ValueType &b) { return a + b; }); @@ -149,9 +146,9 @@ void orthonormalize_subspace_vectors_kernel( norm += squared_norm(values[row * stride + j]); } - reduction_helper_real[tidx] = norm; - + // Ensure already finish reading this shared memory item_ct1.barrier(sycl::access::fence_space::local_space); + reduction_helper_real[tidx] = norm; ::gko::kernels::dpcpp::reduce( group::this_thread_block(item_ct1), reduction_helper_real, [](const remove_complex &a,