diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp
index 3979caa905c..8c7f45e5174 100644
--- a/dpcpp/base/helper.hpp
+++ b/dpcpp/base/helper.hpp
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <CL/sycl.hpp>
 
 
+#include "core/base/types.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
@@ -142,7 +143,7 @@ bool validate(sycl::queue *queue, unsigned workgroup_size,
 
 
 template <typename IterArr, typename Validate>
-ConfigSetType get_first_cfg(IterArr &arr, Validate verify)
+std::uint32_t get_first_cfg(IterArr &arr, Validate verify)
 {
     for (auto &cfg : arr) {
         if (verify(cfg)) {
diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp
index 330fa297e58..07cdb5b38aa 100644
--- a/dpcpp/components/prefix_sum.dp.cpp
+++ b/dpcpp/components/prefix_sum.dp.cpp
@@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <CL/sycl.hpp>
 
 
+#include "core/base/types.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/prefix_sum.dp.hpp"
@@ -52,7 +53,7 @@ namespace components {
 using BlockCfg = ConfigSet<11>;
 
 constexpr auto block_cfg_list =
-    ::gko::syn::value_list<ConfigSetType, BlockCfg::encode(512),
-                           BlockCfg::encode(256), BlockCfg::encode(128)>();
+    ::gko::syn::value_list<std::uint32_t, BlockCfg::encode(512),
+                           BlockCfg::encode(256), BlockCfg::encode(128)>();
 
 GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum)
@@ -73,8 +74,8 @@ void prefix_sum(std::shared_ptr<const DpcppExecutor> exec, IndexType *counts,
     if (num_entries > 0) {
         auto queue = exec->get_queue();
         constexpr auto block_cfg_array = as_array(block_cfg_list);
-        const ConfigSetType cfg =
-            get_first_cfg(block_cfg_array, [&queue](ConfigSetType cfg) {
+        const std::uint32_t cfg =
+            get_first_cfg(block_cfg_array, [&queue](std::uint32_t cfg) {
                 return validate(queue, BlockCfg::decode<0>(cfg), 16);
             });
         const auto wg_size = BlockCfg::decode<0>(cfg);
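
Note on the pattern this diff keeps repeating: each kernel configuration is a (workgroup size, subgroup size) pair bit-packed into one integer via ConfigSet, and get_first_cfg returns the first packed candidate that validate() accepts for the device behind the queue. The change itself is purely a spelling change of that integer type, from the project-local alias ConfigSetType to plain std::uint32_t. A minimal standalone sketch of the idea, assuming an 11/7-bit layout as in ConfigSet<11, 7> (the names encode/decode_wg/decode_sg are illustrative, not Ginkgo's API):

    #include <array>
    #include <cstdint>
    #include <iostream>

    // Pack (workgroup size, subgroup size) into one std::uint32_t:
    // bits [7, 18) hold the workgroup size, bits [0, 7) the subgroup size.
    constexpr std::uint32_t encode(std::uint32_t wg, std::uint32_t sg)
    {
        return (wg << 7) | sg;
    }
    constexpr std::uint32_t decode_wg(std::uint32_t cfg) { return cfg >> 7; }
    constexpr std::uint32_t decode_sg(std::uint32_t cfg) { return cfg & 0x7f; }

    // get_first_cfg: return the first candidate the device accepts.
    template <typename Array, typename Validate>
    std::uint32_t get_first_cfg(const Array &arr, Validate verify)
    {
        for (auto cfg : arr) {
            if (verify(cfg)) {
                return cfg;
            }
        }
        return 0;  // no supported configuration
    }

    int main()
    {
        constexpr std::array<std::uint32_t, 3> cfgs{
            encode(512, 32), encode(256, 32), encode(256, 8)};
        // Stand-in for validate(queue, wg, sg): pretend the device only
        // supports workgroups of at most 256 work-items.
        auto cfg = get_first_cfg(
            cfgs, [](std::uint32_t c) { return decode_wg(c) <= 256; });
        std::cout << decode_wg(cfg) << "/" << decode_sg(cfg) << "\n";  // 256/32
    }
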
diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp
index fd9ff2ac263..f76f85135eb 100644
--- a/dpcpp/components/prefix_sum.dp.hpp
+++ b/dpcpp/components/prefix_sum.dp.hpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <CL/sycl.hpp>
 
 
+#include "core/base/types.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
@@ -125,7 +126,7 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element,
  * @note To calculate the prefix sum over an array of size bigger than
  *       `block_size`, `finalize_prefix_sum` has to be used as well.
  */
-template <ConfigSetType block_size, typename ValueType>
+template <std::uint32_t block_size, typename ValueType>
 void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
                       ValueType *__restrict__ block_sum,
                       sycl::nd_item<3> item_ct1,
@@ -178,7 +179,7 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
     }
 }
 
-template <ConfigSetType block_size, typename ValueType>
+template <std::uint32_t block_size, typename ValueType>
 void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                       sycl::queue *stream, size_type num_elements,
                       ValueType *elements, ValueType *block_sum)
@@ -214,7 +215,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory,
 *
 * @note To calculate a prefix sum, first `start_prefix_sum` has to be called.
 */
-template <ConfigSetType block_size, typename ValueType>
+template <std::uint32_t block_size, typename ValueType>
 void finalize_prefix_sum(size_type num_elements,
                          ValueType *__restrict__ elements,
                          const ValueType *__restrict__ block_sum,
@@ -231,7 +232,7 @@ void finalize_prefix_sum(size_type num_elements,
     }
 }
 
-template <ConfigSetType block_size, typename ValueType>
+template <std::uint32_t block_size, typename ValueType>
 void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                          sycl::queue *stream, size_type num_elements,
                          ValueType *elements, const ValueType *block_sum)
diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
index d3e925ee4ba..9c2387a7113 100644
--- a/dpcpp/components/reduction.dp.hpp
+++ b/dpcpp/components/reduction.dp.hpp
@@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "core/base/types.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
@@ -63,7 +64,7 @@ namespace dpcpp {
 constexpr int default_block_size = 256;
 using KCFG_1D = ConfigSet<11, 7>;
 constexpr auto kcfg_1d_list =
-    syn::value_list<ConfigSetType, KCFG_1D::encode(512, 64),
-                    KCFG_1D::encode(512, 32), KCFG_1D::encode(512, 16),
-                    KCFG_1D::encode(256, 32), KCFG_1D::encode(256, 16),
-                    KCFG_1D::encode(256, 8)>();
+    syn::value_list<std::uint32_t, KCFG_1D::encode(512, 64),
+                    KCFG_1D::encode(512, 32), KCFG_1D::encode(512, 16),
+                    KCFG_1D::encode(256, 32), KCFG_1D::encode(256, 16),
+                    KCFG_1D::encode(256, 8)>();
@@ -201,7 +202,7 @@ void reduce_array(size_type size, const ValueType *__restrict__ source,
 * `source` of any size. Has to be called a second time on `result` to reduce
 * an array larger than `block_size`.
 */
-template <ConfigSetType cfg, typename ValueType>
+template <std::uint32_t cfg, typename ValueType>
 void reduce_add_array(
     size_type size, const ValueType *__restrict__ source,
     ValueType *__restrict__ result, sycl::nd_item<3> item_ct1,
@@ -216,7 +217,7 @@ void reduce_add_array(
     }
 }
 
-template <ConfigSetType cfg, typename ValueType>
+template <std::uint32_t cfg, typename ValueType>
 void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                       sycl::queue *stream, size_type size,
                       const ValueType *source, ValueType *result)
@@ -263,8 +264,8 @@ ValueType reduce_add_array(std::shared_ptr<const DpcppExecutor> exec,
     ValueType answer = zero<ValueType>();
     auto queue = exec->get_queue();
     constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-    const ConfigSetType cfg =
-        get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
            return validate(queue, KCFG_1D::decode<0>(cfg),
                            KCFG_1D::decode<1>(cfg));
        });
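
For context on the start_prefix_sum/finalize_prefix_sum pair whose template parameters change above: as the @note comments describe, the first pass computes a prefix sum within each block and records each block's total, and the second pass adds the accumulated totals of all preceding blocks back onto every element. A plain C++ reference of that two-pass scheme (an illustrative sketch, not the SYCL kernels):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    void start_prefix_sum(std::vector<int> &elements,
                          std::vector<int> &block_sum, std::size_t block_size)
    {
        for (std::size_t b = 0; b * block_size < elements.size(); ++b) {
            int running = 0;
            for (std::size_t i = b * block_size;
                 i < std::min((b + 1) * block_size, elements.size()); ++i) {
                int v = elements[i];
                elements[i] = running;  // exclusive scan within the block
                running += v;
            }
            block_sum[b] = running;  // block total, used by the finalize pass
        }
    }

    void finalize_prefix_sum(std::vector<int> &elements,
                             const std::vector<int> &block_sum,
                             std::size_t block_size)
    {
        int offset = 0;
        for (std::size_t b = 0; b * block_size < elements.size(); ++b) {
            for (std::size_t i = b * block_size;
                 i < std::min((b + 1) * block_size, elements.size()); ++i) {
                elements[i] += offset;  // shift block by preceding totals
            }
            offset += block_sum[b];
        }
    }

    int main()
    {
        std::vector<int> v{1, 2, 3, 4, 5};
        std::vector<int> block_sum(3);
        start_prefix_sum(v, block_sum, 2);     // v: 0 1 | 0 3 | 0
        finalize_prefix_sum(v, block_sum, 2);  // v: 0 1 3 6 10
    }
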
#include "core/matrix/dense_kernels.hpp" -#include - - #include #include @@ -70,12 +67,14 @@ namespace dpcpp { */ namespace dense { + using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list = - syn::value_list(); +constexpr auto subgroup_list = syn::value_list(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); constexpr auto default_block_size = 256; @@ -119,6 +118,7 @@ void scale(size_type num_rows, size_type num_cols, size_type num_alpha_cols, GKO_ENABLE_DEFAULT_HOST(scale, scale) + template void add_scaled(size_type num_rows, size_type num_cols, size_type num_alpha_cols, const ValueType *__restrict__ alpha, @@ -157,7 +157,7 @@ void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha, GKO_ENABLE_DEFAULT_HOST(add_scaled_diag, add_scaled_diag) -template void compute_partial_reduce( size_type num_rows, OutType *__restrict__ work, CallableGetValue get_value, @@ -191,7 +191,7 @@ void compute_partial_reduce( } -template void finalize_reduce_computation( size_type size, const ValueType *work, ValueType *result, @@ -220,7 +220,7 @@ void finalize_reduce_computation( } -template +template void compute_partial_dot( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, const ValueType *__restrict__ y, size_type stride_y, @@ -230,13 +230,13 @@ void compute_partial_dot( compute_partial_reduce( num_rows, work, [x, stride_x, y, stride_y](size_type i) { - return x[i * stride_x] * conj(y[i * stride_y]); + return x[i * stride_x] * y[i * stride_y]; }, [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, tmp_work); } -template +template void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, @@ -244,7 +244,6 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *work) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - std::cout << "partial " << cfg << std::endl; stream->submit([&](sycl::handler &cgh) { sycl::accessor, 0, sycl::access::mode::read_write, @@ -267,8 +266,54 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, kcfg_1d_list) -template -void finalize_dot_computation( +template +void compute_partial_conj_dot( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + const ValueType *__restrict__ y, size_type stride_y, + ValueType *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + compute_partial_reduce( + num_rows, work, + [x, stride_x, y, stride_y](size_type i) { + return conj(x[i * stride_x]) * y[i * stride_y]; + }, + [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, + tmp_work); +} + +template +void compute_partial_conj_dot(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, const ValueType *x, + size_type stride_x, const ValueType *y, + size_type stride_y, ValueType *work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compute_partial_conj_dot( + num_rows, x, stride_x, y, stride_y, work, item_ct1, + (UninitializedArray *) + tmp_work_acc_ct1.get_pointer()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_conj_dot, + compute_partial_conj_dot) 
+GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_conj_dot_call,
+                               compute_partial_conj_dot, kcfg_1d_list)
+
+
+template <std::uint32_t cfg, typename ValueType>
+void finalize_sum_reduce_computation(
     size_type size, const ValueType *work, ValueType *result,
     sycl::nd_item<3> item_ct1,
     UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
@@ -279,14 +324,13 @@ void finalize_dot_computation(
         [](const ValueType &x) { return x; }, item_ct1, tmp_work);
 }
 
-template <ConfigSetType cfg, typename ValueType>
-void finalize_dot_computation(dim3 grid, dim3 block,
-                              size_t dynamic_shared_memory, sycl::queue *stream,
-                              size_type size, const ValueType *work,
-                              ValueType *result)
+template <std::uint32_t cfg, typename ValueType>
+void finalize_sum_reduce_computation(dim3 grid, dim3 block,
+                                     size_t dynamic_shared_memory,
+                                     sycl::queue *stream, size_type size,
+                                     const ValueType *work, ValueType *result)
 {
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
-    std::cout << "finalize " << cfg << std::endl;
     stream->submit([&](sycl::handler &cgh) {
         sycl::accessor<UninitializedArray<ValueType, wg_size>, 0,
                        sycl::access::mode::read_write,
@@ -295,7 +339,7 @@ void finalize_dot_computation(dim3 grid, dim3 block,
 
         cgh.parallel_for(sycl_nd_range(grid, block),
                          [=](sycl::nd_item<3> item_ct1) {
-                             finalize_dot_computation<cfg>(
+                             finalize_sum_reduce_computation<cfg>(
                                  size, work, result, item_ct1,
                                  (UninitializedArray<ValueType, wg_size> *)
                                      tmp_work_acc_ct1.get_pointer());
@@ -303,13 +347,13 @@ void finalize_dot_computation(dim3 grid, dim3 block,
     });
 }
 
-GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_dot_computation,
-                                           finalize_dot_computation)
-GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_dot_computation_call,
-                               finalize_dot_computation, kcfg_1d_list)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sum_reduce_computation,
+                                           finalize_sum_reduce_computation)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sum_reduce_computation_call,
+                               finalize_sum_reduce_computation, kcfg_1d_list)
 
 
-template <ConfigSetType cfg, typename ValueType>
+template <std::uint32_t cfg, typename ValueType>
 void compute_partial_norm2(
     size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
     remove_complex<ValueType> *__restrict__ work, sycl::nd_item<3> item_ct1,
@@ -324,7 +368,7 @@ void compute_partial_norm2(
     tmp_work);
 }
 
-template <ConfigSetType cfg, typename ValueType>
+template <std::uint32_t cfg, typename ValueType>
 void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                            sycl::queue *stream, size_type num_rows,
                            const ValueType *x, size_type stride_x,
@@ -353,8 +397,8 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call,
                                compute_partial_norm2, kcfg_1d_list)
 
 
-template <ConfigSetType cfg, typename ValueType>
-void finalize_norm2_computation(
+template <std::uint32_t cfg, typename ValueType>
+void finalize_sqrt_reduce_computation(
     size_type size, const ValueType *work, ValueType *result,
     sycl::nd_item<3> item_ct1,
     UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
@@ -362,14 +406,14 @@ void finalize_norm2_computation(
     finalize_reduce_computation<cfg>(
         size, work, result,
         [](const ValueType &x, const ValueType &y) { return x + y; },
-        [](const ValueType &x) { return sqrt(x); }, item_ct1, tmp_work);
+        [](const ValueType &x) { return std::sqrt(x); }, item_ct1, tmp_work);
 }
 
-template <ConfigSetType cfg, typename ValueType>
-void finalize_norm2_computation(dim3 grid, dim3 block,
-                                size_t dynamic_shared_memory,
-                                sycl::queue *stream, size_type size,
-                                const ValueType *work, ValueType *result)
+template <std::uint32_t cfg, typename ValueType>
+void finalize_sqrt_reduce_computation(dim3 grid, dim3 block,
+                                      size_t dynamic_shared_memory,
+                                      sycl::queue *stream, size_type size,
+                                      const ValueType *work, ValueType *result)
 {
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
     stream->submit([&](sycl::handler &cgh) {
@@ -381,7 +425,7 @@ void finalize_norm2_computation(dim3 grid, dim3 block,
 
         cgh.parallel_for(sycl_nd_range(grid, block),
                          [=](sycl::nd_item<3> item_ct1) {
-                             finalize_norm2_computation<cfg>(
+                             finalize_sqrt_reduce_computation<cfg>(
                                  size, work, result, item_ct1,
                                 (UninitializedArray<ValueType, wg_size> *)
                                     tmp_work_acc_ct1.get_pointer());
@@ -389,13 +433,13 @@ void finalize_norm2_computation(dim3 grid, dim3 block,
     });
 }
 
-GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_norm2_computation,
-                                           finalize_norm2_computation)
-GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_norm2_computation_call,
-                               finalize_norm2_computation, kcfg_1d_list)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sqrt_reduce_computation,
+                                           finalize_sqrt_reduce_computation)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sqrt_reduce_computation_call,
+                               finalize_sqrt_reduce_computation, kcfg_1d_list)
 
 
-template <typename ValueType, typename IndexType>
+template <std::uint32_t cfg, typename ValueType, typename IndexType>
 void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride,
                  const size_type *__restrict__ row_ptrs,
                  const ValueType *__restrict__ source,
@@ -418,10 +462,12 @@ void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride,
     }
 }
 
-GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo)
+GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_coo, fill_in_coo)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_coo, fill_in_coo)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_coo_call, fill_in_coo, kcfg_1d_list)
 
 
-template <ConfigSetType cfg, typename ValueType, typename IndexType>
+template <std::uint32_t cfg, typename ValueType, typename IndexType>
 void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride,
                        const ValueType *__restrict__ work,
                        IndexType *__restrict__ result,
@@ -451,7 +497,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row,
                                kcfg_1d_list)
 
 
-template <typename ValueType, typename IndexType>
+template <std::uint32_t cfg, typename ValueType, typename IndexType>
 void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride,
                  const ValueType *__restrict__ source,
                  IndexType *__restrict__ row_ptrs,
@@ -472,10 +518,12 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride,
     }
 }
 
-GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr)
+GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_csr, fill_in_csr)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_csr, fill_in_csr)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_csr_call, fill_in_csr, kcfg_1d_list)
 
 
-template <typename ValueType, typename IndexType>
+template <std::uint32_t cfg, typename ValueType, typename IndexType>
 void fill_in_ell(size_type num_rows, size_type num_cols,
                  size_type source_stride, const ValueType *__restrict__ source,
                  size_type max_nnz_per_row, size_type result_stride,
@@ -505,10 +553,12 @@ void fill_in_ell(size_type num_rows, size_type num_cols,
     }
 }
 
-GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell)
+GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_ell, fill_in_ell)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_ell, fill_in_ell)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_ell_call, fill_in_ell, kcfg_1d_list)
 
 
-template <ConfigSetType cfg>
+template <std::uint32_t cfg>
 void calculate_slice_lengths(size_type num_rows, size_type slice_size,
                              int slice_num, size_type stride_factor,
                              const size_type *__restrict__ nnz_per_row,
                              size_type *__restrict__ slice_lengths,
                              size_type *__restrict__ slice_sets,
                              sycl::nd_item<3> item_ct1)
 {
-    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
+    constexpr auto sg_size = cfg;
     const auto sliceid = item_ct1.get_group(2);
     const auto tid_in_warp = item_ct1.get_local_id(2);
@@ -548,10 +598,10 @@ GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths)
 GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths,
                                            calculate_slice_lengths)
 GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call,
-                               calculate_slice_lengths, kcfg_1d_list)
+                               calculate_slice_lengths, subgroup_list)
 
 
-template <typename ValueType, typename IndexType>
+template <std::uint32_t cfg, typename ValueType, typename IndexType>
 void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size,
                    size_type stride, const ValueType *__restrict__ source,
                    size_type *__restrict__ slice_lengths,
@@ -584,9 +634,12 @@ void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size,
     }
 }
 
-GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp)
+GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_sellp, fill_in_sellp)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_sellp, fill_in_sellp)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_sellp_call, fill_in_sellp, kcfg_1d_list)
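
The macro trio (GKO_ENABLE_DEFAULT_HOST_CONFIG / GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION / GKO_ENABLE_DEFAULT_CONFIG_CALL) that replaces GKO_ENABLE_DEFAULT_HOST for fill_in_coo/csr/ell/sellp turns the runtime-selected encoded config into a compile-time template argument of the kernel. Roughly, the generated dispatch amounts to a recursive compile-time search over the candidate list; a hand-written approximation of the mechanism (a sketch, not the real macro expansion):

    #include <cstdint>
    #include <stdexcept>

    template <std::uint32_t cfg>
    void kernel() { /* cfg is a compile-time constant here */ }

    // End of list: nothing matched the runtime value.
    inline void kernel_call(std::uint32_t) { throw std::runtime_error("no cfg"); }

    // Peel one candidate off the compile-time list per recursion step.
    template <std::uint32_t First, std::uint32_t... Rest>
    void kernel_call(std::uint32_t cfg)
    {
        if (cfg == First) {
            kernel<First>();  // instantiate for the matching config
        } else {
            kernel_call<Rest...>(cfg);
        }
    }

    int main()
    {
        // runtime value picked by get_first_cfg, dispatched to a
        // compile-time instantiation:
        kernel_call<512, 256, 128>(256);
    }
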
 
-
-template <ConfigSetType cfg>
+
+template <std::uint32_t cfg>
 void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row,
                     size_type *__restrict__ result, sycl::nd_item<3> item_ct1,
                     uint8_t *dpct_local)
@@ -603,7 +656,7 @@ void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row,
     }
 }
 
-template <ConfigSetType cfg>
+template <std::uint32_t cfg>
 void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                     sycl::queue *stream, size_type size,
                     const size_type *nnz_per_row, size_type *result)
@@ -626,7 +679,8 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz);
 GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz,
                                kcfg_1d_list)
 
-template <ConfigSetType cfg>
+
+template <std::uint32_t cfg>
 void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size,
                               size_type stride_factor,
                               const size_type *__restrict__ nnz_per_row,
@@ -665,7 +719,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call,
                                reduce_max_nnz_per_slice, kcfg_1d_list)
 
 
-template <ConfigSetType cfg>
+template <std::uint32_t cfg>
 void reduce_total_cols(size_type num_slices,
                        const size_type *__restrict__ max_nnz_per_slice,
                        size_type *__restrict__ result,
@@ -682,7 +736,7 @@ void reduce_total_cols(size_type num_slices,
     }
 }
 
-template <ConfigSetType cfg>
+template <std::uint32_t cfg>
 void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                        sycl::queue *stream, size_type num_slices,
                        const size_type *max_nnz_per_slice, size_type *result)
@@ -979,10 +1033,10 @@ void scale(std::shared_ptr<const DpcppExecutor> exec,
            const matrix::Dense<ValueType> *alpha, matrix::Dense<ValueType> *x)
 {
     if (0) {
-        // oneapi::mkl::blas::row_major::scal(
-        //     *exec->get_queue(), x->get_size()[0] * x->get_size()[1],
-        //     exec->copy_val_to_host(alpha->get_const_values()),
-        //     x->get_values(), x->get_stride());
+        oneapi::mkl::blas::row_major::scal(
+            *exec->get_queue(), x->get_size()[0] * x->get_size()[1],
+            exec->copy_val_to_host(alpha->get_const_values()), x->get_values(),
+            x->get_stride());
     } else {
         // TODO: tune this parameter
         constexpr auto block_size = default_block_size;
@@ -1005,11 +1059,11 @@ void add_scaled(std::shared_ptr<const DpcppExecutor> exec,
                 const matrix::Dense<ValueType> *x, matrix::Dense<ValueType> *y)
 {
     if (0) {
-        // oneapi::mkl::blas::row_major::axpy(
-        //     *exec->get_queue(), x->get_size()[0],
-        //     exec->copy_val_to_host(alpha->get_const_values()),
-        //     x->get_const_values(), x->get_stride(), y->get_values(),
-        //     y->get_stride());
+        oneapi::mkl::blas::row_major::axpy(
+            *exec->get_queue(), x->get_size()[0],
+            exec->copy_val_to_host(alpha->get_const_values()),
+            x->get_const_values(), x->get_stride(), y->get_values(),
+            y->get_stride());
     } else {
         // TODO: tune this parameter
         constexpr auto block_size = default_block_size;
@@ -1048,10 +1102,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
 namespace {
 
 
-#define GKO_BIND_DOT(ValueType, Func)                                       \
-    void dot(::cl::sycl::queue &exec_queue, std::int64_t n,                 \
-             const ValueType *x, std::int64_t incx, const ValueType *y,     \
-             std::int64_t incy, ValueType *result)                          \
+#define GKO_BIND_DOT(ValueType, Name, Func)                                 \
+    void Name(::cl::sycl::queue &exec_queue, std::int64_t n,                \
+              const ValueType *x, std::int64_t incx, const ValueType *y,    \
+              std::int64_t incy, ValueType *result)                         \
     {                                                                       \
         Func(exec_queue, n, x, incx, y, incy, result);                      \
     }                                                                       \
@@ -1059,10 +1113,15 @@ namespace {
                   "This assert is used to counter the false positive extra " \
extra " \ "semi-colon warnings") -GKO_BIND_DOT(float, oneapi::mkl::blas::row_major::dot); -GKO_BIND_DOT(double, oneapi::mkl::blas::row_major::dot); -GKO_BIND_DOT(std::complex, oneapi::mkl::blas::row_major::dotc); -GKO_BIND_DOT(std::complex, oneapi::mkl::blas::row_major::dotc); +GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +GKO_BIND_DOT(float, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, conj_dot, oneapi::mkl::blas::row_major::dotc); +GKO_BIND_DOT(std::complex, conj_dot, + oneapi::mkl::blas::row_major::dotc); } // namespace @@ -1089,15 +1148,13 @@ void compute_dot(std::shared_ptr exec, constexpr auto work_per_thread = 32; auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); const auto wg_size = KCFG_1D::decode<0>(cfg); const auto sg_size = KCFG_1D::decode<1>(cfg); - std::cout << "dot " << cfg << " " << wg_size << " " << sg_size - << std::endl; const auto work_per_block = work_per_thread * wg_size; const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); const dim3 block_dim{sg_size, 1, wg_size / sg_size}; @@ -1108,7 +1165,7 @@ void compute_dot(std::shared_ptr exec, cfg, grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_dot_computation_call( + kernel::finalize_sum_reduce_computation_call( cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); } @@ -1122,7 +1179,47 @@ template void compute_conj_dot(std::shared_ptr exec, const matrix::Dense *x, const matrix::Dense *y, - matrix::Dense *result) GKO_NOT_IMPLEMENTED; + matrix::Dense *result) +{ + if (0) { + // TODO: write a custom kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + conj_dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); + } + } else { + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_conj_dot_call( + cfg, grid_dim, block_dim, 0, 
@@ -1089,15 +1148,13 @@ void compute_dot(std::shared_ptr<const DpcppExecutor> exec,
         constexpr auto work_per_thread = 32;
         auto queue = exec->get_queue();
         constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-        const ConfigSetType cfg =
-            get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+        const std::uint32_t cfg =
+            get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
                 return validate(queue, KCFG_1D::decode<0>(cfg),
                                 KCFG_1D::decode<1>(cfg));
             });
         const auto wg_size = KCFG_1D::decode<0>(cfg);
         const auto sg_size = KCFG_1D::decode<1>(cfg);
-        std::cout << "dot " << cfg << " " << wg_size << " " << sg_size
-                  << std::endl;
         const auto work_per_block = work_per_thread * wg_size;
         const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
         const dim3 block_dim{sg_size, 1, wg_size / sg_size};
@@ -1108,7 +1165,7 @@ void compute_dot(std::shared_ptr<const DpcppExecutor> exec,
                 cfg, grid_dim, block_dim, 0, exec->get_queue(),
                 x->get_size()[0], x->get_const_values() + col, x->get_stride(),
                 y->get_const_values() + col, y->get_stride(), work.get_data());
-            kernel::finalize_dot_computation_call(
+            kernel::finalize_sum_reduce_computation_call(
                 cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x,
                 work.get_const_data(), result->get_values() + col);
         }
@@ -1122,7 +1179,47 @@ template <typename ValueType>
 void compute_conj_dot(std::shared_ptr<const DpcppExecutor> exec,
                       const matrix::Dense<ValueType> *x,
                       const matrix::Dense<ValueType> *y,
-                      matrix::Dense<ValueType> *result) GKO_NOT_IMPLEMENTED;
+                      matrix::Dense<ValueType> *result)
+{
+    if (0) {
+        // TODO: write a custom kernel which does this more efficiently
+        for (size_type col = 0; col < x->get_size()[1]; ++col) {
+            conj_dot(*exec->get_queue(), x->get_size()[0],
+                     x->get_const_values() + col, x->get_stride(),
+                     y->get_const_values() + col, y->get_stride(),
+                     result->get_values() + col);
+        }
+    } else {
+        // TODO: these are tuning parameters obtained experimentally, once
+        // we decide how to handle this uniformly, they should be modified
+        // appropriately
+        constexpr auto work_per_thread = 32;
+        auto queue = exec->get_queue();
+        constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
+        const std::uint32_t cfg =
+            get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
+                return validate(queue, KCFG_1D::decode<0>(cfg),
+                                KCFG_1D::decode<1>(cfg));
+            });
+        const auto wg_size = KCFG_1D::decode<0>(cfg);
+        const auto sg_size = KCFG_1D::decode<1>(cfg);
+
+        const auto work_per_block = work_per_thread * wg_size;
+        const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
+        const dim3 block_dim{sg_size, 1, wg_size / sg_size};
+        Array<ValueType> work(exec, grid_dim.x);
+        // TODO: write a kernel which does this more efficiently
+        for (size_type col = 0; col < x->get_size()[1]; ++col) {
+            kernel::compute_partial_conj_dot_call(
+                cfg, grid_dim, block_dim, 0, exec->get_queue(),
+                x->get_size()[0], x->get_const_values() + col, x->get_stride(),
+                y->get_const_values() + col, y->get_stride(), work.get_data());
+            kernel::finalize_sum_reduce_computation_call(
+                cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x,
+                work.get_const_data(), result->get_values() + col);
+        }
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
@@ -1147,8 +1244,8 @@ void compute_norm2(std::shared_ptr<const DpcppExecutor> exec,
         constexpr auto work_per_thread = 32;
         auto queue = exec->get_queue();
         constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-        const ConfigSetType cfg =
-            get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+        const std::uint32_t cfg =
+            get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
                 return validate(queue, KCFG_1D::decode<0>(cfg),
                                 KCFG_1D::decode<1>(cfg));
             });
@@ -1165,7 +1262,7 @@ void compute_norm2(std::shared_ptr<const DpcppExecutor> exec,
                 cfg, grid_dim, block_dim, 0, exec->get_queue(),
                 x->get_size()[0], x->get_const_values() + col, x->get_stride(),
                 work.get_data());
-            kernel::finalize_norm2_computation_call(
+            kernel::finalize_sqrt_reduce_computation_call(
                 cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x,
                 work.get_const_data(), result->get_values() + col);
         }
@@ -1194,12 +1291,21 @@ void convert_to_coo(std::shared_ptr<const DpcppExecutor> exec,
 
     components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows);
 
-    size_type grid_dim = ceildiv(num_rows, default_block_size);
+    auto queue = exec->get_queue();
+    constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
+            return validate(queue, KCFG_1D::decode<0>(cfg),
+                            KCFG_1D::decode<1>(cfg));
+        });
+    const auto wg_size = KCFG_1D::decode<0>(cfg);
+    const auto sg_size = KCFG_1D::decode<1>(cfg);
+    size_type grid_dim = ceildiv(num_rows, wg_size);
 
-    kernel::fill_in_coo(grid_dim, default_block_size, 0, exec->get_queue(),
-                        num_rows, num_cols, stride,
-                        nnz_prefix_sum.get_const_data(),
-                        source->get_const_values(), row_idxs, col_idxs, values);
+    kernel::fill_in_coo_call(
+        cfg, grid_dim, wg_size, 0, exec->get_queue(), num_rows, num_cols,
+        stride, nnz_prefix_sum.get_const_data(), source->get_const_values(),
+        row_idxs, col_idxs, values);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -1213,8 +1319,8 @@ void convert_to_csr(std::shared_ptr<const DpcppExecutor> exec,
 {
     auto queue = exec->get_queue();
     constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-    const ConfigSetType cfg =
-        get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
            return validate(queue, KCFG_1D::decode<0>(cfg),
                            KCFG_1D::decode<1>(cfg));
         });
@@ -1241,9 +1347,10 @@ void convert_to_csr(std::shared_ptr<const DpcppExecutor> exec,
 
     size_type grid_dim = ceildiv(num_rows, wg_size);
 
-    kernel::fill_in_csr(grid_dim, wg_size, 0, exec->get_queue(), num_rows,
-                        num_cols, stride, source->get_const_values(), row_ptrs,
-                        col_idxs, values);
+    kernel::fill_in_csr_call(cfg, grid_dim, default_block_size, 0,
+                             exec->get_queue(), num_rows, num_cols, stride,
+                             source->get_const_values(), row_ptrs, col_idxs,
+                             values);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -1265,11 +1372,20 @@ void convert_to_ell(std::shared_ptr<const DpcppExecutor> exec,
     auto source_stride = source->get_stride();
     auto result_stride = result->get_stride();
 
-    auto grid_dim = ceildiv(result_stride, default_block_size);
-    kernel::fill_in_ell(grid_dim, default_block_size, 0, exec->get_queue(),
-                        num_rows, num_cols, source_stride,
-                        source->get_const_values(),
-                        max_nnz_per_row, result_stride, col_ptrs, values);
+    auto queue = exec->get_queue();
+    constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
+            return validate(queue, KCFG_1D::decode<0>(cfg),
+                            KCFG_1D::decode<1>(cfg));
+        });
+    const auto wg_size = KCFG_1D::decode<0>(cfg);
+    const auto sg_size = KCFG_1D::decode<1>(cfg);
+    auto grid_dim = ceildiv(result_stride, wg_size);
+    kernel::fill_in_ell_call(cfg, grid_dim, wg_size, 0, exec->get_queue(),
+                             num_rows, num_cols, source_stride,
+                             source->get_const_values(), max_nnz_per_row,
+                             result_stride, col_ptrs, values);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -1293,8 +1409,8 @@ void convert_to_sellp(std::shared_ptr<const DpcppExecutor> exec,
 {
     auto queue = exec->get_queue();
     constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-    const ConfigSetType cfg =
-        get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
            return validate(queue, KCFG_1D::decode<0>(cfg),
                            KCFG_1D::decode<1>(cfg));
        });
@@ -1319,24 +1435,25 @@ void convert_to_sellp(std::shared_ptr<const DpcppExecutor> exec,
     const int slice_num = ceildiv(num_rows, slice_size);
 
     auto nnz_per_row = Array<size_type>(exec, num_rows);
-
     calculate_nonzeros_per_row(exec, source, &nnz_per_row);
 
     auto grid_dim = slice_num;
 
     if (grid_dim > 0) {
         kernel::calculate_slice_lengths_call(
-            cfg, grid_dim, sg_size, 0, exec->get_queue(), num_rows, slice_size,
-            slice_num, stride_factor, nnz_per_row.get_const_data(),
+            sg_size, grid_dim, sg_size, 0, exec->get_queue(), num_rows,
+            slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(),
             slice_lengths, slice_sets);
     }
 
+    components::prefix_sum(exec, slice_sets, slice_num + 1);
+
     grid_dim = ceildiv(num_rows, wg_size);
     if (grid_dim > 0) {
-        kernel::fill_in_sellp(grid_dim, wg_size, 0, exec->get_queue(), num_rows,
-                              num_cols, slice_size, stride,
-                              source->get_const_values(), slice_lengths,
-                              slice_sets, col_idxs, vals);
+        kernel::fill_in_sellp_call(cfg, grid_dim, wg_size, 0, exec->get_queue(),
+                                   num_rows, num_cols, slice_size, stride,
+                                   source->get_const_values(), slice_lengths,
+                                   slice_sets, col_idxs, vals);
     }
 }
@@ -1380,14 +1497,12 @@ void calculate_max_nnz_per_row(std::shared_ptr<const DpcppExecutor> exec,
     calculate_nonzeros_per_row(exec, source, &nnz_per_row);
     auto queue = exec->get_queue();
     constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-    const ConfigSetType cfg =
-        get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
            return validate(queue, KCFG_1D::decode<0>(cfg),
                            KCFG_1D::decode<1>(cfg));
        });
     const auto wg_size = KCFG_1D::decode<0>(cfg);
-    std::cout << "wg_size " << wg_size << "sg_size " << KCFG_1D::decode<1>(cfg)
-              << std::endl;
     const auto n = ceildiv(num_rows, wg_size);
     const size_type grid_dim = (n <= wg_size) ? n : wg_size;
@@ -1417,8 +1532,8 @@ void calculate_nonzeros_per_row(std::shared_ptr<const DpcppExecutor> exec,
 {
     auto queue = exec->get_queue();
     constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-    const ConfigSetType cfg =
-        get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
            return validate(queue, KCFG_1D::decode<0>(cfg),
                            KCFG_1D::decode<1>(cfg));
        });
@@ -1463,8 +1578,8 @@ void calculate_total_cols(std::shared_ptr<const DpcppExecutor> exec,
     auto max_nnz_per_slice = Array<size_type>(exec, slice_num);
     auto queue = exec->get_queue();
     constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-    const ConfigSetType cfg =
-        get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
            return validate(queue, KCFG_1D::decode<0>(cfg),
                            KCFG_1D::decode<1>(cfg));
        });
@@ -1502,27 +1617,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 template <typename ValueType>
 void transpose(std::shared_ptr<const DpcppExecutor> exec,
                const matrix::Dense<ValueType> *orig,
-               matrix::Dense<ValueType> *trans)
-{
-    // if (cublas::is_supported<ValueType>::value) {
-    //     auto handle = exec->get_cublas_handle();
-    //     {
-    //         cublas::pointer_mode_guard pm_guard(handle);
-    //         auto alpha = one<ValueType>();
-    //         auto beta = zero<ValueType>();
-    //         cublas::geam(
-    //             handle, oneapi::mkl::transpose::trans,
-    //             oneapi::mkl::transpose::nontrans, orig->get_size()[0],
-    //             orig->get_size()[1], &alpha, orig->get_const_values(),
-    //             orig->get_stride(), &beta,
-    //             static_cast<ValueType *>(nullptr), trans->get_size()[1],
-    //             trans->get_values(), trans->get_stride());
-    //     }
-    // } else {
-    //     GKO_NOT_IMPLEMENTED;
-    // }
-    GKO_NOT_IMPLEMENTED;
-};
+               matrix::Dense<ValueType> *trans) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
@@ -1530,27 +1625,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 template <typename ValueType>
 void conj_transpose(std::shared_ptr<const DpcppExecutor> exec,
                     const matrix::Dense<ValueType> *orig,
-                    matrix::Dense<ValueType> *trans)
-{
-    // if (cublas::is_supported<ValueType>::value) {
-    //     auto handle = exec->get_cublas_handle();
-    //     {
-    //         cublas::pointer_mode_guard pm_guard(handle);
-    //         auto alpha = one<ValueType>();
-    //         auto beta = zero<ValueType>();
-    //         cublas::geam(
-    //             handle, oneapi::mkl::transpose::conjtrans,
-    //             oneapi::mkl::transpose::nontrans, orig->get_size()[0],
-    //             orig->get_size()[1], &alpha, orig->get_const_values(),
-    //             orig->get_stride(), &beta,
-    //             static_cast<ValueType *>(nullptr), trans->get_size()[1],
-    //             trans->get_values(), trans->get_stride());
-    //     }
-    // } else {
-    //     GKO_NOT_IMPLEMENTED;
-    // }
-    GKO_NOT_IMPLEMENTED;
-}
+                    matrix::Dense<ValueType> *trans) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
diff --git a/dpcpp/test/components/prefix_sum.cpp b/dpcpp/test/components/prefix_sum.cpp
index 2ae72880443..402192d0b77 100644
--- a/dpcpp/test/components/prefix_sum.cpp
+++ b/dpcpp/test/components/prefix_sum.cpp
@@ -44,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
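
Background for the SELL-P bookkeeping touched in convert_to_sellp and calculate_total_cols above: each slice of slice_size rows is padded to its longest row (rounded up to a multiple of stride_factor), slice_sets is the prefix sum of those padded lengths (computed by the components::prefix_sum call added above), and total_cols is their sum. A plain C++ reference of that computation (illustrative only):

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        const std::size_t slice_size = 2, stride_factor = 1;
        const std::vector<std::size_t> nnz_per_row{3, 1, 4, 4, 2};

        const auto slice_num =
            (nnz_per_row.size() + slice_size - 1) / slice_size;
        std::vector<std::size_t> slice_sets(slice_num + 1, 0);
        std::size_t total_cols = 0;
        for (std::size_t s = 0; s < slice_num; ++s) {
            std::size_t len = 0;  // longest row in this slice
            for (std::size_t r = s * slice_size;
                 r < std::min((s + 1) * slice_size, nnz_per_row.size()); ++r) {
                len = std::max(len, nnz_per_row[r]);
            }
            // round up to a multiple of stride_factor, as the kernels do
            len = (len + stride_factor - 1) / stride_factor * stride_factor;
            slice_sets[s + 1] = slice_sets[s] + len;  // prefix sum
            total_cols += len;
        }
        std::cout << "total_cols = " << total_cols << "\n";  // 3 + 4 + 2 = 9
    }
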
#include "core/components/fill_array.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/test/utils.hpp" namespace { @@ -64,9 +65,12 @@ class Dense : public ::testing::Test { using vtype = double; #endif // GINKGO_DPCPP_SINGLE_MODE using Mtx = gko::matrix::Dense; + using MixedMtx = gko::matrix::Dense>; using NormVector = gko::matrix::Dense>; using Arr = gko::Array; - // using ComplexMtx = gko::matrix::Dense>; + using ComplexMtx = gko::matrix::Dense>; + using MixedComplexMtx = + gko::matrix::Dense>>; Dense() : rand_engine(15) {} @@ -116,15 +120,16 @@ class Dense : public ::testing::Test { void set_up_apply_data() { x = gen_mtx(65, 25); - // c_x = gen_mtx(65, 25); + c_x = gen_mtx(65, 25); y = gen_mtx(25, 35); expected = gen_mtx(65, 35); alpha = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); + square = gen_mtx(x->get_size()[0], x->get_size()[0]); dx = Mtx::create(dpcpp); dx->copy_from(x.get()); - // dc_x = ComplexMtx::create(dpcpp); - // dc_x->copy_from(c_x.get()); + dc_x = ComplexMtx::create(dpcpp); + dc_x->copy_from(c_x.get()); dy = Mtx::create(dpcpp); dy->copy_from(y.get()); dresult = Mtx::create(dpcpp); @@ -133,6 +138,8 @@ class Dense : public ::testing::Test { dalpha->copy_from(alpha.get()); dbeta = Mtx::create(dpcpp); dbeta->copy_from(beta.get()); + dsquare = Mtx::create(dpcpp); + dsquare->copy_from(square.get()); std::vector tmp(x->get_size()[0], 0); auto rng = std::default_random_engine{}; @@ -141,14 +148,25 @@ class Dense : public ::testing::Test { std::vector tmp2(x->get_size()[1], 0); std::iota(tmp2.begin(), tmp2.end(), 0); std::shuffle(tmp2.begin(), tmp2.end(), rng); + std::vector tmp3(x->get_size()[0] / 10); + std::uniform_int_distribution row_dist(0, x->get_size()[0] - 1); + for (auto &i : tmp3) { + i = row_dist(rng); + } rpermute_idxs = std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); - drpermute_idxs = - std::unique_ptr(new Arr{dpcpp, tmp.begin(), tmp.end()}); cpermute_idxs = std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); - dcpermute_idxs = - std::unique_ptr(new Arr{dpcpp, tmp2.begin(), tmp2.end()}); + rgather_idxs = + std::unique_ptr(new Arr{ref, tmp3.begin(), tmp3.end()}); + } + + template + std::unique_ptr convert(InputType &&input) + { + auto result = ConvertedType::create(input->get_executor()); + input->convert_to(result.get()); + return result; } std::shared_ptr ref; @@ -157,21 +175,22 @@ class Dense : public ::testing::Test { std::ranlux48 rand_engine; std::unique_ptr x; - // std::unique_ptr c_x; + std::unique_ptr c_x; std::unique_ptr y; std::unique_ptr alpha; std::unique_ptr beta; std::unique_ptr expected; + std::unique_ptr square; std::unique_ptr dresult; std::unique_ptr dx; - // std::unique_ptr dc_x; + std::unique_ptr dc_x; std::unique_ptr dy; std::unique_ptr dalpha; std::unique_ptr dbeta; + std::unique_ptr dsquare; std::unique_ptr rpermute_idxs; - std::unique_ptr drpermute_idxs; std::unique_ptr cpermute_idxs; - std::unique_ptr dcpermute_idxs; + std::unique_ptr rgather_idxs; }; @@ -201,7 +220,7 @@ TEST_F(Dense, DpcppStridedFillIsEquivalentToRef) dx->fill(42); result->copy_from(dx.get()); - GKO_ASSERT_MTX_NEAR(result, x, r::value); + GKO_ASSERT_MTX_NEAR(result, x, r::value); } @@ -318,6 +337,28 @@ TEST_F(Dense, MultipleVectorDpcppComputeDotIsEquivalentToRef) } +TEST_F(Dense, SingleVectorDpcppComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->compute_conj_dot(y.get(), expected.get()); + dx->compute_conj_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, 
+
+
+TEST_F(Dense, MultipleVectorDpcppComputeConjDotIsEquivalentToRef)
+{
+    set_up_vector_data(20);
+
+    x->compute_conj_dot(y.get(), expected.get());
+    dx->compute_conj_dot(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, r<vtype>::value);
+}
+
+
 TEST_F(Dense, DpcppComputeNorm2IsEquivalentToRef)
 {
     set_up_vector_data(20);
@@ -343,6 +384,23 @@ TEST_F(Dense, SimpleApplyIsEquivalentToRef)
 }
 
 
+#if !GINKGO_DPCPP_SINGLE_MODE
+
+
+TEST_F(Dense, SimpleApplyMixedIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    x->apply(convert<MixedMtx>(y).get(), convert<MixedMtx>(expected).get());
+    dx->apply(convert<MixedMtx>(dy).get(), convert<MixedMtx>(dresult).get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-7);
+}
+
+
+#endif  // !GINKGO_DPCPP_SINGLE_MODE
+
+
 TEST_F(Dense, AdvancedApplyIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -354,38 +412,142 @@ TEST_F(Dense, AdvancedApplyIsEquivalentToRef)
 }
 
 
-// TEST_F(Dense, ApplyToComplexIsEquivalentToRef)
-// {
-//     set_up_apply_data();
-//     auto complex_b = gen_mtx<ComplexMtx>(25, 1);
-//     auto dcomplex_b = ComplexMtx::create(dpcpp);
-//     dcomplex_b->copy_from(complex_b.get());
-//     auto complex_x = gen_mtx<ComplexMtx>(65, 1);
-//     auto dcomplex_x = ComplexMtx::create(dpcpp);
-//     dcomplex_x->copy_from(complex_x.get());
+#if !GINKGO_DPCPP_SINGLE_MODE
 
-//     x->apply(complex_b.get(), complex_x.get());
-//     dx->apply(dcomplex_b.get(), dcomplex_x.get());
 
-//     GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14);
-// }
+TEST_F(Dense, AdvancedApplyMixedIsEquivalentToRef)
+{
+    set_up_apply_data();
 
+    x->apply(convert<MixedMtx>(alpha).get(), convert<MixedMtx>(y).get(),
+             convert<MixedMtx>(beta).get(), convert<MixedMtx>(expected).get());
+    dx->apply(convert<MixedMtx>(dalpha).get(), convert<MixedMtx>(dy).get(),
+              convert<MixedMtx>(dbeta).get(),
+              convert<MixedMtx>(dresult).get());
 
-// TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef)
-// {
-//     set_up_apply_data();
-//     auto complex_b = gen_mtx<ComplexMtx>(25, 1);
-//     auto dcomplex_b = ComplexMtx::create(dpcpp);
-//     dcomplex_b->copy_from(complex_b.get());
-//     auto complex_x = gen_mtx<ComplexMtx>(65, 1);
-//     auto dcomplex_x = ComplexMtx::create(dpcpp);
-//     dcomplex_x->copy_from(complex_x.get());
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-7);
+}
 
-//     x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get());
-//     dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get());
 
-//     GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14);
-// }
+#endif  // !GINKGO_DPCPP_SINGLE_MODE
+
+
+TEST_F(Dense, ApplyToComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto complex_b = gen_mtx<ComplexMtx>(25, 1);
+    auto dcomplex_b = ComplexMtx::create(dpcpp);
+    dcomplex_b->copy_from(complex_b.get());
+    auto complex_x = gen_mtx<ComplexMtx>(65, 1);
+    auto dcomplex_x = ComplexMtx::create(dpcpp);
+    dcomplex_x->copy_from(complex_x.get());
+
+    x->apply(complex_b.get(), complex_x.get());
+    dx->apply(dcomplex_b.get(), dcomplex_x.get());
+
+    GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, r<vtype>::value);
+}
+
+
+#if !GINKGO_DPCPP_SINGLE_MODE
+
+
+TEST_F(Dense, ApplyToMixedComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto complex_b = gen_mtx<MixedComplexMtx>(25, 1);
+    auto dcomplex_b = MixedComplexMtx::create(dpcpp);
+    dcomplex_b->copy_from(complex_b.get());
+    auto complex_x = gen_mtx<MixedComplexMtx>(65, 1);
+    auto dcomplex_x = MixedComplexMtx::create(dpcpp);
+    dcomplex_x->copy_from(complex_x.get());
+
+    x->apply(complex_b.get(), complex_x.get());
+    dx->apply(dcomplex_b.get(), dcomplex_x.get());
+
+    GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7);
+}
+
+
+#endif  // !GINKGO_DPCPP_SINGLE_MODE
+
+
+TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto complex_b = gen_mtx<ComplexMtx>(25, 1);
+    auto dcomplex_b = ComplexMtx::create(dpcpp);
+    dcomplex_b->copy_from(complex_b.get());
+    auto complex_x = gen_mtx<ComplexMtx>(65, 1);
+    auto dcomplex_x = ComplexMtx::create(dpcpp);
+    dcomplex_x->copy_from(complex_x.get());
+
+    x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get());
+    dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get());
+
+    GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, r<vtype>::value);
+}
+
+
+#if !GINKGO_DPCPP_SINGLE_MODE
+
+
+TEST_F(Dense, AdvancedApplyToMixedComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto complex_b = gen_mtx<MixedComplexMtx>(25, 1);
+    auto dcomplex_b = MixedComplexMtx::create(dpcpp);
+    dcomplex_b->copy_from(complex_b.get());
+    auto complex_x = gen_mtx<MixedComplexMtx>(65, 1);
+    auto dcomplex_x = MixedComplexMtx::create(dpcpp);
+    dcomplex_x->copy_from(complex_x.get());
+
+    x->apply(convert<MixedMtx>(alpha).get(), complex_b.get(),
+             convert<MixedMtx>(beta).get(), complex_x.get());
+    dx->apply(convert<MixedMtx>(dalpha).get(), dcomplex_b.get(),
+              convert<MixedMtx>(dbeta).get(), dcomplex_x.get());
+
+    GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7);
+}
+
+
+#endif  // !GINKGO_DPCPP_SINGLE_MODE
+
+
+TEST_F(Dense, ComputeDotComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto complex_b = gen_mtx<ComplexMtx>(1234, 2);
+    auto dcomplex_b = ComplexMtx::create(dpcpp);
+    dcomplex_b->copy_from(complex_b.get());
+    auto complex_x = gen_mtx<ComplexMtx>(1234, 2);
+    auto dcomplex_x = ComplexMtx::create(dpcpp);
+    dcomplex_x->copy_from(complex_x.get());
+    auto result = ComplexMtx::create(ref, gko::dim<2>{1, 2});
+    auto dresult = ComplexMtx::create(dpcpp, gko::dim<2>{1, 2});
+
+    complex_b->compute_dot(complex_x.get(), result.get());
+    dcomplex_b->compute_dot(dcomplex_x.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(result, dresult, r<vtype>::value);
+}
+
+
+TEST_F(Dense, ComputeConjDotComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto complex_b = gen_mtx<ComplexMtx>(1234, 2);
+    auto dcomplex_b = ComplexMtx::create(dpcpp);
+    dcomplex_b->copy_from(complex_b.get());
+    auto complex_x = gen_mtx<ComplexMtx>(1234, 2);
+    auto dcomplex_x = ComplexMtx::create(dpcpp);
+    dcomplex_x->copy_from(complex_x.get());
+    auto result = ComplexMtx::create(ref, gko::dim<2>{1, 2});
+    auto dresult = ComplexMtx::create(dpcpp, gko::dim<2>{1, 2});
+
+    complex_b->compute_conj_dot(complex_x.get(), result.get());
+    dcomplex_b->compute_conj_dot(dcomplex_x.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(result, dresult, r<vtype>::value);
+}
 
 
 // TEST_F(Dense, IsTransposable)
@@ -494,42 +656,42 @@ TEST_F(Dense, MoveToEllIsEquivalentToRef)
 }
 
 
-// TEST_F(Dense, ConvertToSellpIsEquivalentToRef)
-// {
-//     set_up_apply_data();
-//     auto sellp_mtx = gko::matrix::Sellp<>::create(ref);
-//     auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp);
+TEST_F(Dense, ConvertToSellpIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto sellp_mtx = gko::matrix::Sellp<vtype>::create(ref);
+    auto dsellp_mtx = gko::matrix::Sellp<vtype>::create(dpcpp);
 
-//     x->convert_to(sellp_mtx.get());
-//     dx->convert_to(dsellp_mtx.get());
+    x->convert_to(sellp_mtx.get());
+    dx->convert_to(dsellp_mtx.get());
 
-//     GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-6);
-// }
+    GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r<vtype>::value);
+}
 
 
-// TEST_F(Dense, MoveToSellpIsEquivalentToRef)
-// {
-//     set_up_apply_data();
-//     auto sellp_mtx = gko::matrix::Sellp<>::create(ref);
-//     auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp);
+TEST_F(Dense, MoveToSellpIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto sellp_mtx = gko::matrix::Sellp<vtype>::create(ref);
+    auto dsellp_mtx = gko::matrix::Sellp<vtype>::create(dpcpp);
 
-//     x->move_to(sellp_mtx.get());
+    x->move_to(sellp_mtx.get());
+    dx->move_to(dsellp_mtx.get());
 
-//     GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-6);
-// }
+    GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r<vtype>::value);
+}
 
 
-// TEST_F(Dense, ConvertsEmptyToSellp)
-// {
-//     auto dempty_mtx = Mtx::create(dpcpp);
-//     auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp);
+TEST_F(Dense, ConvertsEmptyToSellp)
+{
+    auto dempty_mtx = Mtx::create(dpcpp);
+    auto dsellp_mtx = gko::matrix::Sellp<vtype>::create(dpcpp);
 
-//     dempty_mtx->convert_to(dsellp_mtx.get());
+    dempty_mtx->convert_to(dsellp_mtx.get());
 
-//     ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()),
-//     0); ASSERT_FALSE(dsellp_mtx->get_size());
-// }
+    ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0);
+    ASSERT_FALSE(dsellp_mtx->get_size());
+}
 
 
 TEST_F(Dense, CountNNZIsEquivalentToRef)
@@ -595,12 +757,63 @@ TEST_F(Dense, CalculateTotalColsIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, CanGatherRows)
+{
+    set_up_apply_data();
+
+    auto r_gather = x->row_gather(rgather_idxs.get());
+    auto dr_gather = dx->row_gather(rgather_idxs.get());
+
+    GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0);
+}
+
+
+TEST_F(Dense, CanGatherRowsIntoDense)
+{
+    set_up_apply_data();
+    auto gather_size =
+        gko::dim<2>{rgather_idxs->get_num_elems(), x->get_size()[1]};
+    auto r_gather = Mtx::create(ref, gather_size);
+    // test make_temporary_clone and non-default stride
+    auto dr_gather = Mtx::create(ref, gather_size, x->get_size()[1] + 2);
+
+    x->row_gather(rgather_idxs.get(), r_gather.get());
+    dx->row_gather(rgather_idxs.get(), dr_gather.get());
+
+    GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0);
+}
+
+
+TEST_F(Dense, IsPermutable)
+{
+    set_up_apply_data();
+
+    auto permuted = square->permute(rpermute_idxs.get());
+    auto dpermuted = dsquare->permute(rpermute_idxs.get());
+
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(permuted.get()),
+                        static_cast<Mtx *>(dpermuted.get()), 0);
+}
+
+
+TEST_F(Dense, IsInversePermutable)
+{
+    set_up_apply_data();
+
+    auto permuted = square->inverse_permute(rpermute_idxs.get());
+    auto dpermuted = dsquare->inverse_permute(rpermute_idxs.get());
+
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(permuted.get()),
+                        static_cast<Mtx *>(dpermuted.get()), 0);
+}
+
+
 TEST_F(Dense, IsRowPermutable)
 {
     set_up_apply_data();
 
     auto r_permute = x->row_permute(rpermute_idxs.get());
-    auto dr_permute = dx->row_permute(drpermute_idxs.get());
+    auto dr_permute = dx->row_permute(rpermute_idxs.get());
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(r_permute.get()),
                         static_cast<Mtx *>(dr_permute.get()), 0);
@@ -612,7 +825,7 @@ TEST_F(Dense, IsColPermutable)
     set_up_apply_data();
 
     auto c_permute = x->column_permute(cpermute_idxs.get());
-    auto dc_permute = dx->column_permute(dcpermute_idxs.get());
+    auto dc_permute = dx->column_permute(cpermute_idxs.get());
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(c_permute.get()),
                         static_cast<Mtx *>(dc_permute.get()), 0);
@@ -624,7 +837,7 @@ TEST_F(Dense, IsInverseRowPermutable)
     set_up_apply_data();
 
     auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get());
-    auto d_inverse_r_permute = dx->inverse_row_permute(drpermute_idxs.get());
+    auto d_inverse_r_permute = dx->inverse_row_permute(rpermute_idxs.get());
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_r_permute.get()),
                        static_cast<Mtx *>(d_inverse_r_permute.get()), 0);
@@ -636,14 +849,14 @@ TEST_F(Dense, IsInverseColPermutable)
     set_up_apply_data();
 
     auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get());
-    auto d_inverse_c_permute = dx->inverse_column_permute(dcpermute_idxs.get());
+    auto d_inverse_c_permute = dx->inverse_column_permute(cpermute_idxs.get());
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_c_permute.get()),
                         static_cast<Mtx *>(d_inverse_c_permute.get()), 0);
 }
 
 
-TEST_F(Dense, ExtractDiagonalIsEquivalentToRef)
+TEST_F(Dense, ExtractDiagonalOnTallSkinnyIsEquivalentToRef)
 {
     set_up_apply_data();
 
@@ -654,6 +867,17 @@ TEST_F(Dense, ExtractDiagonalOnTallSkinnyIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, ExtractDiagonalOnShortFatIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto diag = y->extract_diagonal();
+    auto ddiag = dy->extract_diagonal();
+
+    GKO_ASSERT_MTX_NEAR(diag.get(), ddiag.get(), 0);
+}
+
+
 TEST_F(Dense, InplaceAbsoluteMatrixIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -676,4 +900,76 @@ TEST_F(Dense, OutplaceAbsoluteMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, MakeComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto complex_x = x->make_complex();
+    auto dcomplex_x = dx->make_complex();
+
+    GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0);
+}
+
+
+TEST_F(Dense, MakeComplexWithGivenResultIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto complex_x = ComplexMtx::create(ref, x->get_size());
+    x->make_complex(complex_x.get());
+    auto dcomplex_x = ComplexMtx::create(dpcpp, x->get_size());
+    dx->make_complex(dcomplex_x.get());
+
+    GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0);
+}
+
+
+TEST_F(Dense, GetRealIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto real_x = x->get_real();
+    auto dreal_x = dx->get_real();
+
+    GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0);
+}
+
+
+TEST_F(Dense, GetRealWithGivenResultIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto real_x = Mtx::create(ref, x->get_size());
+    x->get_real(real_x.get());
+    auto dreal_x = Mtx::create(dpcpp, dx->get_size());
+    dx->get_real(dreal_x.get());
+
+    GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0);
+}
+
+
+TEST_F(Dense, GetImagIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto imag_x = x->get_imag();
+    auto dimag_x = dx->get_imag();
+
+    GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0);
+}
+
+
+TEST_F(Dense, GetImagWithGivenResultIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto imag_x = Mtx::create(ref, x->get_size());
+    x->get_imag(imag_x.get());
+    auto dimag_x = Mtx::create(dpcpp, dx->get_size());
+    dx->get_imag(dimag_x.get());
+
+    GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0);
+}
+
+
 }  // namespace
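
The make_complex/get_real/get_imag tests above compare with tolerance 0, i.e. these operations are exact componentwise maps: make_complex pairs every real entry with a zero imaginary part, and get_real/get_imag project back. Reference semantics in scalar form (an illustrative sketch only, not Ginkgo's implementation):

    #include <cassert>
    #include <complex>
    #include <cstddef>
    #include <vector>

    int main()
    {
        std::vector<double> x{1.0, -2.5, 3.0};

        // make_complex: real -> complex with zero imaginary part
        std::vector<std::complex<double>> cx;
        for (auto v : x) {
            cx.emplace_back(v, 0.0);
        }

        // get_real / get_imag: exact componentwise projections
        for (std::size_t i = 0; i < x.size(); ++i) {
            assert(cx[i].real() == x[i]);
            assert(cx[i].imag() == 0.0);
        }
        return 0;
    }
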