diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
new file mode 100644
index 00000000000..f18d0aafc88
--- /dev/null
+++ b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
@@ -0,0 +1,110 @@
+/*************************************************************
+Copyright (c) 2017-2022, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+namespace kernel {
+
+
+template <size_type subwarp_size, typename MatrixValueType,
+          typename input_accessor, typename output_accessor,
+          typename IndexType, typename Closure>
+__device__ void device_classical_spmv(const size_type num_rows,
+                                      const MatrixValueType* __restrict__ val,
+                                      const IndexType* __restrict__ col_idxs,
+                                      const IndexType* __restrict__ row_ptrs,
+                                      acc::range<input_accessor> b,
+                                      acc::range<output_accessor> c,
+                                      Closure scale)
+{
+    using arithmetic_type = typename output_accessor::arithmetic_type;
+    auto subwarp_tile =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const auto subrow = thread::get_subwarp_num_flat<subwarp_size, IndexType>();
+    const auto subid = subwarp_tile.thread_rank();
+    const IndexType column_id = blockIdx.y;
+    const arithmetic_type value = val[0];
+    auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    for (; row < num_rows; row += subrow) {
+        const auto ind_end = row_ptrs[row + 1];
+        arithmetic_type temp_val = zero<arithmetic_type>();
+        for (auto ind = row_ptrs[row] + subid; ind < ind_end;
+             ind += subwarp_size) {
+            temp_val += value * b(col_idxs[ind], column_id);
+        }
+        auto subwarp_result =
+            reduce(subwarp_tile, temp_val,
+                   [](const arithmetic_type& a, const arithmetic_type& b) {
+                       return a + b;
+                   });
+        if (subid == 0) {
+            c(row, column_id) = scale(subwarp_result, c(row, column_id));
+        }
+    }
+}
+
+
+template <size_type subwarp_size, typename MatrixValueType,
+          typename input_accessor, typename output_accessor,
+          typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
+    const size_type num_rows, const MatrixValueType* __restrict__ val,
+    const IndexType* __restrict__ col_idxs,
+    const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
+    acc::range<output_accessor> c)
+{
+    using type = typename output_accessor::arithmetic_type;
+    device_classical_spmv<subwarp_size>(
+        num_rows, val, col_idxs, row_ptrs, b, c,
+        [](const type& x, const type& y) { return x; });
+}
+
+
+template <size_type subwarp_size, typename MatrixValueType,
+          typename input_accessor, typename output_accessor,
+          typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
+    const size_type num_rows, const MatrixValueType* __restrict__ alpha,
+    const MatrixValueType* __restrict__ val,
+    const IndexType* __restrict__ col_idxs,
+    const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
+    const typename output_accessor::storage_type* __restrict__ beta,
+    acc::range<output_accessor> c)
+{
+    using type = typename output_accessor::arithmetic_type;
+    const type alpha_val = alpha[0];
+    const type beta_val = beta[0];
+    device_classical_spmv<subwarp_size>(
+        num_rows, val, col_idxs, row_ptrs, b, c,
+        [&alpha_val, &beta_val](const type& x, const type& y) {
+            return alpha_val * x + beta_val * y;
+        });
+}
+
+
+} // namespace kernel
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 2f7e7e9fe2a..5ffb7aad173 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -452,8 +452,9 @@ GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 namespace sparsity_csr {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_COUNT_NUM_DIAGONAL_ELEMENTS_KERNEL);
diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp
index 6ef49c6f1fc..6e3ecc5c319 100644
--- a/core/matrix/sparsity_csr.cpp
+++ b/core/matrix/sparsity_csr.cpp
@@ -76,7 +76,7 @@ template <typename ValueType, typename IndexType>
 void SparsityCsr<ValueType, IndexType>::apply_impl(const LinOp* b,
                                                    LinOp* x) const
 {
-    precision_dispatch_real_complex<ValueType>(
+
mixed_precision_dispatch_real_complex( [this](auto dense_b, auto dense_x) { this->get_executor()->run( sparsity_csr::make_spmv(this, dense_b, dense_x)); @@ -91,12 +91,15 @@ void SparsityCsr::apply_impl(const LinOp* alpha, const LinOp* beta, LinOp* x) const { - precision_dispatch_real_complex( - [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { + mixed_precision_dispatch_real_complex( + [this, alpha, beta](auto dense_b, auto dense_x) { + auto dense_alpha = make_temporary_conversion(alpha); + auto dense_beta = make_temporary_conversion< + typename std::decay_t::value_type>(beta); this->get_executor()->run(sparsity_csr::make_advanced_spmv( - dense_alpha, this, dense_b, dense_beta, dense_x)); + dense_alpha.get(), this, dense_b, dense_beta.get(), dense_x)); }, - alpha, b, beta, x); + b, x); } diff --git a/core/matrix/sparsity_csr_kernels.hpp b/core/matrix/sparsity_csr_kernels.hpp index 5bb81585230..6933b61313d 100644 --- a/core/matrix/sparsity_csr_kernels.hpp +++ b/core/matrix/sparsity_csr_kernels.hpp @@ -48,18 +48,22 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(ValueType, IndexType) \ - void spmv(std::shared_ptr exec, \ - const matrix::SparsityCsr* a, \ - const matrix::Dense* b, matrix::Dense* c) - -#define GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType) \ - void advanced_spmv(std::shared_ptr exec, \ - const matrix::Dense* alpha, \ - const matrix::SparsityCsr* a, \ - const matrix::Dense* b, \ - const matrix::Dense* beta, \ - matrix::Dense* c) +#define GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(MatrixValueType, InputValueType, \ + OutputValueType, IndexType) \ + void spmv(std::shared_ptr exec, \ + const matrix::SparsityCsr* a, \ + const matrix::Dense* b, \ + matrix::Dense* c) + +#define GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL( \ + MatrixValueType, InputValueType, OutputValueType, IndexType) \ + void advanced_spmv( \ + std::shared_ptr exec, \ + const matrix::Dense* alpha, \ + const matrix::SparsityCsr* a, \ + const matrix::Dense* b, \ + const matrix::Dense* beta, \ + matrix::Dense* c) #define GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType) \ void fill_in_dense(std::shared_ptr exec, \ @@ -98,10 +102,14 @@ namespace kernels { bool* is_sorted) #define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(MatrixValueType, InputValueType, \ + OutputValueType, IndexType); \ + template \ + GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL( \ + MatrixValueType, InputValueType, OutputValueType, IndexType); \ template \ GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ template \ diff --git a/core/multigrid/amgx_pgm.cpp b/core/multigrid/amgx_pgm.cpp index 4233e45a2ff..dfc25749a95 100644 --- a/core/multigrid/amgx_pgm.cpp +++ b/core/multigrid/amgx_pgm.cpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "core/base/utils.hpp" @@ -161,6 +162,16 @@ void AmgxPgm::generate() one())); // TODO: implement the restrict_op from aggregation. 
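The hunk below builds the AMGX-PGM restriction operator as a SparsityCsr that shares the transposed prolongation's pattern: only row pointers and column indices are copied, and the single stored value of 1 stands in for a full value array. A minimal sketch of the same pattern-copy idea, assuming a hypothetical helper `pattern_of` and an existing Csr matrix `csr` (neither is part of the patch):

```cpp
#include <ginkgo/ginkgo.hpp>

// Hypothetical helper: build a SparsityCsr mirroring the sparsity pattern of
// an existing Csr matrix. Mirrors the copy_from calls used in the hunk below.
template <typename ValueType, typename IndexType>
std::shared_ptr<gko::matrix::SparsityCsr<ValueType, IndexType>> pattern_of(
    std::shared_ptr<const gko::Executor> exec,
    const gko::matrix::Csr<ValueType, IndexType>* csr)
{
    auto pattern =
        gko::share(gko::matrix::SparsityCsr<ValueType, IndexType>::create(
            exec, csr->get_size(), csr->get_num_stored_elements()));
    // copy row pointers and column indices; the single stored value stays 1
    exec->copy_from(exec.get(), csr->get_size()[0] + 1,
                    csr->get_const_row_ptrs(), pattern->get_row_ptrs());
    exec->copy_from(exec.get(), csr->get_num_stored_elements(),
                    csr->get_const_col_idxs(), pattern->get_col_idxs());
    return pattern;
}
```

Since an aggregation-based restriction has only unit entries, storing the pattern alone loses no information while saving one value array of size nnz and enabling the cheaper SparsityCsr SpMV.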
auto restrict_op = gko::as(share(prolong_csr->transpose())); + auto restrict_sparsity = + share(matrix::SparsityCsr::create( + exec, restrict_op->get_size(), + restrict_op->get_num_stored_elements())); + exec->copy_from(exec.get(), static_cast(num_agg + 1), + restrict_op->get_const_row_ptrs(), + restrict_sparsity->get_row_ptrs()); + exec->copy_from(exec.get(), restrict_op->get_num_stored_elements(), + restrict_op->get_const_col_idxs(), + restrict_sparsity->get_col_idxs()); // Construct the coarse matrix // TODO: use less memory footprint to improve it @@ -170,7 +181,8 @@ void AmgxPgm::generate() amgxpgm_op->apply(prolong_csr.get(), tmp.get()); restrict_op->apply(tmp.get(), coarse_matrix.get()); - this->set_multigrid_level(prolong_row_gather, coarse_matrix, restrict_op); + this->set_multigrid_level(prolong_row_gather, coarse_matrix, + restrict_sparsity); } diff --git a/core/test/matrix/fbcsr_sample.hpp b/core/test/matrix/fbcsr_sample.hpp index f8b09fa701c..0cb410c2bd7 100644 --- a/core/test/matrix/fbcsr_sample.hpp +++ b/core/test/matrix/fbcsr_sample.hpp @@ -114,10 +114,11 @@ class FbcsrSample { static_cast(bs)}, v); - if (mtx->get_size()[0] % bs != 0) + if (mtx->get_size()[0] % bs != 0) { throw gko::BadDimension(__FILE__, __LINE__, __func__, "test fbcsr", mtx->get_size()[0], mtx->get_size()[1], "block size does not divide the size!"); + } for (index_type ibrow = 0; ibrow < mtx->get_num_block_rows(); ibrow++) { const index_type* const browptr = mtx->get_row_ptrs(); @@ -125,11 +126,13 @@ class FbcsrSample { inz++) { const index_type bcolind = mtx->get_col_idxs()[inz]; const value_type base = (ibrow + 1) * (bcolind + 1); - for (int ival = 0; ival < bs; ival++) - for (int jval = 0; jval < bs; jval++) + for (int ival = 0; ival < bs; ival++) { + for (int jval = 0; jval < bs; jval++) { vals(inz, ival, jval) = base + static_cast>( ival * bs + jval); + } + } } } @@ -175,10 +178,12 @@ class FbcsrSample { gko::Array colids(exec, nbnz); gko::Array rowptrs(exec, nbrows + 1); const std::unique_ptr fbmat = generate_fbcsr(); - for (index_type i = 0; i < nbrows + 1; i++) + for (index_type i = 0; i < nbrows + 1; i++) { rowptrs.get_data()[i] = fbmat->get_const_row_ptrs()[i]; - for (index_type i = 0; i < nbnz; i++) + } + for (index_type i = 0; i < nbnz; i++) { colids.get_data()[i] = fbmat->get_const_col_idxs()[i]; + } return SparCsr::create(exec, gko::dim<2>{nbrows, nbcols}, colids, rowptrs); } @@ -302,7 +307,9 @@ class FbcsrSample2 { gko::Array c(exec, {0, 0, 3, 2}); gko::Array vals(exec, nnz); value_type* const v = vals.get_data(); - for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset; + for (IndexType i = 0; i < nnz; i++) { + v[i] = 0.15 + fbcsr_test_offset; + } v[0] = 1; v[1] = 3; @@ -328,7 +335,9 @@ class FbcsrSample2 { exec, {0, 1, 0, 1, 0, 1, 6, 7, 0, 1, 6, 7, 4, 5, 4, 5}); gko::Array vals(exec, nnz); value_type* const v = vals.get_data(); - for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset; + for (IndexType i = 0; i < nnz; i++) { + v[i] = 0.15 + fbcsr_test_offset; + } v[0] = 1; v[1] = 2; v[2] = 3; @@ -401,7 +410,9 @@ class FbcsrSampleSquare { gko::Array r(exec, {0, 1, 2}); gko::Array vals(exec, nnz); value_type* const v = vals.get_data(); - for (IndexType i = 0; i < nnz; i++) v[i] = i; + for (IndexType i = 0; i < nnz; i++) { + v[i] = i; + } return Fbcsr::create(exec, gko::dim<2>{static_cast(nrows), @@ -447,7 +458,9 @@ class FbcsrSampleComplex { gko::Array c(exec, {0, 0, 3, 2}); gko::Array vals(exec, nnz); value_type* const v = vals.get_data(); - for (IndexType i = 0; i 
< nnz; i++) v[i] = 0.15 + fbcsr_test_offset; + for (IndexType i = 0; i < nnz; i++) { + v[i] = 0.15 + fbcsr_test_offset; + } using namespace std::complex_literals; v[0] = 1.0 + 1.15i; @@ -474,7 +487,9 @@ class FbcsrSampleComplex { exec, {0, 1, 0, 1, 0, 1, 6, 7, 0, 1, 6, 7, 4, 5, 4, 5}); gko::Array vals(exec, nnz); value_type* const v = vals.get_data(); - for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset; + for (IndexType i = 0; i < nnz; i++) { + v[i] = 0.15 + fbcsr_test_offset; + } using namespace std::complex_literals; v[0] = 1.0 + 1.15i; diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu index 30a778572ad..71ac5d9c664 100644 --- a/cuda/matrix/sparsity_csr_kernels.cu +++ b/cuda/matrix/sparsity_csr_kernels.cu @@ -36,6 +36,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "accessor/cuda_helper.hpp" +#include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/math.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" + + namespace gko { namespace kernels { namespace cuda { @@ -47,25 +60,115 @@ namespace cuda { namespace sparsity_csr { -template +constexpr int classical_overweight = 32; +constexpr int spmv_block_size = 128; +constexpr int warps_in_block = 4; + + +using classical_kernels = syn::value_list; + + +#include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc" + + +namespace host_kernel { + + +template +void classical_spmv(syn::value_list, + std::shared_ptr exec, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) +{ + using arithmetic_type = + highest_precision; + using input_accessor = + gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; + using output_accessor = + gko::acc::reduced_row_major<2, arithmetic_type, OutputValueType>; + + const auto nwarps = exec->get_num_warps_per_sm() * + exec->get_num_multiprocessor() * classical_overweight; + const auto gridx = + std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), + int64(nwarps / warps_in_block)); + const dim3 grid(gridx, b->get_size()[1]); + const auto block = spmv_block_size; + + const auto b_vals = gko::acc::range( + std::array{ + {static_cast(b->get_size()[0]), + static_cast(b->get_size()[1])}}, + b->get_const_values(), + std::array{ + {static_cast(b->get_stride())}}); + auto c_vals = gko::acc::range( + std::array{ + {static_cast(c->get_size()[0]), + static_cast(c->get_size()[1])}}, + c->get_values(), + std::array{ + {static_cast(c->get_stride())}}); + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (alpha == nullptr && beta == nullptr) { + kernel::abstract_classical_spmv<<>>( + a->get_size()[0], as_cuda_type(a->get_const_value()), + a->get_const_col_idxs(), as_cuda_type(a->get_const_row_ptrs()), + acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals)); + } else if (alpha != nullptr && beta != nullptr) { + kernel::abstract_classical_spmv<<>>( + a->get_size()[0], as_cuda_type(alpha->get_const_values()), + as_cuda_type(a->get_const_value()), a->get_const_col_idxs(), + as_cuda_type(a->get_const_row_ptrs()), 
acc::as_cuda_range(b_vals), + as_cuda_type(beta->get_const_values()), acc::as_cuda_range(c_vals)); + } else { + GKO_KERNEL_NOT_FOUND; + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); + + +} // namespace host_kernel + +template void spmv(std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c) +{ + host_kernel::select_classical_spmv( + classical_kernels(), [](int compiled_info) { return true; }, + syn::value_list(), syn::type_list<>(), exec, a, b, c); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + const matrix::Dense* alpha, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + const matrix::Dense* beta, + matrix::Dense* c) +{ + host_kernel::select_classical_spmv( + classical_kernels(), [](int compiled_info) { return true; }, + syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, beta); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); diff --git a/cuda/test/matrix/CMakeLists.txt b/cuda/test/matrix/CMakeLists.txt index be2ebd7ef03..0c0442f61e0 100644 --- a/cuda/test/matrix/CMakeLists.txt +++ b/cuda/test/matrix/CMakeLists.txt @@ -1,9 +1,9 @@ ginkgo_create_test(coo_kernels) ginkgo_create_test(csr_kernels) -ginkgo_create_test(fbcsr_kernels) ginkgo_create_test(dense_kernels) ginkgo_create_test(diagonal_kernels) ginkgo_create_test(ell_kernels) +ginkgo_create_test(fbcsr_kernels) ginkgo_create_test(fft_kernels) ginkgo_create_test(hybrid_kernels) ginkgo_create_test(sellp_kernels) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 2a7120ffa0b..97c01e0fac5 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -237,11 +237,14 @@ void simple_apply(std::shared_ptr exec, matrix::Dense* c) { using namespace oneapi::mkl; - oneapi::mkl::blas::row_major::gemm( - *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], one(), - a->get_const_values(), a->get_stride(), b->get_const_values(), - b->get_stride(), zero(), c->get_values(), c->get_stride()); + if (b->get_stride() != 0 && c->get_stride() != 0) { + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, transpose::nontrans, + c->get_size()[0], c->get_size()[1], a->get_size()[1], + one(), a->get_const_values(), a->get_stride(), + b->get_const_values(), b->get_stride(), zero(), + c->get_values(), c->get_stride()); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); @@ -254,13 +257,15 @@ void apply(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { using namespace oneapi::mkl; - oneapi::mkl::blas::row_major::gemm( - *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], - exec->copy_val_to_host(alpha->get_const_values()), - a->get_const_values(), a->get_stride(), b->get_const_values(), - b->get_stride(), exec->copy_val_to_host(beta->get_const_values()), - c->get_values(), 
c->get_stride()); + if (b->get_stride() != 0 && c->get_stride() != 0) { + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, transpose::nontrans, + c->get_size()[0], c->get_size()[1], a->get_size()[1], + exec->copy_val_to_host(alpha->get_const_values()), + a->get_const_values(), a->get_stride(), b->get_const_values(), + b->get_stride(), exec->copy_val_to_host(beta->get_const_values()), + c->get_values(), c->get_stride()); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp index 81ffcb87cd3..61610538a1c 100644 --- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp +++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp @@ -33,9 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/sparsity_csr_kernels.hpp" +#include + + #include +#include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/uninitialized_array.hpp" + + namespace gko { namespace kernels { namespace dpcpp { @@ -47,25 +61,232 @@ namespace dpcpp { namespace sparsity_csr { -template +constexpr int classical_overweight = 32; +constexpr int spmv_block_size = 128; + + +using classical_kernels = syn::value_list; + + +namespace kernel { + + +template +void device_classical_spmv(const size_type num_rows, + const MatrixValueType* __restrict__ val, + const IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ row_ptrs, + acc::range b, + acc::range c, Closure scale, + sycl::nd_item<3> item_ct1) +{ + using arithmetic_type = typename output_accessor::arithmetic_type; + auto subgroup_tile = group::tiled_partition( + group::this_thread_block(item_ct1)); + const auto subrow = thread::get_subwarp_num_flat(item_ct1); + const auto subid = subgroup_tile.thread_rank(); + const IndexType column_id = item_ct1.get_group(1); + const arithmetic_type value = static_cast(val[0]); + auto row = thread::get_subwarp_id_flat(item_ct1); + for (; row < num_rows; row += subrow) { + const auto ind_end = row_ptrs[row + 1]; + arithmetic_type temp_val = zero(); + for (auto ind = row_ptrs[row] + subid; ind < ind_end; + ind += subgroup_size) { + temp_val += value * b(col_idxs[ind], column_id); + } + auto subgroup_result = ::gko::kernels::dpcpp::reduce( + subgroup_tile, temp_val, + [](const arithmetic_type& a, const arithmetic_type& b) { + return a + b; + }); + // TODO: check the barrier + subgroup_tile.sync(); + if (subid == 0) { + c(row, column_id) = scale(subgroup_result, c(row, column_id)); + } + } +} + + +template +void abstract_classical_spmv(const size_type num_rows, + const MatrixValueType* __restrict__ val, + const IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ row_ptrs, + acc::range b, + acc::range c, + sycl::nd_item<3> item_ct1) +{ + using type = typename output_accessor::arithmetic_type; + device_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, c, + [](const type& x, const type& y) { return x; }, item_ct1); +} + +template +void abstract_classical_spmv( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const size_type num_rows, const MatrixValueType* val, + const IndexType* col_idxs, const 
IndexType* row_ptrs, + acc::range b, acc::range c) +{ + // only subgroup = 1, so does not need sycl::reqd_sub_group_size + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + abstract_classical_spmv(num_rows, val, col_idxs, + row_ptrs, b, c, item_ct1); + }); +} + + +template +void abstract_classical_spmv( + const size_type num_rows, const MatrixValueType* __restrict__ alpha, + const MatrixValueType* __restrict__ val, + const IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ row_ptrs, acc::range b, + const typename output_accessor::storage_type* __restrict__ beta, + acc::range c, sycl::nd_item<3> item_ct1) +{ + using type = typename output_accessor::arithmetic_type; + const type alpha_val = static_cast(alpha[0]); + const type beta_val = static_cast(beta[0]); + device_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, c, + [&alpha_val, &beta_val](const type& x, const type& y) { + return alpha_val * x + beta_val * y; + }, + item_ct1); +} + +template +void abstract_classical_spmv( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const size_type num_rows, const MatrixValueType* alpha, + const MatrixValueType* val, const IndexType* col_idxs, + const IndexType* row_ptrs, acc::range b, + const typename output_accessor::storage_type* beta, + acc::range c) +{ + // only subgroup = 1, so does not need sycl::reqd_sub_group_size + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, beta, c, item_ct1); + }); +} + + +} // namespace kernel + + +namespace host_kernel { + + +template +void classical_spmv(syn::value_list, + std::shared_ptr exec, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) +{ + using arithmetic_type = + highest_precision; + using input_accessor = + gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; + using output_accessor = + gko::acc::reduced_row_major<2, arithmetic_type, OutputValueType>; + constexpr int threads_per_cu = 7; + const auto num_subgroup = + exec->get_num_computing_units() * threads_per_cu * classical_overweight; + const auto nsg_in_group = spmv_block_size / subgroup_size; + const auto gridx = + std::min(ceildiv(a->get_size()[0], spmv_block_size / subgroup_size), + int64(num_subgroup / nsg_in_group)); + const dim3 grid(gridx, b->get_size()[1]); + const auto block = spmv_block_size; + + const auto b_vals = gko::acc::range( + std::array{ + {static_cast(b->get_size()[0]), + static_cast(b->get_size()[1])}}, + b->get_const_values(), + std::array{ + {static_cast(b->get_stride())}}); + auto c_vals = gko::acc::range( + std::array{ + {static_cast(c->get_size()[0]), + static_cast(c->get_size()[1])}}, + c->get_values(), + std::array{ + {static_cast(c->get_stride())}}); + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (alpha == nullptr && beta == nullptr) { + kernel::abstract_classical_spmv( + grid, block, 0, exec->get_queue(), a->get_size()[0], + a->get_const_value(), a->get_const_col_idxs(), + a->get_const_row_ptrs(), b_vals, c_vals); + } else if (alpha != nullptr && beta != nullptr) { + kernel::abstract_classical_spmv( + grid, block, 0, exec->get_queue(), a->get_size()[0], + alpha->get_const_values(), a->get_const_value(), + a->get_const_col_idxs(), a->get_const_row_ptrs(), b_vals, + beta->get_const_values(), 
c_vals); + } else { + GKO_KERNEL_NOT_FOUND; + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); + + +} // namespace host_kernel + + +template void spmv(std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c) +{ + host_kernel::select_classical_spmv( + classical_kernels(), [](int compiled_info) { return true; }, + syn::value_list(), syn::type_list<>(), exec, a, b, c); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + const matrix::Dense* alpha, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + const matrix::Dense* beta, + matrix::Dense* c) +{ + host_kernel::select_classical_spmv( + classical_kernels(), [](int compiled_info) { return true; }, + syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, beta); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp index f7e4d7643e2..68daf706fbc 100644 --- a/hip/matrix/sparsity_csr_kernels.hip.cpp +++ b/hip/matrix/sparsity_csr_kernels.hip.cpp @@ -33,9 +33,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/sparsity_csr_kernels.hpp" +#include + + #include +#include "accessor/hip_helper.hpp" +#include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" + + namespace gko { namespace kernels { namespace hip { @@ -47,25 +63,119 @@ namespace hip { namespace sparsity_csr { -template +constexpr int classical_overweight = 32; +constexpr int spmv_block_size = 256; +constexpr int warps_in_block = 4; + + +using classical_kernels = syn::value_list; + + +#include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc" + + +namespace host_kernel { + + +template +void classical_spmv(syn::value_list, + std::shared_ptr exec, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) +{ + using arithmetic_type = + highest_precision; + using input_accessor = + gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; + using output_accessor = + gko::acc::reduced_row_major<2, arithmetic_type, OutputValueType>; + + const auto nwarps = exec->get_num_warps_per_sm() * + exec->get_num_multiprocessor() * classical_overweight; + const auto gridx = + std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), + int64(nwarps / warps_in_block)); + const dim3 grid(gridx, b->get_size()[1]); + const auto block = spmv_block_size; + + const auto b_vals = gko::acc::range( + std::array{ + {static_cast(b->get_size()[0]), + 
static_cast(b->get_size()[1])}}, + b->get_const_values(), + std::array{ + {static_cast(b->get_stride())}}); + auto c_vals = gko::acc::range( + std::array{ + {static_cast(c->get_size()[0]), + static_cast(c->get_size()[1])}}, + c->get_values(), + std::array{ + {static_cast(c->get_stride())}}); + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (alpha == nullptr && beta == nullptr) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::abstract_classical_spmv), + grid, block, 0, 0, a->get_size()[0], + as_hip_type(a->get_const_value()), a->get_const_col_idxs(), + as_hip_type(a->get_const_row_ptrs()), acc::as_hip_range(b_vals), + acc::as_hip_range(c_vals)); + } else if (alpha != nullptr && beta != nullptr) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::abstract_classical_spmv), + grid, block, 0, 0, a->get_size()[0], + as_hip_type(alpha->get_const_values()), + as_hip_type(a->get_const_value()), a->get_const_col_idxs(), + as_hip_type(a->get_const_row_ptrs()), acc::as_hip_range(b_vals), + as_hip_type(beta->get_const_values()), acc::as_hip_range(c_vals)); + } else { + GKO_KERNEL_NOT_FOUND; + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); + + +} // namespace host_kernel + +template void spmv(std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c) +{ + host_kernel::select_classical_spmv( + classical_kernels(), [](int compiled_info) { return true; }, + syn::value_list(), syn::type_list<>(), exec, a, b, c); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + const matrix::Dense* alpha, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + const matrix::Dense* beta, + matrix::Dense* c) +{ + host_kernel::select_classical_spmv( + classical_kernels(), [](int compiled_info) { return true; }, + syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, beta); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); diff --git a/include/ginkgo/core/matrix/sparsity_csr.hpp b/include/ginkgo/core/matrix/sparsity_csr.hpp index 07cf2c6e689..9c1c909c25a 100644 --- a/include/ginkgo/core/matrix/sparsity_csr.hpp +++ b/include/ginkgo/core/matrix/sparsity_csr.hpp @@ -253,7 +253,9 @@ class SparsityCsr col_idxs_(exec, num_nonzeros), row_ptrs_(exec, size[0] + 1), value_(exec, {one()}) - {} + { + row_ptrs_.fill(0); + } /** * Creates a SparsityCsr matrix from already allocated (and initialized) row diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp index 3ed0bc507ce..92db204f534 100644 --- a/omp/matrix/sparsity_csr_kernels.cpp +++ b/omp/matrix/sparsity_csr_kernels.cpp @@ -46,7 +46,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
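The OpenMP and reference kernels in the following hunks share one accumulation scheme: every product is formed in `highest_precision<InputValueType, OutputValueType>` (the more precise of the two types) and rounded only once on the final store. A self-contained sketch of that scheme, using raw CSR arrays and a hypothetical `accumulate_row` helper rather than the kernels' real signatures:

```cpp
// ArithmeticType plays the role of highest_precision<InputValueType,
// OutputValueType>, e.g. double when InputValueType = float and
// OutputValueType = double.
template <typename InputValueType, typename OutputValueType,
          typename ArithmeticType>
void accumulate_row(const int* row_ptrs, const int* col_idxs, int row,
                    ArithmeticType val, const InputValueType* b,
                    OutputValueType* c)
{
    auto temp_val = ArithmeticType{};  // zero accumulator in wide precision
    for (auto k = row_ptrs[row]; k < row_ptrs[row + 1]; ++k) {
        // widen each input entry before multiplying, as the kernels below do
        temp_val += val * static_cast<ArithmeticType>(b[col_idxs[k]]);
    }
    // round once when writing back to the (possibly narrower) output type
    c[row] = static_cast<OutputValueType>(temp_val);
}
```

Accumulating in the wider type before the single narrowing store keeps the rounding error of a mixed-precision apply at one final rounding per output entry instead of one per nonzero.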
#include +#include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -61,64 +63,71 @@ namespace omp { namespace sparsity_csr { -template +template void spmv(std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, matrix::Dense* c) + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c) { + using arithmetic_type = + highest_precision; auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto val = a->get_const_value()[0]; + const auto val = static_cast(a->get_const_value()[0]); #pragma omp parallel for for (size_type row = 0; row < a->get_size()[0]; ++row) { for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) = zero(); - } - for (size_type k = row_ptrs[row]; - k < static_cast(row_ptrs[row + 1]); ++k) { - auto col = col_idxs[k]; - for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) += val * b->at(col, j); + auto temp_val = gko::zero(); + for (size_type k = row_ptrs[row]; + k < static_cast(row_ptrs[row + 1]); ++k) { + temp_val += + val * static_cast(b->at(col_idxs[k], j)); } + c->at(row, j) = static_cast(temp_val); } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) + const matrix::Dense* alpha, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + const matrix::Dense* beta, + matrix::Dense* c) { + using arithmetic_type = + highest_precision; auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto valpha = alpha->at(0, 0); - auto vbeta = beta->at(0, 0); - auto val = a->get_const_value()[0]; + const auto valpha = static_cast(alpha->at(0, 0)); + const auto vbeta = static_cast(beta->at(0, 0)); + const auto val = static_cast(a->get_const_value()[0]); #pragma omp parallel for for (size_type row = 0; row < a->get_size()[0]; ++row) { for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) *= vbeta; - } - for (size_type k = row_ptrs[row]; - k < static_cast(row_ptrs[row + 1]); ++k) { - auto col = col_idxs[k]; - for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) += valpha * val * b->at(col, j); + auto temp_val = gko::zero(); + for (size_type k = row_ptrs[row]; + k < static_cast(row_ptrs[row + 1]); ++k) { + temp_val += + val * static_cast(b->at(col_idxs[k], j)); } + c->at(row, j) = static_cast( + vbeta * static_cast(c->at(row, j)) + + valpha * temp_val); } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); diff --git a/omp/test/matrix/CMakeLists.txt b/omp/test/matrix/CMakeLists.txt index d2843b7e459..0c0442f61e0 100644 --- a/omp/test/matrix/CMakeLists.txt +++ b/omp/test/matrix/CMakeLists.txt @@ -7,4 +7,3 @@ ginkgo_create_test(fbcsr_kernels) ginkgo_create_test(fft_kernels) ginkgo_create_test(hybrid_kernels) ginkgo_create_test(sellp_kernels) -ginkgo_create_test(sparsity_csr_kernels) diff --git a/omp/test/matrix/sparsity_csr_kernels.cpp b/omp/test/matrix/sparsity_csr_kernels.cpp deleted file mode 100644 index 3075b12d483..00000000000 --- a/omp/test/matrix/sparsity_csr_kernels.cpp +++ /dev/null @@ 
-1,297 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include - - -#include -#include -#include - - -#include - - -#include -#include -#include - - -#include "core/matrix/sparsity_csr_kernels.hpp" -#include "core/test/utils.hpp" - - -namespace { - - -class SparsityCsr : public ::testing::Test { -protected: - using Mtx = gko::matrix::SparsityCsr<>; - using Vec = gko::matrix::Dense<>; - using ComplexVec = gko::matrix::Dense>; - using ComplexMtx = gko::matrix::SparsityCsr>; - - SparsityCsr() : mtx_size(532, 231), rand_engine(42) {} - - void SetUp() - { - ref = gko::ReferenceExecutor::create(); - omp = gko::OmpExecutor::create(); - } - - void TearDown() - { - if (omp != nullptr) { - ASSERT_NO_THROW(omp->synchronize()); - } - } - - template - std::unique_ptr gen_mtx(int num_rows, int num_cols, - int min_nnz_row) - { - return gko::test::generate_random_matrix( - num_rows, num_cols, - std::uniform_int_distribution<>(min_nnz_row, num_cols), - std::uniform_real_distribution<>(0.0, 1.0), rand_engine, ref); - } - - void set_up_apply_data(int num_vectors = 1) - { - mtx = gen_mtx(mtx_size[0], mtx_size[1], 1); - complex_mtx = gen_mtx(mtx_size[0], mtx_size[1], 1); - expected = gen_mtx(mtx_size[0], num_vectors, 1); - y = gen_mtx(mtx_size[1], num_vectors, 1); - alpha = gko::initialize({2.0}, ref); - beta = gko::initialize({-1.0}, ref); - dmtx = gko::clone(omp, mtx); - complex_dmtx = gko::clone(omp, complex_mtx); - dresult = gko::clone(omp, expected); - dy = gko::clone(omp, y); - dalpha = gko::clone(omp, alpha); - dbeta = gko::clone(omp, beta); - } - - struct matrix_pair { - std::unique_ptr ref; - std::unique_ptr omp; - }; - - matrix_pair gen_unsorted_mtx() - { - constexpr int min_nnz_per_row = 2; // Must be at least 2 - auto local_mtx_ref = - gen_mtx(mtx_size[0], mtx_size[1], min_nnz_per_row); - for (size_t row = 0; row < mtx_size[0]; ++row) { - const auto row_ptrs = local_mtx_ref->get_const_row_ptrs(); - const auto 
start_row = row_ptrs[row]; - auto col_idx = local_mtx_ref->get_col_idxs() + start_row; - const auto nnz_in_this_row = row_ptrs[row + 1] - row_ptrs[row]; - auto swap_idx_dist = - std::uniform_int_distribution<>(0, nnz_in_this_row - 1); - // shuffle `nnz_in_this_row / 2` times - for (size_t perm = 0; perm < nnz_in_this_row; perm += 2) { - const auto idx1 = swap_idx_dist(rand_engine); - const auto idx2 = swap_idx_dist(rand_engine); - std::swap(col_idx[idx1], col_idx[idx2]); - } - } - auto local_mtx_omp = gko::clone(omp, local_mtx_ref); - - return {std::move(local_mtx_ref), std::move(local_mtx_omp)}; - } - - std::shared_ptr ref; - std::shared_ptr omp; - - const gko::dim<2> mtx_size; - std::default_random_engine rand_engine; - - std::unique_ptr mtx; - std::unique_ptr complex_mtx; - std::unique_ptr expected; - std::unique_ptr y; - std::unique_ptr alpha; - std::unique_ptr beta; - - std::unique_ptr dmtx; - std::unique_ptr complex_dmtx; - std::unique_ptr dresult; - std::unique_ptr dy; - std::unique_ptr dalpha; - std::unique_ptr dbeta; -}; - - -TEST_F(SparsityCsr, SimpleApplyIsEquivalentToRef) -{ - set_up_apply_data(); - - mtx->apply(y.get(), expected.get()); - dmtx->apply(dy.get(), dresult.get()); - - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); -} - - -TEST_F(SparsityCsr, AdvancedApplyIsEquivalentToRef) -{ - set_up_apply_data(); - - mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); - dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); - - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); -} - - -TEST_F(SparsityCsr, SimpleApplyToDenseMatrixIsEquivalentToRef) -{ - set_up_apply_data(3); - - mtx->apply(y.get(), expected.get()); - dmtx->apply(dy.get(), dresult.get()); - - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); -} - - -TEST_F(SparsityCsr, AdvancedApplyToDenseMatrixIsEquivalentToRef) -{ - set_up_apply_data(3); - - mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); - dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); - - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); -} - - -TEST_F(SparsityCsr, TransposeIsEquivalentToRef) -{ - set_up_apply_data(); - - auto trans = mtx->transpose(); - auto d_trans = dmtx->transpose(); - - GKO_ASSERT_MTX_NEAR(static_cast(d_trans.get()), - static_cast(trans.get()), 0.0); -} - - -TEST_F(SparsityCsr, CountsNumberOfDiagElementsIsEqualToRef) -{ - set_up_apply_data(); - gko::size_type num_diags = 0; - gko::size_type d_num_diags = 0; - - gko::kernels::reference::sparsity_csr::count_num_diagonal_elements( - ref, mtx.get(), &num_diags); - gko::kernels::omp::sparsity_csr::count_num_diagonal_elements( - omp, dmtx.get(), &d_num_diags); - - ASSERT_EQ(d_num_diags, num_diags); -} - - -TEST_F(SparsityCsr, RemovesDiagElementsKernelIsEquivalentToRef) -{ - set_up_apply_data(); - gko::size_type num_diags = 0; - gko::kernels::reference::sparsity_csr::count_num_diagonal_elements( - ref, mtx.get(), &num_diags); - auto tmp = - Mtx::create(ref, mtx->get_size(), mtx->get_num_nonzeros() - num_diags); - auto d_tmp = Mtx::create(omp, dmtx->get_size(), - dmtx->get_num_nonzeros() - num_diags); - - gko::kernels::reference::sparsity_csr::remove_diagonal_elements( - ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), tmp.get()); - gko::kernels::omp::sparsity_csr::remove_diagonal_elements( - omp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), - d_tmp.get()); - - GKO_ASSERT_MTX_NEAR(tmp.get(), d_tmp.get(), 0.0); -} - - -TEST_F(SparsityCsr, RecognizeSortedMatrixIsEquivalentToRef) -{ - set_up_apply_data(); - bool is_sorted_omp{}; - bool 
is_sorted_ref{}; - - is_sorted_ref = mtx->is_sorted_by_column_index(); - is_sorted_omp = dmtx->is_sorted_by_column_index(); - - ASSERT_EQ(is_sorted_ref, is_sorted_omp); -} - - -TEST_F(SparsityCsr, RecognizeUnsortedMatrixIsEquivalentToRef) -{ - auto uns_mtx = gen_unsorted_mtx(); - bool is_sorted_omp{}; - bool is_sorted_ref{}; - - is_sorted_ref = uns_mtx.ref->is_sorted_by_column_index(); - is_sorted_omp = uns_mtx.omp->is_sorted_by_column_index(); - - ASSERT_EQ(is_sorted_ref, is_sorted_omp); -} - - -TEST_F(SparsityCsr, SortSortedMatrixIsEquivalentToRef) -{ - set_up_apply_data(); - - mtx->sort_by_column_index(); - dmtx->sort_by_column_index(); - - // Values must be unchanged, therefore, tolerance is `0` - GKO_ASSERT_MTX_NEAR(mtx, dmtx, 0); -} - - -TEST_F(SparsityCsr, SortUnsortedMatrixIsEquivalentToRef) -{ - auto uns_mtx = gen_unsorted_mtx(); - - uns_mtx.ref->sort_by_column_index(); - uns_mtx.omp->sort_by_column_index(); - - // Values must be unchanged, therefore, tolerance is `0` - GKO_ASSERT_MTX_NEAR(uns_mtx.ref, uns_mtx.omp, 0); -} - - -} // namespace diff --git a/reference/matrix/sparsity_csr_kernels.cpp b/reference/matrix/sparsity_csr_kernels.cpp index 71e63f18554..dfb7bc38bfc 100644 --- a/reference/matrix/sparsity_csr_kernels.cpp +++ b/reference/matrix/sparsity_csr_kernels.cpp @@ -43,7 +43,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -58,62 +60,70 @@ namespace reference { namespace sparsity_csr { -template +template void spmv(std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, matrix::Dense* c) + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c) { + using arithmetic_type = + highest_precision; auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto val = a->get_const_value()[0]; + const auto val = static_cast(a->get_const_value()[0]); for (size_type row = 0; row < a->get_size()[0]; ++row) { for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) = zero(); - } - for (size_type k = row_ptrs[row]; - k < static_cast(row_ptrs[row + 1]); ++k) { - auto col = col_idxs[k]; - for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) += val * b->at(col, j); + auto temp_val = gko::zero(); + for (size_type k = row_ptrs[row]; + k < static_cast(row_ptrs[row + 1]); ++k) { + temp_val += + val * static_cast(b->at(col_idxs[k], j)); } + c->at(row, j) = static_cast(temp_val); } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) + const matrix::Dense* alpha, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + const matrix::Dense* beta, + matrix::Dense* c) { + using arithmetic_type = + highest_precision; + auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto valpha = alpha->at(0, 0); - auto vbeta = beta->at(0, 0); - auto val = a->get_const_value()[0]; + const auto valpha = static_cast(alpha->at(0, 0)); + const auto vbeta = static_cast(beta->at(0, 0)); + const auto val = static_cast(a->get_const_value()[0]); for (size_type row = 0; 
row < a->get_size()[0]; ++row) { for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) *= vbeta; - } - for (size_type k = row_ptrs[row]; - k < static_cast(row_ptrs[row + 1]); ++k) { - auto col = col_idxs[k]; - for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) += valpha * val * b->at(col, j); + auto temp_val = gko::zero(); + for (size_type k = row_ptrs[row]; + k < static_cast(row_ptrs[row + 1]); ++k) { + temp_val += + val * static_cast(b->at(col_idxs[k], j)); } + c->at(row, j) = static_cast( + vbeta * static_cast(c->at(row, j)) + + valpha * temp_val); } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); diff --git a/reference/test/multigrid/amgx_pgm_kernels.cpp b/reference/test/multigrid/amgx_pgm_kernels.cpp index 9dc075d79a3..6f71d43ab56 100644 --- a/reference/test/multigrid/amgx_pgm_kernels.cpp +++ b/reference/test/multigrid/amgx_pgm_kernels.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -68,6 +69,7 @@ class AmgxPgm : public ::testing::Test { typename std::tuple_element<1, decltype(ValueIndexType())>::type; using Mtx = gko::matrix::Csr; using Vec = gko::matrix::Dense; + using SparsityCsr = gko::matrix::SparsityCsr; using MgLevel = gko::multigrid::AmgxPgm; using RowGatherer = gko::matrix::RowGatherer; using VT = value_type; @@ -511,6 +513,7 @@ TYPED_TEST(AmgxPgm, GenerateMgLevel) using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; using Mtx = typename TestFixture::Mtx; + using SparsityCsr = typename TestFixture::SparsityCsr; using RowGatherer = typename TestFixture::RowGatherer; auto prolong_op = gko::share(Mtx::create(this->exec, gko::dim<2>{5, 2}, 0)); // 0-2-4, 1-3 @@ -526,7 +529,7 @@ TYPED_TEST(AmgxPgm, GenerateMgLevel) auto expected_row_gather = gko::Array(this->exec, {0, 1, 0, 1, 0}); - GKO_ASSERT_MTX_NEAR(gko::as(coarse_fine->get_restrict_op()), + GKO_ASSERT_MTX_NEAR(gko::as(coarse_fine->get_restrict_op()), restrict_op, r::value); GKO_ASSERT_MTX_NEAR(gko::as(coarse_fine->get_coarse_op()), this->coarse, r::value); @@ -539,6 +542,7 @@ TYPED_TEST(AmgxPgm, GenerateMgLevelOnUnsortedMatrix) using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; using Mtx = typename TestFixture::Mtx; + using SparsityCsr = typename TestFixture::SparsityCsr; using MgLevel = typename TestFixture::MgLevel; using RowGatherer = typename TestFixture::RowGatherer; auto mglevel_sort = MgLevel::build() @@ -572,7 +576,7 @@ TYPED_TEST(AmgxPgm, GenerateMgLevelOnUnsortedMatrix) auto expected_row_gather = gko::Array(this->exec, {0, 1, 0, 1, 0}); - GKO_ASSERT_MTX_NEAR(gko::as(coarse_fine->get_restrict_op()), + GKO_ASSERT_MTX_NEAR(gko::as(coarse_fine->get_restrict_op()), restrict_op, r::value); GKO_ASSERT_MTX_NEAR(gko::as(coarse_fine->get_coarse_op()), this->coarse, r::value); diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 951f1f1d3d4..f64b4d9673b 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include "core/test/utils.hpp" @@ -74,6 +75,11 @@ struct SimpleMatrixTest { return matrix_type::create(exec->get_master(), size); } + static void modify_data( + gko::matrix_data& data) + {} + static void check_property(const std::unique_ptr&) {} }; @@ -348,6 +354,19 @@ struct HybridAutomaticStrategy }; +struct SparsityCsr + : SimpleMatrixTest> { + static void modify_data(gko::matrix_data& data) + { + using entry_type = + gko::matrix_data::nonzero_type; + for (auto& entry : data.nonzeros) { + entry.value = gko::one(); + } + } +}; + + template struct test_pair { std::unique_ptr ref; @@ -633,7 +652,7 @@ using MatrixTypes = ::testing::Types< SellpDefaultParameters, Sellp32Factor2, HybridDefaultStrategy, HybridColumnLimitStrategy, HybridImbalanceLimitStrategy, HybridImbalanceBoundedLimitStrategy, HybridMinStorageStrategy, - HybridAutomaticStrategy>; + HybridAutomaticStrategy, SparsityCsr>; TYPED_TEST_SUITE(Matrix, MatrixTypes, TypenameNameGenerator); @@ -794,6 +813,7 @@ TYPED_TEST(Matrix, ReadWriteRoundtrip) auto new_mtx = TestConfig::create(this->exec, data.size); gko::matrix_data out_data; + TestConfig::modify_data(data); new_mtx->read(data); new_mtx->write(out_data); diff --git a/test/multigrid/amgx_pgm_kernels.cpp b/test/multigrid/amgx_pgm_kernels.cpp index 4694f0375ee..46e13b94d10 100644 --- a/test/multigrid/amgx_pgm_kernels.cpp +++ b/test/multigrid/amgx_pgm_kernels.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -71,6 +72,7 @@ class AmgxPgm : public ::testing::Test { using index_type = gko::int32; using Mtx = gko::matrix::Dense; using Csr = gko::matrix::Csr; + using SparsityCsr = gko::matrix::SparsityCsr; using RowGatherer = gko::matrix::RowGatherer; using Diag = gko::matrix::Diagonal; @@ -315,8 +317,8 @@ TEST_F(AmgxPgm, GenerateMgLevelIsEquivalentToRef) d_row_gatherer->get_executor(), d_row_gatherer->get_size()[0], d_row_gatherer->get_const_row_idxs()); - GKO_ASSERT_MTX_NEAR(gko::as(d_mg_level->get_restrict_op()), - gko::as(mg_level->get_restrict_op()), + GKO_ASSERT_MTX_NEAR(gko::as(d_mg_level->get_restrict_op()), + gko::as(mg_level->get_restrict_op()), r::value); GKO_ASSERT_MTX_NEAR(gko::as(d_mg_level->get_coarse_op()), gko::as(mg_level->get_coarse_op()), @@ -348,8 +350,8 @@ TEST_F(AmgxPgm, GenerateMgLevelIsEquivalentToRefOnUnsortedMatrix) d_row_gatherer->get_executor(), d_row_gatherer->get_size()[0], d_row_gatherer->get_const_row_idxs()); - GKO_ASSERT_MTX_NEAR(gko::as(d_mg_level->get_restrict_op()), - gko::as(mg_level->get_restrict_op()), + GKO_ASSERT_MTX_NEAR(gko::as(d_mg_level->get_restrict_op()), + gko::as(mg_level->get_restrict_op()), r::value); GKO_ASSERT_MTX_NEAR(gko::as(d_mg_level->get_coarse_op()), gko::as(mg_level->get_coarse_op()),
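Taken together, the patch lets a SparsityCsr apply to Dense operands whose precision differs from the matrix value type. A usage sketch under the assumption that this dispatch is in place (executor choice, sizes, and data are illustrative only):

```cpp
#include <ginkgo/ginkgo.hpp>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    // pattern-only matrix: every stored location has the implicit value 1
    auto mtx = gko::matrix::SparsityCsr<double, gko::int32>::create(exec);
    gko::matrix_data<double, gko::int32> data{
        {2, 2}, {{0, 0, 1.0}, {0, 1, 1.0}, {1, 1, 1.0}}};
    mtx->read(data);
    // single-precision input and output vectors
    auto b = gko::initialize<gko::matrix::Dense<float>>({1.0f, 2.0f}, exec);
    auto x = gko::matrix::Dense<float>::create(exec, gko::dim<2>{2, 1});
    // dispatches to the float-in/float-out instantiation of the new
    // mixed-precision spmv kernels instead of throwing a type mismatch
    mtx->apply(b.get(), x.get());
}
```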