diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
new file mode 100644
index 00000000000..b374850d347
--- /dev/null
+++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
@@ -0,0 +1,75 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+namespace kernel {
+
+
+template <int mat_blk_sz, int subwarp_size, typename ValueType,
+          typename IndexType>
+__global__ __launch_bounds__(default_block_size) void transpose_blocks(
+    const IndexType nbnz, ValueType* const values)
+{
+    const auto total_subwarp_count =
+        thread::get_subwarp_num_flat<subwarp_size, IndexType>();
+    const IndexType begin_blk =
+        thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+
+    auto thread_block = group::this_thread_block();
+    auto subwarp_grp = group::tiled_partition<subwarp_size>(thread_block);
+    const int sw_threadidx = subwarp_grp.thread_rank();
+
+    constexpr int mat_blk_sz_2{mat_blk_sz * mat_blk_sz};
+    constexpr int num_entries_per_thread{(mat_blk_sz_2 - 1) / subwarp_size + 1};
+    ValueType orig_vals[num_entries_per_thread];
+
+    for (auto ibz = begin_blk; ibz < nbnz; ibz += total_subwarp_count) {
+        for (int i = sw_threadidx; i < mat_blk_sz_2; i += subwarp_size) {
+            orig_vals[i / subwarp_size] = values[ibz * mat_blk_sz_2 + i];
+        }
+        subwarp_grp.sync();
+
+        for (int i = 0; i < num_entries_per_thread; i++) {
+            const int orig_pos = i * subwarp_size + sw_threadidx;
+            if (orig_pos >= mat_blk_sz_2) {
+                break;
+            }
+            const int orig_row = orig_pos % mat_blk_sz;
+            const int orig_col = orig_pos / mat_blk_sz;
+            const int new_pos = orig_row * mat_blk_sz + orig_col;
+            values[ibz * mat_blk_sz_2 + new_pos] = orig_vals[i];
+        }
+        subwarp_grp.sync();
+    }
+}
+
+
+}  // namespace kernel
diff --git a/core/base/block_sizes.hpp b/core/base/block_sizes.hpp
new file mode 100644
index 00000000000..62ec387d8e3
--- /dev/null
+++ b/core/base/block_sizes.hpp
@@ -0,0 +1,64 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_CORE_BASE_BLOCK_SIZES_HPP_
+#define GKO_CORE_BASE_BLOCK_SIZES_HPP_
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/synthesizer/containers.hpp>
+
+
+namespace gko {
+namespace fixedblock {
+
+
+/**
+ * @def GKO_FIXED_BLOCK_CUSTOM_SIZES
+ * Optionally-defined comma-separated list of fixed block sizes to compile.
+ */
+#ifdef GKO_FIXED_BLOCK_CUSTOM_SIZES
+/**
+ * A compile-time list of block sizes for which dedicated fixed-block matrix
+ * and corresponding preconditioner kernels should be compiled.
+ */
+using compiled_kernels = syn::value_list<int, GKO_FIXED_BLOCK_CUSTOM_SIZES>;
+#else
+using compiled_kernels = syn::value_list<int, 2, 3, 4, 7>;
+#endif
+
+
+}  // namespace fixedblock
+}  // namespace gko
+
+
+#endif  // GKO_CORE_BASE_BLOCK_SIZES_HPP_
diff --git a/core/base/utils.hpp b/core/base/utils.hpp
index 911945cef78..a42312ead37 100644
--- a/core/base/utils.hpp
+++ b/core/base/utils.hpp
@@ -219,6 +219,20 @@ std::shared_ptr convert_to_with_sorting(
         skip_sorting);
 }
 
+
+/**
+ * Converts the given arguments into an array of entries of the requested
+ * template type.
+ *
+ * @tparam T The requested type of entries in the output array.
+ *
+ * @param args Entities to be filled into an array after casting to type T.
+ */
+template <typename T, typename... Args>
+constexpr std::array<T, sizeof...(Args)> to_std_array(Args&&... args)
+{
+    return {static_cast<T>(args)...};
+}
 
 
 }  // namespace gko
diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp
index 8c0aa3649cd..8e28d1f38bb 100644
--- a/core/matrix/fbcsr.cpp
+++ b/core/matrix/fbcsr.cpp
@@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include #include #include +#include #include #include #include @@ -155,14 +156,17 @@ template void Fbcsr::apply_impl(const LinOp* const b, LinOp* const x) const { - using Dense = Dense; if (auto b_fbcsr = dynamic_cast*>(b)) { // if b is a FBCSR matrix, we need an SpGeMM GKO_NOT_SUPPORTED(b_fbcsr); } else { // otherwise we assume that b is dense and compute a SpMV/SpMM - this->get_executor()->run( - fbcsr::make_spmv(this, as(b), as(x))); + precision_dispatch_real_complex( + [this](auto dense_b, auto dense_x) { + this->get_executor()->run( + fbcsr::make_spmv(this, dense_b, dense_x)); + }, + b, x); } } @@ -173,7 +177,6 @@ void Fbcsr::apply_impl(const LinOp* const alpha, const LinOp* const beta, LinOp* const x) const { - using Dense = Dense; if (auto b_fbcsr = dynamic_cast*>(b)) { // if b is a FBCSR matrix, we need an SpGeMM GKO_NOT_SUPPORTED(b_fbcsr); @@ -182,9 +185,13 @@ void Fbcsr::apply_impl(const LinOp* const alpha, GKO_NOT_SUPPORTED(b_ident); } else { // otherwise we assume that b is dense and compute a SpMV/SpMM - this->get_executor()->run( - fbcsr::make_advanced_spmv(as(alpha), this, as(b), - as(beta), as(x))); + precision_dispatch_real_complex( + [this](auto dense_alpha, auto dense_b, auto dense_beta, + auto dense_x) { + this->get_executor()->run(fbcsr::make_advanced_spmv( + dense_alpha, this, dense_b, dense_beta, dense_x)); + }, + alpha, b, beta, x); } } diff --git a/core/synthesizer/implementation_selection.hpp b/core/synthesizer/implementation_selection.hpp index 2965ceb34a2..16604c7c9da 100644 --- a/core/synthesizer/implementation_selection.hpp +++ b/core/synthesizer/implementation_selection.hpp @@ -68,7 +68,10 @@ namespace syn { _name(::gko::syn::value_list(), is_eligible, \ int_args, type_args, std::forward(args)...); \ } \ - } + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") #define GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(_name, _callable) \ template ****************************** +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CORE_TEST_UTILS_FB_MATRIX_GENERATOR_HPP_ +#define GKO_CORE_TEST_UTILS_FB_MATRIX_GENERATOR_HPP_ + + +#include +#include +#include +#include + + +#include +#include +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "core/test/utils/matrix_generator.hpp" +#include "core/test/utils/unsort_matrix.hpp" + + +namespace gko { +namespace test { + + +/** + * Generates a random matrix, ensuring the existence of diagonal entries. + * + * @tparam MatrixType type of matrix to generate (matrix::Dense must implement + * the interface `ConvertibleTo`) + * @tparam NonzeroDistribution type of nonzero distribution + * @tparam ValueDistribution type of value distribution + * @tparam Engine type of random engine + * @tparam MatrixArgs the arguments from the matrix to be forwarded. + * + * @param num_rows number of rows + * @param num_cols number of columns + * @param nonzero_dist distribution of nonzeros per row + * @param value_dist distribution of matrix values + * @param engine a random engine + * @param exec executor where the matrix should be allocated + * @param args additional arguments for the matrix constructor + * + * @return the unique pointer to generated matrix of type MatrixType + */ +template , typename NonzeroDistribution, + typename ValueDistribution, typename Engine, typename... MatrixArgs> +std::unique_ptr generate_random_matrix_with_diag( + typename MatrixType::index_type num_rows, + typename MatrixType::index_type num_cols, + NonzeroDistribution&& nonzero_dist, ValueDistribution&& value_dist, + Engine&& engine, std::shared_ptr exec, MatrixArgs&&... args) +{ + using value_type = typename MatrixType::value_type; + using index_type = typename MatrixType::index_type; + + matrix_data data{gko::dim<2>(num_rows, num_cols), + {}}; + + for (index_type row = 0; row < num_rows; ++row) { + std::vector col_idx(num_cols); + std::iota(col_idx.begin(), col_idx.end(), size_type(0)); + // randomly generate number of nonzeros in this row + auto nnz_in_row = static_cast(nonzero_dist(engine)); + nnz_in_row = std::max(1, std::min(nnz_in_row, num_cols)); + // select a subset of `nnz_in_row` column indexes, and fill these + // locations with random values + std::shuffle(col_idx.begin(), col_idx.end(), engine); + // add diagonal if it does not exist + auto it = std::find(col_idx.begin(), col_idx.begin() + nnz_in_row, row); + if (it == col_idx.begin() + nnz_in_row) { + col_idx[nnz_in_row - 1] = row; + } + std::for_each( + begin(col_idx), begin(col_idx) + nnz_in_row, [&](index_type col) { + data.nonzeros.emplace_back( + row, col, + detail::get_rand_value(value_dist, engine)); + }); + } + + data.ensure_row_major_order(); + + // convert to the correct matrix type + auto result = MatrixType::create(exec, std::forward(args)...); + result->read(data); + return result; +} + + +/** + * Generates a block CSR matrix having the same sparsity pattern as + * a given CSR matrix. + * + * @param exec Reference executor. + * @param csrmat The CSR matrix to use for the block sparsity pattern of the + * generated FBCSR matrix. + * @param block_size Block size of output Fbcsr matrix. + * @param row_diag_dominant If true, a row-diagonal-dominant Fbcsr matrix is + * generated. Note that in this case, the intput Csr + * matrix must have diagonal entries in all rows. + * @param rand_engine Random number engine to use, such as std::ranlux48. 
+ */ +template +std::unique_ptr> generate_fbcsr_from_csr( + const std::shared_ptr exec, + const matrix::Csr* const csrmat, const int block_size, + const bool row_diag_dominant, RandEngine&& rand_engine) +{ + const auto nbrows = static_cast(csrmat->get_size()[0]); + const auto nbcols = static_cast(csrmat->get_size()[1]); + const auto nbnz_temp = + static_cast(csrmat->get_num_stored_elements()); + const int bs2 = block_size * block_size; + + auto fmtx = matrix::Fbcsr::create( + exec, + dim<2>{static_cast(nbrows * block_size), + static_cast(nbcols * block_size)}, + nbnz_temp * bs2, block_size); + exec->copy(nbrows + 1, csrmat->get_const_row_ptrs(), fmtx->get_row_ptrs()); + exec->copy(nbnz_temp, csrmat->get_const_col_idxs(), fmtx->get_col_idxs()); + + // We assume diagonal blocks are present for the diagonally-dominant case + + const IndexType nbnz = fmtx->get_num_stored_elements() / bs2; + + const IndexType* const row_ptrs = fmtx->get_const_row_ptrs(); + const IndexType* const col_idxs = fmtx->get_const_col_idxs(); + const IndexType nnz = nbnz * bs2; + ValueType* const vals = fmtx->get_values(); + std::uniform_real_distribution> + off_diag_dist(-1.0, 1.0); + + for (IndexType ibrow = 0; ibrow < nbrows; ibrow++) { + if (row_diag_dominant) { + const IndexType nrownz = + (row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * block_size; + + std::uniform_real_distribution> + diag_dist(1.01 * nrownz, 2 * nrownz); + + for (IndexType ibz = row_ptrs[ibrow]; ibz < row_ptrs[ibrow + 1]; + ibz++) { + for (int i = 0; i < block_size * block_size; i++) { + vals[ibz * bs2 + i] = + gko::test::detail::get_rand_value( + off_diag_dist, rand_engine); + } + if (col_idxs[ibz] == ibrow) { + for (int i = 0; i < block_size; i++) { + vals[ibz * bs2 + i * block_size + i] = + static_cast(pow(-1, i)) * + gko::test::detail::get_rand_value( + diag_dist, rand_engine); + } + } + } + } else { + for (IndexType ibz = row_ptrs[ibrow]; ibz < row_ptrs[ibrow + 1]; + ibz++) { + for (int i = 0; i < bs2; i++) { + vals[ibz * bs2 + i] = + gko::test::detail::get_rand_value( + off_diag_dist, rand_engine); + } + } + } + } + + return fmtx; +} + + +/** + * Generates a random block CSR matrix. + * + * The block sparsity pattern of the generated matrix always has the diagonal + * entry in each block-row. + * + * @param exec Reference executor. + * @param nbrows The number of block-rows in the generated matrix. + * @param nbcols The number of block-columns in the generated matrix. + * @param mat_blk_sz Block size of the generated matrix. + * @param diag_dominant If true, a row-diagonal-dominant Fbcsr matrix is + * generated. + * @param unsort If true, the blocks of the generated matrix within each + * block-row are ordered randomly. Otherwise, blocks in each row + * are ordered by block-column index. + * @param engine Random number engine to use, such as std::ranlux48. + */ +template +std::unique_ptr> generate_random_fbcsr( + std::shared_ptr ref, const IndexType nbrows, + const IndexType nbcols, const int mat_blk_sz, const bool diag_dominant, + const bool unsort, RandEngine&& engine) +{ + using real_type = gko::remove_complex; + std::unique_ptr> rand_csr_ref = + diag_dominant + ? 
generate_random_matrix_with_diag< + matrix::Csr>( + nbrows, nbcols, + std::uniform_int_distribution(0, nbcols - 1), + std::normal_distribution(0.0, 1.0), + std::move(engine), ref) + : generate_random_matrix>( + nbrows, nbcols, + std::uniform_int_distribution(0, nbcols - 1), + std::normal_distribution(0.0, 1.0), + std::move(engine), ref); + if (unsort && rand_csr_ref->is_sorted_by_column_index()) { + unsort_matrix(rand_csr_ref.get(), engine); + } + return generate_fbcsr_from_csr(ref, rand_csr_ref.get(), mat_blk_sz, + diag_dominant, std::move(engine)); +} + + +} // namespace test +} // namespace gko + +#endif // GKO_CORE_TEST_UTILS_FB_MATRIX_GENERATOR_HPP_ diff --git a/core/test/utils/fb_matrix_generator_test.cpp b/core/test/utils/fb_matrix_generator_test.cpp new file mode 100644 index 00000000000..a37f370f8fd --- /dev/null +++ b/core/test/utils/fb_matrix_generator_test.cpp @@ -0,0 +1,175 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/test/utils/fb_matrix_generator.hpp" + + +#include +#include +#include +#include + + +#include + + +#include "accessor/block_col_major.hpp" +#include "core/base/utils.hpp" +#include "core/test/utils/matrix_generator.hpp" + + +namespace { + + +class BlockMatrixGenerator : public ::testing::Test { +protected: + using real_type = double; + using value_type = std::complex; + + BlockMatrixGenerator() + : exec(gko::ReferenceExecutor::create()), + mtx(gko::test::generate_random_matrix< + gko::matrix::Csr>( + nbrows, nbcols, std::normal_distribution(10, 5), + std::normal_distribution(20.0, 5.0), std::ranlux48(42), + exec)), + rbmtx(gko::test::generate_fbcsr_from_csr(exec, mtx.get(), blk_sz, + false, std::ranlux48(42))), + rbmtx_dd(gko::test::generate_fbcsr_from_csr(exec, mtx.get(), blk_sz, + true, std::ranlux48(42))), + cbmtx(gko::test::generate_random_fbcsr( + exec, nbrows, nbcols, blk_sz, true, false, std::ranlux48(42))) + {} + + const int nbrows = 100; + const int nbcols = nbrows; + const int blk_sz = 5; + std::shared_ptr exec; + std::unique_ptr> mtx; + std::unique_ptr> rbmtx; + std::unique_ptr> rbmtx_dd; + std::unique_ptr, int>> cbmtx; + + template + ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start, + InputIterator sample_end) + { + using std::pow; + ValueType res = 0; + ValueType num_elems = 0; + while (sample_start != sample_end) { + auto tmp = *(sample_start++); + res += pow(tmp - c, n); + num_elems += 1; + } + return res / num_elems; + } +}; + + +TEST_F(BlockMatrixGenerator, OutputHasCorrectSize) +{ + ASSERT_EQ(rbmtx->get_size(), gko::dim<2>(nbrows * blk_sz, nbcols * blk_sz)); + ASSERT_EQ(rbmtx_dd->get_size(), + gko::dim<2>(nbrows * blk_sz, nbcols * blk_sz)); + ASSERT_EQ(cbmtx->get_size(), gko::dim<2>(nbrows * blk_sz, nbcols * blk_sz)); + ASSERT_EQ(rbmtx->get_block_size(), blk_sz); + ASSERT_EQ(rbmtx_dd->get_block_size(), blk_sz); + ASSERT_EQ(cbmtx->get_block_size(), blk_sz); +} + + +TEST_F(BlockMatrixGenerator, OutputHasCorrectSparsityPattern) +{ + ASSERT_EQ(mtx->get_num_stored_elements(), + rbmtx->get_num_stored_elements() / blk_sz / blk_sz); + for (int irow = 0; irow < nbrows; irow++) { + const int start = mtx->get_const_row_ptrs()[irow]; + const int end = mtx->get_const_row_ptrs()[irow + 1]; + ASSERT_EQ(start, rbmtx->get_const_row_ptrs()[irow]); + ASSERT_EQ(end, rbmtx->get_const_row_ptrs()[irow + 1]); + for (int iz = start; iz < end; iz++) { + ASSERT_EQ(mtx->get_const_col_idxs()[iz], + rbmtx->get_const_col_idxs()[iz]); + } + } +} + + +TEST_F(BlockMatrixGenerator, ComplexOutputIsRowDiagonalDominantWhenRequested) +{ + using Dbv_t = + gko::acc::range>; + const auto nbnz = cbmtx->get_num_stored_blocks(); + const Dbv_t vals(gko::to_std_array(nbnz, blk_sz, blk_sz), + cbmtx->get_const_values()); + const int* const row_ptrs = cbmtx->get_const_row_ptrs(); + const int* const col_idxs = cbmtx->get_const_col_idxs(); + + for (int irow = 0; irow < nbrows; irow++) { + std::vector row_del_sum(blk_sz, 0.0); + std::vector diag_val(blk_sz, 0.0); + bool diagfound{false}; + for (int iz = row_ptrs[irow]; iz < row_ptrs[irow + 1]; iz++) { + if (col_idxs[iz] == irow) { + diagfound = true; + for (int i = 0; i < blk_sz; i++) { + for (int j = 0; j < blk_sz; j++) { + if (i == j) { + diag_val[i] = abs(vals(iz, i, i)); + } else { + row_del_sum[i] += abs(vals(iz, i, j)); + } + } + } + } else { + for (int i = 0; i < blk_sz; i++) { + for (int j = 0; j < blk_sz; j++) { + row_del_sum[i] += abs(vals(iz, i, j)); + } + } + 
} + } + std::vector diag_dom(blk_sz); + for (int i = 0; i < blk_sz; i++) { + diag_dom[i] = diag_val[i] - row_del_sum[i]; + } + + ASSERT_TRUE(diagfound); + for (int i = 0; i < blk_sz; i++) { + ASSERT_GT(diag_val[i], row_del_sum[i]); + } + } +} + + +} // namespace diff --git a/cuda/base/cusparse_block_bindings.hpp b/cuda/base/cusparse_block_bindings.hpp new file mode 100644 index 00000000000..32e9238e758 --- /dev/null +++ b/cuda/base/cusparse_block_bindings.hpp @@ -0,0 +1,496 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CUDA_BASE_CUSPARSE_BLOCK_BINDINGS_HPP_ +#define GKO_CUDA_BASE_CUSPARSE_BLOCK_BINDINGS_HPP_ + + +#include +#include + + +#include + + +#include "cuda/base/cusparse_bindings.hpp" +#include "cuda/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The CUSPARSE namespace. 
+ * + * @ingroup cusparse + */ +namespace cusparse { + + +/// Default storage layout within each small dense block +constexpr cusparseDirection_t blockDir = CUSPARSE_DIRECTION_COLUMN; + + +#define GKO_BIND_CUSPARSE32_BSRMV(ValueType, CusparseName) \ + inline void bsrmv(cusparseHandle_t handle, cusparseOperation_t transA, \ + int32 mb, int32 nb, int32 nnzb, const ValueType* alpha, \ + const cusparseMatDescr_t descrA, const ValueType* valA, \ + const int32* rowPtrA, const int32* colIndA, \ + int block_size, const ValueType* x, \ + const ValueType* beta, ValueType* y) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName( \ + handle, blockDir, transA, mb, nb, nnzb, as_culibs_type(alpha), \ + descrA, as_culibs_type(valA), rowPtrA, colIndA, block_size, \ + as_culibs_type(x), as_culibs_type(beta), as_culibs_type(y))); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_CUSPARSE64_BSRMV(ValueType, CusparseName) \ + inline void bsrmv(cusparseHandle_t handle, cusparseOperation_t transA, \ + int64 mb, int64 nb, int64 nnzb, const ValueType* alpha, \ + const cusparseMatDescr_t descrA, const ValueType* valA, \ + const int64* rowPtrA, const int64* colIndA, \ + int block_size, const ValueType* x, \ + const ValueType* beta, ValueType* y) \ + GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE32_BSRMV(float, cusparseSbsrmv); +GKO_BIND_CUSPARSE32_BSRMV(double, cusparseDbsrmv); +GKO_BIND_CUSPARSE32_BSRMV(std::complex, cusparseCbsrmv); +GKO_BIND_CUSPARSE32_BSRMV(std::complex, cusparseZbsrmv); +GKO_BIND_CUSPARSE64_BSRMV(float, cusparseSbsrmv); +GKO_BIND_CUSPARSE64_BSRMV(double, cusparseDbsrmv); +GKO_BIND_CUSPARSE64_BSRMV(std::complex, cusparseCbsrmv); +GKO_BIND_CUSPARSE64_BSRMV(std::complex, cusparseZbsrmv); +template +GKO_BIND_CUSPARSE32_BSRMV(ValueType, detail::not_implemented); +template +GKO_BIND_CUSPARSE64_BSRMV(ValueType, detail::not_implemented); + + +#undef GKO_BIND_CUSPARSE32_BSRMV +#undef GKO_BIND_CUSPARSE64_BSRMV + + +#define GKO_BIND_CUSPARSE32_BSRMM(ValueType, CusparseName) \ + inline void bsrmm(cusparseHandle_t handle, cusparseOperation_t transA, \ + cusparseOperation_t transB, int32 mb, int32 n, int32 kb, \ + int32 nnzb, const ValueType* alpha, \ + const cusparseMatDescr_t descrA, const ValueType* valA, \ + const int32* rowPtrA, const int32* colIndA, \ + int block_size, const ValueType* B, int32 ldb, \ + const ValueType* beta, ValueType* C, int32 ldc) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, blockDir, transA, transB, mb, n, kb, nnzb, \ + as_culibs_type(alpha), descrA, as_culibs_type(valA), \ + rowPtrA, colIndA, block_size, as_culibs_type(B), ldb, \ + as_culibs_type(beta), as_culibs_type(C), ldc)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_CUSPARSE64_BSRMM(ValueType, CusparseName) \ + inline void bsrmm( \ + cusparseHandle_t handle, cusparseOperation_t transA, \ + cusparseOperation_t transB, int64 mb, int64 n, int64 kb, int64 nnzb, \ + const ValueType* alpha, const cusparseMatDescr_t descrA, \ + const ValueType* valA, const int64* rowPtrA, const int64* colIndA, \ + int block_size, const ValueType* B, int64 ldb, const ValueType* beta, \ + ValueType* C, int64 ldc) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon 
warnings") + +GKO_BIND_CUSPARSE32_BSRMM(float, cusparseSbsrmm); +GKO_BIND_CUSPARSE32_BSRMM(double, cusparseDbsrmm); +GKO_BIND_CUSPARSE32_BSRMM(std::complex, cusparseCbsrmm); +GKO_BIND_CUSPARSE32_BSRMM(std::complex, cusparseZbsrmm); +GKO_BIND_CUSPARSE64_BSRMM(float, cusparseSbsrmm); +GKO_BIND_CUSPARSE64_BSRMM(double, cusparseDbsrmm); +GKO_BIND_CUSPARSE64_BSRMM(std::complex, cusparseCbsrmm); +GKO_BIND_CUSPARSE64_BSRMM(std::complex, cusparseZbsrmm); +template +GKO_BIND_CUSPARSE32_BSRMM(ValueType, detail::not_implemented); +template +GKO_BIND_CUSPARSE64_BSRMM(ValueType, detail::not_implemented); + + +#undef GKO_BIND_CUSPARSE32_BSRMM +#undef GKO_BIND_CUSPARSE64_BSRMM + + +template +inline int bsr_transpose_buffersize(cusparseHandle_t handle, IndexType mb, + IndexType nb, IndexType nnzb, + const ValueType* origValA, + const IndexType* origRowPtrA, + const IndexType* origColIndA, + int rowblocksize, + int colblocksize) GKO_NOT_IMPLEMENTED; + +template +inline void bsr_transpose(cusparseHandle_t handle, IndexType mb, IndexType nb, + IndexType nnzb, const ValueType* origValA, + const IndexType* origRowPtrA, + const IndexType* origColIndA, int rowblocksize, + int colblocksize, ValueType* TransValA, + IndexType* transRowIndA, IndexType* transColPtrA, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + void* pBuffer) GKO_NOT_IMPLEMENTED; + +// cuSparse does not transpose the blocks themselves, +// only the sparsity pattern +#define GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(ValueType, CusparseName) \ + template <> \ + inline int bsr_transpose_buffersize( \ + cusparseHandle_t handle, int32 mb, int32 nb, int32 nnzb, \ + const ValueType* origValA, const int32* origRowPtrA, \ + const int32* origColIndA, int rowblocksize, int colblocksize) \ + { \ + int pBufferSize = -1; \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName##_bufferSize( \ + handle, mb, nb, nnzb, as_culibs_type(origValA), origRowPtrA, \ + origColIndA, rowblocksize, colblocksize, &pBufferSize)); \ + return pBufferSize; \ + } \ + template <> \ + inline void bsr_transpose( \ + cusparseHandle_t handle, int32 mb, int32 nb, int32 nnzb, \ + const ValueType* origValA, const int32* origRowPtrA, \ + const int32* origColIndA, int rowblocksize, int colblocksize, \ + ValueType* transValA, int32* transRowIdxA, int32* transColPtrA, \ + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, \ + void* pBuffer) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, mb, nb, nnzb, as_culibs_type(origValA), \ + origRowPtrA, origColIndA, rowblocksize, colblocksize, \ + as_culibs_type(transValA), transRowIdxA, \ + transColPtrA, copyValues, idxBase, pBuffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(float, cusparseSgebsr2gebsc); +GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(double, cusparseDgebsr2gebsc); +GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(std::complex, cusparseCgebsr2gebsc); +GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(std::complex, cusparseZgebsr2gebsc); + +#undef GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32 + + +inline std::unique_ptr, + std::function> +create_bsr_trsm_info() +{ + bsrsm2Info_t info{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateBsrsm2Info(&info)); + return {info, [](bsrsm2Info_t info) { cusparseDestroyBsrsm2Info(info); }}; +} + + +inline std::unique_ptr, + std::function> +create_bilu0_info() +{ + bsrilu02Info_t info{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateBsrilu02Info(&info)); + return {info, + [](bsrilu02Info_t info) { 
cusparseDestroyBsrilu02Info(info); }}; +} + + +#define GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE(ValueType, CusparseName) \ + inline int bsrsm2_buffer_size( \ + cusparseHandle_t handle, cusparseOperation_t transA, \ + cusparseOperation_t transX, int32 mb, int32 n, int32 nnzb, \ + const cusparseMatDescr_t descr, ValueType* val, const int32* rowPtr, \ + const int32* colInd, int block_sz, bsrsm2Info_t factor_info) \ + { \ + int factor_work_size = -1; \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, blockDir, transA, transX, mb, n, nnzb, descr, \ + as_culibs_type(val), rowPtr, colInd, block_sz, \ + factor_info, &factor_work_size)); \ + return factor_work_size; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE(ValueType, CusparseName) \ + inline int64 bsrsm2_buffer_size( \ + cusparseHandle_t handle, cusparseOperation_t transA, \ + cusparseOperation_t transX, int64 mb, int64 n, int64 nnzb, \ + const cusparseMatDescr_t descr, ValueType* val, const int64* rowPtr, \ + const int64* colInd, int block_size, bsrsm2Info_t factor_info) \ + GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE(float, cusparseSbsrsm2_bufferSize); +GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE(double, cusparseDbsrsm2_bufferSize); +GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE(std::complex, + cusparseCbsrsm2_bufferSize); +GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE(std::complex, + cusparseZbsrsm2_bufferSize); +GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE(float, cusparseSbsrsm2_bufferSize); +GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE(double, cusparseDbsrsm2_bufferSize); +GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE(std::complex, + cusparseCbsrsm2_bufferSize); +GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE(std::complex, + cusparseZbsrsm2_bufferSize); +template +GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE(ValueType, detail::not_implemented); +template +GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE(ValueType, detail::not_implemented); +#undef GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE +#undef GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE + + +#define GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS(ValueType, CusparseName) \ + inline void bsrsm2_analysis( \ + cusparseHandle_t handle, cusparseOperation_t trans1, \ + cusparseOperation_t trans2, int32 mb, int32 n, int32 nnzb, \ + const cusparseMatDescr_t descr, const ValueType* val, \ + const int32* rowPtr, const int32* colInd, int block_size, \ + bsrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ + void* factor_work_vec) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, blockDir, trans1, trans2, mb, n, nnzb, descr, \ + as_culibs_type(val), rowPtr, colInd, block_size, \ + factor_info, policy, factor_work_vec)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS(ValueType, CusparseName) \ + inline void bsrsm2_analysis( \ + cusparseHandle_t handle, cusparseOperation_t trans1, \ + cusparseOperation_t trans2, size_type mb, size_type n, size_type nnzb, \ + const cusparseMatDescr_t descr, const ValueType* val, \ + const int64* rowPtr, const int64* colInd, int block_size, \ + bsrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ + void* factor_work_vec) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + 
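+
+// Illustrative sketch only (hypothetical, for exposition): the bsrsm2
+// wrappers defined here are meant to be chained in the usual three-phase
+// cuSPARSE workflow. Assuming `handle`, `descr`, the FBCSR arrays (`vals`,
+// `row_ptrs`, `col_idxs`), the sizes (`mb`, `n`, `nnzb`, `bs`), `policy`,
+// and the dense right-hand side/solution (`b`, `ldb`, `x`, `ldx`) are
+// already set up, a call sequence would look like:
+//
+//   auto info = create_bsr_trsm_info();
+//   const int bufsize =
+//       bsrsm2_buffer_size(handle, trans_a, trans_x, mb, n, nnzb, descr,
+//                          vals, row_ptrs, col_idxs, bs, info.get());
+//   // allocate `bufsize` bytes of device workspace `work`, then:
+//   bsrsm2_analysis(handle, trans_a, trans_x, mb, n, nnzb, descr, vals,
+//                   row_ptrs, col_idxs, bs, info.get(), policy, work);
+//   bsrsm2_solve(handle, trans_a, trans_x, mb, n, nnzb, &alpha, descr, vals,
+//                row_ptrs, col_idxs, bs, info.get(), b, ldb, x, ldx, policy,
+//                work);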
+GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS(float, cusparseSbsrsm2_analysis); +GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS(double, cusparseDbsrsm2_analysis); +GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS(std::complex, + cusparseCbsrsm2_analysis); +GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS(std::complex, + cusparseZbsrsm2_analysis); +GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS(float, cusparseSbsrsm2_analysis); +GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS(double, cusparseDbsrsm2_analysis); +GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS(std::complex, + cusparseCbsrsm2_analysis); +GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS(std::complex, + cusparseZbsrsm2_analysis); +template +GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS(ValueType, detail::not_implemented); +template +GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS(ValueType, detail::not_implemented); +#undef GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS +#undef GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS + + +#define GKO_BIND_CUSPARSE32_BSRSM2_SOLVE(ValueType, CusparseName) \ + inline void bsrsm2_solve( \ + cusparseHandle_t handle, cusparseOperation_t transA, \ + cusparseOperation_t transX, int32 mb, int32 n, int32 nnzb, \ + const ValueType* alpha, const cusparseMatDescr_t descrA, \ + const ValueType* valA, const int32* rowPtrA, const int32* colIndA, \ + int blockSizeA, bsrsm2Info_t factor_info, const ValueType* b, \ + int32 ldb, ValueType* x, int32 ldx, cusparseSolvePolicy_t policy, \ + void* factor_work_vec) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName( \ + handle, blockDir, transA, transX, mb, n, nnzb, \ + as_culibs_type(alpha), descrA, as_culibs_type(valA), rowPtrA, \ + colIndA, blockSizeA, factor_info, as_culibs_type(b), ldb, \ + as_culibs_type(x), ldx, policy, factor_work_vec)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_CUSPARSE64_BSRSM2_SOLVE(ValueType, CusparseName) \ + inline void bsrsm2_solve( \ + cusparseHandle_t handle, cusparseOperation_t trans1, \ + cusparseOperation_t trans2, size_type mb, size_type n, size_type nnzb, \ + const ValueType* alpha, const cusparseMatDescr_t descr, \ + const ValueType* val, const int64* rowPtr, const int64* colInd, \ + int block_size, bsrsm2Info_t factor_info, const ValueType* b, \ + int64 ldb, ValueType* x, int64 ldx, cusparseSolvePolicy_t policy, \ + void* factor_work_vec) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE32_BSRSM2_SOLVE(float, cusparseSbsrsm2_solve); +GKO_BIND_CUSPARSE32_BSRSM2_SOLVE(double, cusparseDbsrsm2_solve); +GKO_BIND_CUSPARSE32_BSRSM2_SOLVE(std::complex, cusparseCbsrsm2_solve); +GKO_BIND_CUSPARSE32_BSRSM2_SOLVE(std::complex, cusparseZbsrsm2_solve); +GKO_BIND_CUSPARSE64_BSRSM2_SOLVE(float, cusparseSbsrsm2_solve); +GKO_BIND_CUSPARSE64_BSRSM2_SOLVE(double, cusparseDbsrsm2_solve); +GKO_BIND_CUSPARSE64_BSRSM2_SOLVE(std::complex, cusparseCbsrsm2_solve); +GKO_BIND_CUSPARSE64_BSRSM2_SOLVE(std::complex, cusparseZbsrsm2_solve); +template +GKO_BIND_CUSPARSE32_BSRSM2_SOLVE(ValueType, detail::not_implemented); +template +GKO_BIND_CUSPARSE64_BSRSM2_SOLVE(ValueType, detail::not_implemented); +#undef GKO_BIND_CUSPARSE32_BSRSM2_SOLVE +#undef GKO_BIND_CUSPARSE64_BSRSM2_SOLVE + + +template +int bilu0_buffer_size(cusparseHandle_t handle, IndexType mb, IndexType nnzb, + const cusparseMatDescr_t descr, const ValueType* vals, + const IndexType* row_ptrs, const IndexType* col_idxs, + int block_sz, bsrilu02Info_t info) GKO_NOT_IMPLEMENTED; + +#define 
GKO_BIND_CUSPARSE_BILU0_BUFFER_SIZE(ValueType, CusparseName) \ + template <> \ + inline int bilu0_buffer_size( \ + cusparseHandle_t handle, int32 mb, int32 nnzb, \ + const cusparseMatDescr_t descr, const ValueType* vals, \ + const int32* row_ptrs, const int32* col_idxs, int block_size, \ + bsrilu02Info_t info) \ + { \ + int tmp_buffer_sz{}; \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName( \ + handle, blockDir, mb, nnzb, descr, \ + as_culibs_type(const_cast(vals)), row_ptrs, col_idxs, \ + block_size, info, &tmp_buffer_sz)); \ + return tmp_buffer_sz; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_BILU0_BUFFER_SIZE(float, cusparseSbsrilu02_bufferSize); +GKO_BIND_CUSPARSE_BILU0_BUFFER_SIZE(double, cusparseDbsrilu02_bufferSize); +GKO_BIND_CUSPARSE_BILU0_BUFFER_SIZE(std::complex, + cusparseCbsrilu02_bufferSize); +GKO_BIND_CUSPARSE_BILU0_BUFFER_SIZE(std::complex, + cusparseZbsrilu02_bufferSize); + +#undef GKO_BIND_CUSPARSE_BILU0_BUFFER_SIZE + + +template +inline void bilu0_analysis(cusparseHandle_t handle, IndexType mb, + IndexType nnzb, const cusparseMatDescr_t descr, + ValueType* vals, const IndexType* row_ptrs, + const IndexType* col_idxs, int block_size, + bsrilu02Info_t info, cusparseSolvePolicy_t policy, + void* buffer) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_BILU0_ANALYSIS(ValueType, CusparseName) \ + template <> \ + inline void bilu0_analysis( \ + cusparseHandle_t handle, int32 mb, int32 nnzb, \ + const cusparseMatDescr_t descr, ValueType* vals, \ + const int32* row_ptrs, const int32* col_idxs, int block_size, \ + bsrilu02Info_t info, cusparseSolvePolicy_t policy, void* buffer) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName( \ + handle, blockDir, mb, nnzb, descr, as_culibs_type(vals), row_ptrs, \ + col_idxs, block_size, info, policy, buffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_BILU0_ANALYSIS(float, cusparseSbsrilu02_analysis); +GKO_BIND_CUSPARSE_BILU0_ANALYSIS(double, cusparseDbsrilu02_analysis); +GKO_BIND_CUSPARSE_BILU0_ANALYSIS(std::complex, + cusparseCbsrilu02_analysis); +GKO_BIND_CUSPARSE_BILU0_ANALYSIS(std::complex, + cusparseZbsrilu02_analysis); + +#undef GKO_BIND_CUSPARSE_BILU0_ANALYSIS + + +template +void bilu0(cusparseHandle_t handle, IndexType mb, IndexType nnzb, + const cusparseMatDescr_t descr, ValueType* vals, + const IndexType* row_ptrs, const IndexType* col_idxs, int block_size, + bsrilu02Info_t info, cusparseSolvePolicy_t policy, + void* buffer) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_BILU0(ValueType, CusparseName) \ + template <> \ + inline void bilu0( \ + cusparseHandle_t handle, int32 mb, int32 nnzb, \ + const cusparseMatDescr_t descr, ValueType* vals, \ + const int32* row_ptrs, const int32* col_idxs, int block_size, \ + bsrilu02Info_t info, cusparseSolvePolicy_t policy, void* buffer) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName( \ + handle, blockDir, mb, nnzb, descr, as_culibs_type(vals), row_ptrs, \ + col_idxs, block_size, info, policy, buffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_BILU0(float, cusparseSbsrilu02); +GKO_BIND_CUSPARSE_BILU0(double, cusparseDbsrilu02); +GKO_BIND_CUSPARSE_BILU0(std::complex, cusparseCbsrilu02); +GKO_BIND_CUSPARSE_BILU0(std::complex, cusparseZbsrilu02); + +#undef GKO_BIND_CUSPARSE_BILU0 + + +} // namespace 
cusparse +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_BASE_CUSPARSE_BLOCK_BINDINGS_HPP_ diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index 30f7fa1ba96..d1d6285e839 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -376,7 +376,7 @@ void run_generic_kernel_row_reduction(syn::value_list, } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, - run_generic_kernel_row_reduction) + run_generic_kernel_row_reduction); template +#include #include #include #include diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index 2fb777b2cac..bb775cd7405 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -43,12 +43,41 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/block_sizes.hpp" +#include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" +#include "cuda/base/cublas_bindings.hpp" +#include "cuda/base/cusparse_bindings.hpp" +#include "cuda/base/cusparse_block_bindings.hpp" +#include "cuda/base/math.hpp" +#include "cuda/base/pointer_mode_guard.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/atomic.cuh" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/merging.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" namespace gko { namespace kernels { namespace cuda { + + +namespace csr_reuse { + + +constexpr int warps_in_block = 4; +constexpr int spmv_block_size = warps_in_block * config::warp_size; +constexpr int default_block_size{512}; + + +#include "common/cuda_hip/matrix/csr_kernels.hpp.inc" + + +} // namespace csr_reuse + + /** * @brief The fixed-size block compressed sparse row matrix format namespace. 
* @@ -57,22 +86,129 @@ namespace cuda { namespace fbcsr { +constexpr int default_block_size{512}; + + +#include "common/cuda_hip/components/uninitialized_array.hpp.inc" +#include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc" + + +namespace { + +template +void dense_transpose(std::shared_ptr exec, + const size_type nrows, const size_type ncols, + const size_type orig_stride, const ValueType* const orig, + const size_type trans_stride, ValueType* const trans) +{ + if (cublas::is_supported::value) { + auto handle = exec->get_cublas_handle(); + { + cublas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, nrows, ncols, &alpha, + orig, orig_stride, &beta, + static_cast(nullptr), nrows, trans, + trans_stride); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +} // namespace + + template void spmv(std::shared_ptr exec, - const matrix::Fbcsr* a, - const matrix::Dense* b, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + matrix::Dense* const c) +{ + if (cusparse::is_supported::value) { + auto handle = exec->get_cusparse_handle(); + cusparse::pointer_mode_guard pm_guard(handle); + const auto alpha = one(); + const auto beta = zero(); + auto descr = cusparse::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + assert(nrhs == c->get_size()[1]); + if (nrhs == 1) { + cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), &beta, c->get_values()); + } else { + auto trans_c = + Array(exec, c->get_size()[0] * c->get_size()[1]); + cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + &alpha, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), nrhs, &beta, + trans_c.get_data(), mb * bs); + dense_transpose(exec, nrhs, mb * bs, mb * bs, trans_c.get_data(), + c->get_stride(), c->get_values()); + } + cusparse::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Fbcsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; + const matrix::Dense* const alpha, + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + const matrix::Dense* const beta, + matrix::Dense* const c) +{ + if (cusparse::is_supported::value) { + auto handle = exec->get_cusparse_handle(); + const auto alphp = alpha->get_const_values(); + const auto betap = beta->get_const_values(); + auto descr = cusparse::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + assert(nrhs == c->get_size()[1]); + if (nrhs == 1) { + 
cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), betap, c->get_values()); + } else { + auto trans_c = + Array(exec, c->get_size()[0] * c->get_size()[1]); + dense_transpose(exec, mb * bs, nrhs, c->get_stride(), + c->get_values(), mb * bs, trans_c.get_data()); + cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), nrhs, betap, + trans_c.get_data(), mb * bs); + dense_transpose(exec, nrhs, mb * bs, mb * bs, trans_c.get_data(), + c->get_stride(), c->get_values()); + } + cusparse::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); @@ -103,10 +239,63 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); +namespace { + + +template +void transpose_blocks_impl(syn::value_list, + matrix::Fbcsr* const mat) +{ + constexpr int subwarp_size = config::warp_size; + const size_type nbnz = mat->get_num_stored_blocks(); + const size_type numthreads = nbnz * subwarp_size; + const size_type numblocks = ceildiv(numthreads, default_block_size); + const dim3 block_size{static_cast(default_block_size), 1, 1}; + const dim3 grid_dim{static_cast(numblocks), 1, 1}; + kernel::transpose_blocks + <<>>(nbnz, mat->get_values()); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks, + transpose_blocks_impl); + + +} // namespace + + template -void transpose(std::shared_ptr exec, - const matrix::Fbcsr* orig, - matrix::Fbcsr* trans) GKO_NOT_IMPLEMENTED; +void transpose(const std::shared_ptr exec, + const matrix::Fbcsr* const orig, + matrix::Fbcsr* const trans) +{ + if (cusparse::is_supported::value) { + const int bs = orig->get_block_size(); + const IndexType nnzb = + static_cast(orig->get_num_stored_blocks()); + cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; + cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; + const IndexType buffer_size = cusparse::bsr_transpose_buffersize( + exec->get_cusparse_handle(), orig->get_num_block_rows(), + orig->get_num_block_cols(), nnzb, orig->get_const_values(), + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs); + Array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + cusparse::bsr_transpose( + exec->get_cusparse_handle(), orig->get_num_block_rows(), + orig->get_num_block_cols(), nnzb, orig->get_const_values(), + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs, + trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(), + copyValues, idxBase, buffer); + + // transpose blocks + select_transpose_blocks( + fixedblock::compiled_kernels(), + [bs](int compiled_block_size) { return bs == compiled_block_size; }, + syn::value_list(), syn::type_list<>(), trans); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); @@ -116,7 +305,13 @@ template void conj_transpose(std::shared_ptr exec, const matrix::Fbcsr* orig, matrix::Fbcsr* trans) - GKO_NOT_IMPLEMENTED; +{ + const int grid_size = + ceildiv(trans->get_num_stored_elements(), default_block_size); + transpose(exec, orig, trans); + csr_reuse::conjugate_kernel<<>>( + trans->get_num_stored_elements(), as_cuda_type(trans->get_values())); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); @@ -125,8 +320,31 
@@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void calculate_max_nnz_per_row( std::shared_ptr exec, - const matrix::Fbcsr* source, - size_type* result) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const source, + size_type* const result) +{ + const auto num_b_rows = source->get_num_block_rows(); + const auto bs = source->get_block_size(); + + auto nnz_per_row = Array(exec, num_b_rows); + auto block_results = Array(exec, default_block_size); + auto d_result = Array(exec, 1); + + const auto grid_dim = ceildiv(num_b_rows, default_block_size); + csr_reuse::kernel::calculate_nnz_per_row<<>>( + num_b_rows, as_cuda_type(source->get_const_row_ptrs()), + nnz_per_row.get_data()); + + const auto n = ceildiv(num_b_rows, default_block_size); + const auto reduce_dim = n <= default_block_size ? n : default_block_size; + csr_reuse::kernel::reduce_max_nnz<<>>( + num_b_rows, nnz_per_row.get_const_data(), block_results.get_data()); + + csr_reuse::kernel::reduce_max_nnz<<<1, default_block_size>>>( + reduce_dim, block_results.get_const_data(), d_result.get_data()); + + *result = bs * exec->copy_val_to_host(d_result.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); @@ -145,8 +363,23 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void is_sorted_by_column_index( std::shared_ptr exec, - const matrix::Fbcsr* to_check, - bool* is_sorted) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const to_check, + bool* const is_sorted) +{ + *is_sorted = true; + auto gpu_array = Array(exec, 1); + // need to initialize the GPU value to true + exec->copy_from(exec->get_master().get(), 1, is_sorted, + gpu_array.get_data()); + auto block_size = default_block_size; + const auto num_brows = + static_cast(to_check->get_num_block_rows()); + const auto num_blocks = ceildiv(num_brows, block_size); + csr_reuse::kernel::check_unsorted<<>>( + to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), + num_brows, gpu_array.get_data()); + *is_sorted = exec->copy_val_to_host(gpu_array.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); diff --git a/cuda/test/matrix/fbcsr_kernels.cpp b/cuda/test/matrix/fbcsr_kernels.cpp index 860d6554584..8e34e93de81 100644 --- a/cuda/test/matrix/fbcsr_kernels.cpp +++ b/cuda/test/matrix/fbcsr_kernels.cpp @@ -33,28 +33,47 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include + + #include #include +#include "core/matrix/fbcsr_kernels.hpp" #include "core/test/matrix/fbcsr_sample.hpp" +#include "core/test/utils.hpp" +#include "core/test/utils/fb_matrix_generator.hpp" #include "cuda/test/utils.hpp" namespace { +template class Fbcsr : public ::testing::Test { protected: - using Mtx = gko::matrix::Fbcsr<>; + using value_type = T; + using index_type = int; + using real_type = gko::remove_complex; + using Mtx = gko::matrix::Fbcsr; + using Dense = gko::matrix::Dense; + + Fbcsr() : distb(), engine(42) {} void SetUp() { ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0); ref = gko::ReferenceExecutor::create(); cuda = gko::CudaExecutor::create(0, ref); + const index_type rand_brows = 100; + const index_type rand_bcols = 70; + const int block_size = 3; + rsorted_ref = gko::test::generate_random_fbcsr( + ref, rand_brows, rand_bcols, block_size, false, false, + std::ranlux48(43)); } void TearDown() @@ -67,18 +86,39 @@ class Fbcsr : public ::testing::Test { std::shared_ptr ref; std::shared_ptr cuda; - std::unique_ptr mtx; + std::unique_ptr rsorted_ref; + + std::normal_distribution> distb; + std::ranlux48 engine; + + value_type get_random_value() + { + return gko::test::detail::get_rand_value(distb, engine); + } + + void generate_sin(Dense* const x) + { + value_type* const xarr = x->get_values(); + for (index_type i = 0; i < x->get_size()[0] * x->get_size()[1]; i++) { + xarr[i] = + static_cast(2.0) * + std::sin(static_cast(i / 2.0) + get_random_value()); + } + } }; +TYPED_TEST_SUITE(Fbcsr, gko::test::ValueTypes, TypenameNameGenerator); -TEST_F(Fbcsr, CanWriteFromMatrixOnDevice) + +TYPED_TEST(Fbcsr, CanWriteFromMatrixOnDevice) { - using value_type = Mtx::value_type; - using index_type = Mtx::index_type; + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + using index_type = typename Mtx::index_type; using MatData = gko::matrix_data; - gko::testing::FbcsrSample sample(ref); + gko::testing::FbcsrSample sample(this->ref); auto refmat = sample.generate_fbcsr(); - auto cudamat = gko::clone(cuda, refmat); + auto cudamat = gko::clone(this->cuda, refmat); MatData refdata; MatData cudadata; @@ -89,4 +129,267 @@ TEST_F(Fbcsr, CanWriteFromMatrixOnDevice) } +TYPED_TEST(Fbcsr, TransposeIsEquivalentToRefSortedBS3) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + using index_type = typename Mtx::index_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + auto trans_ref_linop = this->rsorted_ref->transpose(); + std::unique_ptr trans_ref = + gko::as(std::move(trans_ref_linop)); + + auto trans_cuda_linop = rand_cuda->transpose(); + std::unique_ptr trans_cuda = + gko::as(std::move(trans_cuda_linop)); + + GKO_ASSERT_MTX_EQ_SPARSITY(trans_ref, trans_cuda); + GKO_ASSERT_MTX_NEAR(trans_ref, trans_cuda, 0.0); +} + + +TYPED_TEST(Fbcsr, TransposeIsEquivalentToRefSortedBS7) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + using index_type = typename Mtx::index_type; + auto rand_cuda = Mtx::create(this->cuda); + const index_type rand_brows = 50; + const index_type rand_bcols = 40; + const int block_size = 7; + auto rsorted_ref2 = + gko::test::generate_random_fbcsr( + this->ref, rand_brows, rand_bcols, block_size, false, false, + std::ranlux48(43)); + rand_cuda->copy_from(gko::lend(rsorted_ref2)); + + auto trans_ref_linop = rsorted_ref2->transpose(); + std::unique_ptr trans_ref = + 
gko::as(std::move(trans_ref_linop)); + auto trans_cuda_linop = rand_cuda->transpose(); + std::unique_ptr trans_cuda = + gko::as(std::move(trans_cuda_linop)); + + GKO_ASSERT_MTX_EQ_SPARSITY(trans_ref, trans_cuda); + GKO_ASSERT_MTX_NEAR(trans_ref, trans_cuda, 0.0); +} + + +TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) +{ + using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Dense; + using value_type = typename Mtx::value_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + auto x_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[1], 1)); + this->generate_sin(x_ref.get()); + auto x_cuda = Dense::create(this->cuda); + x_cuda->copy_from(x_ref.get()); + auto prod_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 1)); + auto prod_cuda = Dense::create(this->cuda, prod_ref->get_size()); + + rand_cuda->apply(x_cuda.get(), prod_cuda.get()); + this->rsorted_ref->apply(x_ref.get(), prod_ref.get()); + + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_cuda, 5 * tol); +} + + +TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) +{ + using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Dense; + using value_type = typename Mtx::value_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + auto x_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[1], 3)); + this->generate_sin(x_ref.get()); + auto x_cuda = Dense::create(this->cuda); + x_cuda->copy_from(x_ref.get()); + auto prod_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 3)); + auto prod_cuda = Dense::create(this->cuda, prod_ref->get_size()); + + rand_cuda->apply(x_cuda.get(), prod_cuda.get()); + this->rsorted_ref->apply(x_ref.get(), prod_ref.get()); + + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_cuda, 5 * tol); +} + + +TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted) +{ + using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Dense; + using value_type = typename TestFixture::value_type; + using real_type = typename TestFixture::real_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + auto x_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[1], 1)); + this->generate_sin(x_ref.get()); + auto x_cuda = Dense::create(this->cuda); + x_cuda->copy_from(x_ref.get()); + auto prod_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 1)); + this->generate_sin(prod_ref.get()); + auto prod_cuda = Dense::create(this->cuda); + prod_cuda->copy_from(prod_ref.get()); + auto alpha_ref = Dense::create(this->ref, gko::dim<2>(1, 1)); + alpha_ref->get_values()[0] = + static_cast(2.4) + this->get_random_value(); + auto beta_ref = Dense::create(this->ref, gko::dim<2>(1, 1)); + beta_ref->get_values()[0] = -1.2; + auto alpha = Dense::create(this->cuda); + alpha->copy_from(alpha_ref.get()); + auto beta = Dense::create(this->cuda); + beta->copy_from(beta_ref.get()); + + rand_cuda->apply(alpha.get(), x_cuda.get(), beta.get(), prod_cuda.get()); + this->rsorted_ref->apply(alpha_ref.get(), x_ref.get(), beta_ref.get(), + prod_ref.get()); + + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_cuda, 5 * tol); +} + + +TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) +{ + using Mtx = typename TestFixture::Mtx; + using Dense = typename 
TestFixture::Dense; + using value_type = typename TestFixture::value_type; + using real_type = typename TestFixture::real_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + auto x_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[1], 3)); + this->generate_sin(x_ref.get()); + auto x_cuda = Dense::create(this->cuda); + x_cuda->copy_from(x_ref.get()); + auto prod_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 3)); + this->generate_sin(prod_ref.get()); + auto prod_cuda = Dense::create(this->cuda); + prod_cuda->copy_from(prod_ref.get()); + auto alpha_ref = Dense::create(this->ref, gko::dim<2>(1, 1)); + alpha_ref->get_values()[0] = + static_cast(2.4) + this->get_random_value(); + auto beta_ref = Dense::create(this->ref, gko::dim<2>(1, 1)); + beta_ref->get_values()[0] = -1.2; + auto alpha = Dense::create(this->cuda); + alpha->copy_from(alpha_ref.get()); + auto beta = Dense::create(this->cuda); + beta->copy_from(beta_ref.get()); + + rand_cuda->apply(alpha.get(), x_cuda.get(), beta.get(), prod_cuda.get()); + this->rsorted_ref->apply(alpha_ref.get(), x_ref.get(), beta_ref.get(), + prod_ref.get()); + + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_cuda, 5 * tol); +} + + +TYPED_TEST(Fbcsr, ConjTransposeIsEquivalentToRefSortedBS3) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + using index_type = typename Mtx::index_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + auto trans_ref_linop = this->rsorted_ref->conj_transpose(); + std::unique_ptr trans_ref = + gko::as(std::move(trans_ref_linop)); + + auto trans_cuda_linop = rand_cuda->conj_transpose(); + std::unique_ptr trans_cuda = + gko::as(std::move(trans_cuda_linop)); + + GKO_ASSERT_MTX_EQ_SPARSITY(trans_ref, trans_cuda); + GKO_ASSERT_MTX_NEAR(trans_ref, trans_cuda, 0.0); +} + + +TYPED_TEST(Fbcsr, MaxNnzPerRowIsEquivalentToRefSortedBS3) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + using index_type = typename Mtx::index_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + gko::size_type ref_max_nnz{}, cuda_max_nnz{}; + + gko::kernels::cuda::fbcsr::calculate_max_nnz_per_row( + this->cuda, rand_cuda.get(), &cuda_max_nnz); + gko::kernels::reference::fbcsr::calculate_max_nnz_per_row( + this->ref, this->rsorted_ref.get(), &ref_max_nnz); + + ASSERT_EQ(ref_max_nnz, cuda_max_nnz); +} + + +TYPED_TEST(Fbcsr, RecognizeSortedMatrix) +{ + using Mtx = typename TestFixture::Mtx; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + + ASSERT_TRUE(rand_cuda->is_sorted_by_column_index()); +} + + +TYPED_TEST(Fbcsr, RecognizeUnsortedMatrix) +{ + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto mat = this->rsorted_ref->clone(); + index_type* const colinds = mat->get_col_idxs(); + std::swap(colinds[0], colinds[1]); + auto unsrt_cuda = Mtx::create(this->cuda); + unsrt_cuda->copy_from(gko::lend(mat)); + + ASSERT_FALSE(unsrt_cuda->is_sorted_by_column_index()); +} + + +TYPED_TEST(Fbcsr, InplaceAbsoluteMatrixIsEquivalentToRef) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + auto rand_ref = Mtx::create(this->ref); + rand_ref->copy_from(this->rsorted_ref.get()); + auto rand_cuda = Mtx::create(this->cuda); + 
rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + + rand_ref->compute_absolute_inplace(); + rand_cuda->compute_absolute_inplace(); + + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(rand_ref, rand_cuda, tol); +} + + +TYPED_TEST(Fbcsr, OutplaceAbsoluteMatrixIsEquivalentToRef) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + + auto abs_mtx = this->rsorted_ref->compute_absolute(); + auto dabs_mtx = rand_cuda->compute_absolute(); + + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(abs_mtx, dabs_mtx, tol); +} + + } // namespace diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 40e4268dccb..610f89673a9 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -382,7 +382,7 @@ void run_generic_kernel_row_reduction(syn::value_list, } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, - run_generic_kernel_row_reduction) + run_generic_kernel_row_reduction); template , } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_sized, - run_kernel_sized_impl) + run_kernel_sized_impl); template diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index 030da93c245..7815e55badc 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -146,7 +146,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, - run_kernel_reduction_sized_impl) + run_kernel_reduction_sized_impl); } // namespace @@ -358,7 +358,7 @@ void run_kernel_col_reduction_sized_impl( } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_col_reduction_sized, - run_kernel_col_reduction_sized_impl) + run_kernel_col_reduction_sized_impl); } // namespace diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index 3396a18dc9e..5bf5272ec73 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -47,6 +47,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "accessor/block_col_major.hpp" +#include "core/base/block_sizes.hpp" +#include "core/base/iterator_factory.hpp" +#include "core/base/utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "omp/components/format_conversion.hpp" + + namespace gko { namespace kernels { namespace omp { @@ -62,7 +70,38 @@ template void spmv(std::shared_ptr exec, const matrix::Fbcsr* const a, const matrix::Dense* const b, - matrix::Dense* const c) GKO_NOT_IMPLEMENTED; + matrix::Dense* const c) +{ + const int bs = a->get_block_size(); + const auto nvecs = static_cast(b->get_size()[1]); + const IndexType nbrows = a->get_num_block_rows(); + const size_type nbnz = a->get_num_stored_blocks(); + auto row_ptrs = a->get_const_row_ptrs(); + auto col_idxs = a->get_const_col_idxs(); + const acc::range> avalues{ + to_std_array(nbnz, bs, bs), a->get_const_values()}; + +#pragma omp parallel for + for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) { + for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs; + ++i) { + c->get_values()[i] = zero(); + } + for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1]; + ++inz) { + for (int ib = 0; ib < bs; ib++) { + const IndexType row = ibrow * bs + ib; + for (int jb = 0; jb < bs; jb++) { + const auto val = avalues(inz, ib, jb); + const auto col = col_idxs[inz] * bs + jb; + for (size_type j = 0; j < nvecs; ++j) { + c->at(row, j) += val * b->at(col, j); + } + } + } + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); @@ -73,7 +112,39 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Fbcsr* const a, const matrix::Dense* const b, const matrix::Dense* const beta, - matrix::Dense* const c) GKO_NOT_IMPLEMENTED; + matrix::Dense* const c) +{ + const int bs = a->get_block_size(); + const auto nvecs = static_cast(b->get_size()[1]); + const IndexType nbrows = a->get_num_block_rows(); + const size_type nbnz = a->get_num_stored_blocks(); + auto row_ptrs = a->get_const_row_ptrs(); + auto col_idxs = a->get_const_col_idxs(); + auto valpha = alpha->at(0, 0); + auto vbeta = beta->at(0, 0); + const acc::range> avalues{ + to_std_array(nbnz, bs, bs), a->get_const_values()}; + +#pragma omp parallel for + for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) { + for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs; + ++i) + c->get_values()[i] *= vbeta; + + for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1]; + ++inz) { + for (int ib = 0; ib < bs; ib++) { + const IndexType row = ibrow * bs + ib; + for (int jb = 0; jb < bs; jb++) { + const auto val = avalues(inz, ib, jb); + const auto col = col_idxs[inz] * bs + jb; + for (size_type j = 0; j < nvecs; ++j) + c->at(row, j) += valpha * val * b->at(col, j); + } + } + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); @@ -99,27 +170,72 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); -template -inline void convert_fbcsr_to_csc( - size_type num_rows, const IndexType* const row_ptrs, - const IndexType* const col_idxs, const ValueType* const fbcsr_vals, - IndexType* const row_idxs, IndexType* const col_ptrs, - ValueType* const csc_vals, UnaryOperator op) GKO_NOT_IMPLEMENTED; +template +void convert_fbcsr_to_fbcsc(const IndexType num_blk_rows, const int blksz, + const IndexType* const row_ptrs, + const IndexType* const col_idxs, + const ValueType* const fbcsr_vals, + IndexType* const row_idxs, + IndexType* const col_ptrs, + ValueType* const csc_vals, 
UnaryOperator op) +{ + auto sizes = + gko::to_std_array(row_ptrs[num_blk_rows], blksz, blksz); + const acc::range> rvalues( + sizes, fbcsr_vals); + acc::range> cvalues(sizes, csc_vals); + for (IndexType brow = 0; brow < num_blk_rows; ++brow) { + for (auto i = row_ptrs[brow]; i < row_ptrs[brow + 1]; ++i) { + const auto dest_idx = col_ptrs[col_idxs[i]]; + col_ptrs[col_idxs[i]]++; + row_idxs[dest_idx] = brow; + for (int ib = 0; ib < blksz; ib++) { + for (int jb = 0; jb < blksz; jb++) { + cvalues(dest_idx, ib, jb) = + op(transpose_blocks ? rvalues(i, jb, ib) + : rvalues(i, ib, jb)); + } + } + } + } +} template void transpose_and_transform( - std::shared_ptr exec, matrix::Fbcsr* const trans, - const matrix::Fbcsr* const orig, - UnaryOperator op) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const orig, UnaryOperator op) +{ + const int bs = orig->get_block_size(); + auto trans_row_ptrs = trans->get_row_ptrs(); + auto orig_row_ptrs = orig->get_const_row_ptrs(); + auto trans_col_idxs = trans->get_col_idxs(); + auto orig_col_idxs = orig->get_const_col_idxs(); + auto trans_vals = trans->get_values(); + auto orig_vals = orig->get_const_values(); + + const IndexType nbcols = orig->get_num_block_cols(); + const IndexType nbrows = orig->get_num_block_rows(); + auto orig_nbnz = orig_row_ptrs[nbrows]; + + trans_row_ptrs[0] = 0; + convert_unsorted_idxs_to_ptrs(orig_col_idxs, orig_nbnz, trans_row_ptrs + 1, + nbcols); + + convert_fbcsr_to_fbcsc( + nbrows, bs, orig_row_ptrs, orig_col_idxs, orig_vals, trans_col_idxs, + trans_row_ptrs + 1, trans_vals, op); +} template void transpose(std::shared_ptr exec, const matrix::Fbcsr* const orig, matrix::Fbcsr* const trans) - GKO_NOT_IMPLEMENTED; +{ + transpose_and_transform(trans, orig, [](const ValueType x) { return x; }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); @@ -129,7 +245,10 @@ template void conj_transpose(std::shared_ptr exec, const matrix::Fbcsr* const orig, matrix::Fbcsr* const trans) - GKO_NOT_IMPLEMENTED; +{ + transpose_and_transform(trans, orig, + [](const ValueType x) { return conj(x); }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); @@ -139,7 +258,22 @@ template void calculate_max_nnz_per_row( std::shared_ptr exec, const matrix::Fbcsr* const source, - size_type* const result) GKO_NOT_IMPLEMENTED; + size_type* const result) +{ + const auto num_rows = source->get_size()[0]; + const auto row_ptrs = source->get_const_row_ptrs(); + const int bs = source->get_block_size(); + IndexType max_nnz = 0; + +#pragma omp parallel for reduction(max : max_nnz) + for (size_type i = 0; i < num_rows; i++) { + const size_type ibrow = i / bs; + max_nnz = + std::max((row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * bs, max_nnz); + } + + *result = max_nnz; +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); @@ -149,7 +283,19 @@ template void calculate_nonzeros_per_row( std::shared_ptr exec, const matrix::Fbcsr* const source, - Array* const result) GKO_NOT_IMPLEMENTED; + Array* const result) +{ + const auto row_ptrs = source->get_const_row_ptrs(); + auto row_nnz_val = result->get_data(); + const int bs = source->get_block_size(); + assert(result->get_num_elems() == source->get_size()[0]); + +#pragma omp parallel for + for (size_type i = 0; i < result->get_num_elems(); i++) { + const size_type ibrow = i / bs; + row_nnz_val[i] = (row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * bs; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
GKO_DECLARE_FBCSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); @@ -159,16 +305,81 @@ template void is_sorted_by_column_index( std::shared_ptr exec, const matrix::Fbcsr* const to_check, - bool* const is_sorted) GKO_NOT_IMPLEMENTED; + bool* const is_sorted) +{ + const auto row_ptrs = to_check->get_const_row_ptrs(); + const auto col_idxs = to_check->get_const_col_idxs(); + const auto size = to_check->get_num_block_rows(); + bool local_is_sorted = true; +#pragma omp parallel for reduction(&& : local_is_sorted) + for (size_type i = 0; i < size; ++i) { + // Skip comparison if any thread detects that it is not sorted + if (local_is_sorted) { + for (auto idx = row_ptrs[i] + 1; idx < row_ptrs[i + 1]; ++idx) { + if (col_idxs[idx - 1] > col_idxs[idx]) { + local_is_sorted = false; + break; + } + } + } + } + *is_sorted = local_is_sorted; +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); +namespace { + +template +void sort_by_column_index_impl( + syn::value_list, + matrix::Fbcsr* const to_sort) +{ + auto row_ptrs = to_sort->get_const_row_ptrs(); + auto col_idxs = to_sort->get_col_idxs(); + auto values = to_sort->get_values(); + const auto nbrows = to_sort->get_num_block_rows(); + constexpr int bs2 = mat_blk_sz * mat_blk_sz; +#pragma omp parallel for + for (IndexType irow = 0; irow < nbrows; ++irow) { + IndexType* const brow_col_idxs = col_idxs + row_ptrs[irow]; + ValueType* const brow_vals = values + row_ptrs[irow] * bs2; + const IndexType nbnz_brow = row_ptrs[irow + 1] - row_ptrs[irow]; + + std::vector col_permute(nbnz_brow); + std::iota(col_permute.begin(), col_permute.end(), 0); + auto helper = detail::IteratorFactory( + brow_col_idxs, col_permute.data(), nbnz_brow); + std::sort(helper.begin(), helper.end()); + + std::vector oldvalues(nbnz_brow * bs2); + std::copy(brow_vals, brow_vals + nbnz_brow * bs2, oldvalues.begin()); + for (IndexType ibz = 0; ibz < nbnz_brow; ibz++) { + for (int i = 0; i < bs2; i++) { + brow_vals[ibz * bs2 + i] = + oldvalues[col_permute[ibz] * bs2 + i]; + } + } + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_sort_col_idx, + sort_by_column_index_impl); + +} // namespace + template void sort_by_column_index(const std::shared_ptr exec, matrix::Fbcsr* const to_sort) - GKO_NOT_IMPLEMENTED; +{ + const int bs = to_sort->get_block_size(); + select_sort_col_idx( + fixedblock::compiled_kernels(), + [bs](int compiled_block_size) { return bs == compiled_block_size; }, + syn::value_list(), syn::type_list<>(), to_sort); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); @@ -178,7 +389,34 @@ template void extract_diagonal(std::shared_ptr exec, const matrix::Fbcsr* const orig, matrix::Diagonal* const diag) - GKO_NOT_IMPLEMENTED; +{ + const auto row_ptrs = orig->get_const_row_ptrs(); + const auto col_idxs = orig->get_const_col_idxs(); + const auto values = orig->get_const_values(); + const int bs = orig->get_block_size(); + const IndexType nbrows = orig->get_num_block_rows(); + const IndexType nbdim_min = + std::min(orig->get_num_block_rows(), orig->get_num_block_cols()); + auto diag_values = diag->get_values(); + + assert(diag->get_size()[0] == nbdim_min * bs); + + const acc::range> vblocks( + gko::to_std_array(row_ptrs[nbrows], bs, bs), values); + +#pragma omp parallel for + for (IndexType ibrow = 0; ibrow < nbdim_min; ++ibrow) { + for (IndexType idx = row_ptrs[ibrow]; idx < row_ptrs[ibrow + 1]; + ++idx) { + if (col_idxs[idx] == ibrow) { + for (int ib = 0; ib < bs; ib++) { + diag_values[ibrow * bs + ib] = 
vblocks(idx, ib, ib); + } + break; + } + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); diff --git a/omp/test/matrix/fbcsr_kernels.cpp b/omp/test/matrix/fbcsr_kernels.cpp index 4fe014a46cb..49d61a4c26a 100644 --- a/omp/test/matrix/fbcsr_kernels.cpp +++ b/omp/test/matrix/fbcsr_kernels.cpp @@ -33,14 +33,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include +#include +#include + + #include +#include #include +#include +#include +#include +#include "core/matrix/fbcsr_kernels.hpp" #include "core/test/matrix/fbcsr_sample.hpp" #include "core/test/utils.hpp" +#include "core/test/utils/fb_matrix_generator.hpp" namespace { @@ -48,7 +60,15 @@ namespace { class Fbcsr : public ::testing::Test { protected: - using Mtx = gko::matrix::Fbcsr<>; + using real_type = double; + using index_type = int; + using Arr = gko::Array; + using Mtx = gko::matrix::Fbcsr; + using Vec = gko::matrix::Dense; + using ComplexVec = gko::matrix::Dense>; + using ComplexMtx = gko::matrix::Fbcsr>; + + Fbcsr() : rand_engine(42) {} void SetUp() { @@ -63,10 +83,85 @@ class Fbcsr : public ::testing::Test { } } - std::shared_ptr ref; + + template + std::unique_ptr gen_mtx(int num_rows, int num_cols, + int min_nnz_row) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(min_nnz_row, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void set_up_apply_data(int num_vectors = 1) + { + mtx = gko::test::generate_random_fbcsr( + ref, num_brows, num_bcols, blk_sz, false, false, rand_engine); + complex_mtx = gko::test::generate_random_fbcsr>( + ref, num_brows, num_bcols, blk_sz, false, false, rand_engine); + square_mtx = gko::test::generate_random_fbcsr( + ref, num_brows, num_brows, blk_sz, false, false, rand_engine); + dmtx = Mtx::create(omp); + dmtx->copy_from(mtx.get()); + complex_dmtx = ComplexMtx::create(omp); + complex_dmtx->copy_from(complex_mtx.get()); + square_dmtx = Mtx::create(omp); + square_dmtx->copy_from(square_mtx.get()); + expected = gen_mtx(num_brows * blk_sz, num_vectors, 1); + y = gen_mtx(num_bcols * blk_sz, num_vectors, 1); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + dresult = Vec::create(omp); + dresult->copy_from(expected.get()); + dy = Vec::create(omp); + dy->copy_from(y.get()); + dalpha = Vec::create(omp); + dalpha->copy_from(alpha.get()); + dbeta = Vec::create(omp); + dbeta->copy_from(beta.get()); + } + + struct matrix_pair { + std::unique_ptr ref; + std::unique_ptr omp; + }; + + matrix_pair gen_unsorted_mtx() + { + constexpr int min_nnz_per_row{2}; + auto local_mtx_ref = gko::test::generate_random_fbcsr( + ref, num_brows, num_bcols, blk_sz, false, true, rand_engine); + + auto local_mtx_omp = Mtx::create(omp); + local_mtx_omp->copy_from(local_mtx_ref.get()); + + return matrix_pair{std::move(local_mtx_ref), std::move(local_mtx_omp)}; + } + + std::shared_ptr ref; std::shared_ptr omp; + const index_type num_brows = 112; + const index_type num_bcols = 31; + const int blk_sz = 3; + std::ranlux48 rand_engine; + std::unique_ptr mtx; + std::unique_ptr complex_mtx; + std::unique_ptr square_mtx; + std::unique_ptr expected; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + + std::unique_ptr dmtx; + std::unique_ptr complex_dmtx; + std::unique_ptr square_dmtx; + std::unique_ptr dresult; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; }; @@ -88,4 +183,240 @@ TEST_F(Fbcsr, 
CanWriteFromMatrixOnDevice) } +TEST_F(Fbcsr, SimpleApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Fbcsr, SimpleApplyToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(3); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Fbcsr, SimpleApplyToDenseMatrixIsEquivalentToRefUnsorted) +{ + set_up_apply_data(3); + auto pair = gen_unsorted_mtx(); + + pair.ref->apply(y.get(), expected.get()); + pair.omp->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Fbcsr, AdvancedApplyToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(3); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Fbcsr, ApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(3); + auto complex_b = gen_mtx(num_bcols * blk_sz, 3, 1); + auto dcomplex_b = ComplexVec::create(omp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(num_brows * blk_sz, 3, 1); + auto dcomplex_x = ComplexVec::create(omp); + dcomplex_x->copy_from(complex_x.get()); + + mtx->apply(complex_b.get(), complex_x.get()); + dmtx->apply(dcomplex_b.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); +} + + +TEST_F(Fbcsr, AdvancedApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(3); + auto complex_b = gen_mtx(num_bcols * blk_sz, 3, 1); + auto dcomplex_b = ComplexVec::create(omp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(num_brows * blk_sz, 3, 1); + auto dcomplex_x = ComplexVec::create(omp); + dcomplex_x->copy_from(complex_x.get()); + + mtx->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); + dmtx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); +} + + +TEST_F(Fbcsr, TransposeIsEquivalentToRef) +{ + set_up_apply_data(); + + auto trans = gko::as(mtx->transpose()); + auto d_trans = gko::as(dmtx->transpose()); + + GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0); + ASSERT_TRUE(d_trans->is_sorted_by_column_index()); +} + + +TEST_F(Fbcsr, ConjugateTransposeIsEquivalentToRef) +{ + set_up_apply_data(); + + auto trans = gko::as(complex_mtx->conj_transpose()); + auto d_trans = gko::as(complex_dmtx->conj_transpose()); + + GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0); + ASSERT_TRUE(d_trans->is_sorted_by_column_index()); +} + + +TEST_F(Fbcsr, CalculatesNonzerosPerRow) +{ + set_up_apply_data(); + gko::Array row_nnz(ref, mtx->get_size()[0]); + gko::Array drow_nnz(omp, dmtx->get_size()[0]); + + gko::kernels::reference::fbcsr::calculate_nonzeros_per_row(ref, mtx.get(), + &row_nnz); + gko::kernels::omp::fbcsr::calculate_nonzeros_per_row(omp, dmtx.get(), + &drow_nnz); + + GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); +} + + +TEST_F(Fbcsr, RecognizeSortedMatrix) +{ + set_up_apply_data(); + bool is_sorted_omp{}; + + is_sorted_omp = dmtx->is_sorted_by_column_index(); + + ASSERT_TRUE(is_sorted_omp); +} + + +TEST_F(Fbcsr, RecognizeUnsortedMatrix) +{ + auto uns_mtx = gen_unsorted_mtx(); + bool is_sorted_omp{}; + + is_sorted_omp = uns_mtx.omp->is_sorted_by_column_index(); + + ASSERT_FALSE(is_sorted_omp); +} + + +TEST_F(Fbcsr, SortSortedMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + 
mtx->sort_by_column_index(); + dmtx->sort_by_column_index(); + + GKO_ASSERT_MTX_NEAR(mtx, dmtx, 0); + ASSERT_TRUE(dmtx->is_sorted_by_column_index()); +} + + +TEST_F(Fbcsr, SortUnsortedMatrixIsEquivalentToRef) +{ + auto uns_mtx = gen_unsorted_mtx(); + + uns_mtx.ref->sort_by_column_index(); + uns_mtx.omp->sort_by_column_index(); + + GKO_ASSERT_MTX_NEAR(uns_mtx.ref, uns_mtx.omp, 0); + ASSERT_TRUE(uns_mtx.omp->is_sorted_by_column_index()); +} + + +TEST_F(Fbcsr, ExtractDiagonalIsEquivalentToRef) +{ + set_up_apply_data(); + + auto diag = mtx->extract_diagonal(); + auto ddiag = dmtx->extract_diagonal(); + + GKO_ASSERT_MTX_NEAR(diag.get(), ddiag.get(), 0); +} + + +TEST_F(Fbcsr, InplaceAbsoluteMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + mtx->compute_absolute_inplace(); + dmtx->compute_absolute_inplace(); + + GKO_ASSERT_MTX_NEAR(mtx, dmtx, 1e-14); +} + + +TEST_F(Fbcsr, OutplaceAbsoluteMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + auto abs_mtx = mtx->compute_absolute(); + auto dabs_mtx = dmtx->compute_absolute(); + + GKO_ASSERT_MTX_NEAR(abs_mtx, dabs_mtx, 1e-14); +} + + +TEST_F(Fbcsr, InplaceAbsoluteComplexMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + complex_mtx->compute_absolute_inplace(); + complex_dmtx->compute_absolute_inplace(); + + GKO_ASSERT_MTX_NEAR(complex_mtx, complex_dmtx, 1e-14); +} + + +TEST_F(Fbcsr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + auto abs_mtx = complex_mtx->compute_absolute(); + auto dabs_mtx = complex_dmtx->compute_absolute(); + + GKO_ASSERT_MTX_NEAR(abs_mtx, dabs_mtx, 1e-14); +} + + +TEST_F(Fbcsr, MaxNnzPerRowIsEquivalentToRefSortedBS3) +{ + auto mtx_ref = gko::test::generate_random_fbcsr( + ref, num_brows, num_bcols, blk_sz, false, false, rand_engine); + auto rand_omp = Mtx::create(omp); + rand_omp->copy_from(gko::lend(mtx_ref)); + gko::size_type ref_max_nnz{}, omp_max_nnz{}; + + gko::kernels::omp::fbcsr::calculate_max_nnz_per_row( + this->omp, rand_omp.get(), &omp_max_nnz); + gko::kernels::reference::fbcsr::calculate_max_nnz_per_row( + this->ref, mtx_ref.get(), &ref_max_nnz); + + ASSERT_EQ(ref_max_nnz, omp_max_nnz); +} + + } // namespace diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index a7fbed91a78..feab61f3805 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -46,11 +46,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "accessor/block_col_major.hpp" -#include "accessor/range.hpp" #include "core/base/allocator.hpp" +#include "core/base/block_sizes.hpp" #include "core/base/iterator_factory.hpp" +#include "core/base/utils.hpp" #include "core/components/prefix_sum.hpp" #include "core/matrix/fbcsr_builder.hpp" +#include "core/synthesizer/implementation_selection.hpp" #include "reference/components/format_conversion.hpp" @@ -77,11 +79,8 @@ void spmv(const std::shared_ptr, const size_type nbnz = a->get_num_stored_blocks(); auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto vals = a->get_const_values(); const acc::range> avalues{ - std::array{nbnz, static_cast(bs), - static_cast(bs)}, - vals}; + to_std_array(nbnz, bs, bs), a->get_const_values()}; for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) { for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs; @@ -118,16 +117,13 @@ void advanced_spmv(const std::shared_ptr, const int bs = a->get_block_size(); const auto nvecs = static_cast(b->get_size()[1]); const IndexType nbrows = a->get_num_block_rows(); + const size_type nbnz = a->get_num_stored_blocks(); auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto vals = a->get_const_values(); auto valpha = alpha->at(0, 0); auto vbeta = beta->at(0, 0); const acc::range> avalues{ - std::array{a->get_num_stored_blocks(), - static_cast(bs), - static_cast(bs)}, - vals}; + to_std_array(nbnz, bs, bs), a->get_const_values()}; for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) { for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs; @@ -414,8 +410,12 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); +namespace { + + template -static void sort_by_column_index_impl( +void sort_by_column_index_impl( + syn::value_list, matrix::Fbcsr* const to_sort) { auto row_ptrs = to_sort->get_const_row_ptrs(); @@ -423,10 +423,10 @@ static void sort_by_column_index_impl( auto values = to_sort->get_values(); const auto nbrows = to_sort->get_num_block_rows(); constexpr int bs2 = mat_blk_sz * mat_blk_sz; - for (IndexType i = 0; i < nbrows; ++i) { - IndexType* const brow_col_idxs = col_idxs + row_ptrs[i]; - ValueType* const brow_vals = values + row_ptrs[i] * bs2; - const IndexType nbnz_brow = row_ptrs[i + 1] - row_ptrs[i]; + for (IndexType irow = 0; irow < nbrows; ++irow) { + IndexType* const brow_col_idxs = col_idxs + row_ptrs[irow]; + ValueType* const brow_vals = values + row_ptrs[irow] * bs2; + const IndexType nbnz_brow = row_ptrs[irow + 1] - row_ptrs[irow]; std::vector col_permute(nbnz_brow); std::iota(col_permute.begin(), col_permute.end(), 0); @@ -445,20 +445,22 @@ static void sort_by_column_index_impl( } } +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_sort_col_idx, + sort_by_column_index_impl); + + +} // namespace + + template void sort_by_column_index(const std::shared_ptr exec, matrix::Fbcsr* const to_sort) { const int bs = to_sort->get_block_size(); - if (bs == 2) { - sort_by_column_index_impl<2>(to_sort); - } else if (bs == 3) { - sort_by_column_index_impl<3>(to_sort); - } else if (bs == 4) { - sort_by_column_index_impl<4>(to_sort); - } else { - GKO_NOT_IMPLEMENTED; - } + select_sort_col_idx( + fixedblock::compiled_kernels(), + [bs](int compiled_block_size) { return bs == compiled_block_size; }, + syn::value_list(), syn::type_list<>(), to_sort); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp 
index 19330b476dc..8897959055a 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -54,6 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/fbcsr_kernels.hpp" #include "core/test/matrix/fbcsr_sample.hpp" #include "core/test/utils.hpp" +#include "core/test/utils/value_generator.hpp" namespace { @@ -139,18 +140,16 @@ TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypes, PairTypenameNameGenerator); template -constexpr typename std::enable_if_t::value, T> -get_some_number() -{ - return static_cast(1.2); -} - -template -constexpr typename std::enable_if_t::value, T> -get_some_number() +std::unique_ptr> get_some_vectors( + std::shared_ptr exec, const size_t nrows, + const size_t nrhs) { using RT = gko::remove_complex; - return {static_cast(1.2), static_cast(3.4)}; + std::ranlux48 engine(39); + std::normal_distribution dist(0.0, 5.0); + std::uniform_int_distribution<> nnzdist(1, nrhs); + return gko::test::generate_random_matrix>( + nrows, nrhs, nnzdist, dist, engine, exec); } @@ -161,11 +160,7 @@ TYPED_TEST(Fbcsr, AppliesToDenseVector) using index_type = typename TestFixture::index_type; const index_type nrows = this->mtx2->get_size()[0]; const index_type ncols = this->mtx2->get_size()[1]; - auto x = Vec::create(this->exec, gko::dim<2>{(gko::size_type)ncols, 1}); - T* const xvals = x->get_values(); - for (index_type i = 0; i < ncols; i++) { - xvals[i] = std::sin(static_cast(static_cast((i + 1) ^ 2))); - } + auto x = get_some_vectors(this->exec, ncols, 1); auto y = Vec::create(this->exec, gko::dim<2>{(gko::size_type)nrows, 1}); auto yref = Vec::create(this->exec, gko::dim<2>{(gko::size_type)nrows, 1}); using Csr = typename TestFixture::Csr; @@ -190,13 +185,7 @@ TYPED_TEST(Fbcsr, AppliesToDenseMatrix) const gko::size_type nrows = this->mtx2->get_size()[0]; const gko::size_type ncols = this->mtx2->get_size()[1]; const gko::size_type nvecs = 3; - auto x = Vec::create(this->exec, gko::dim<2>{ncols, nvecs}); - for (index_type i = 0; i < ncols; i++) { - for (index_type j = 0; j < nvecs; j++) { - x->at(i, j) = (static_cast(3.0 * i) + get_some_number()) / - static_cast(j + 1.0); - } - } + auto x = get_some_vectors(this->exec, ncols, nvecs); auto y = Vec::create(this->exec, gko::dim<2>{nrows, nvecs}); auto yref = Vec::create(this->exec, gko::dim<2>{nrows, nvecs}); @@ -209,6 +198,28 @@ TYPED_TEST(Fbcsr, AppliesToDenseMatrix) } +TYPED_TEST(Fbcsr, AppliesToDenseComplexMatrix) +{ + using T = typename TestFixture::value_type; + using CT = typename gko::to_complex; + using CVec = gko::matrix::Dense; + using index_type = typename TestFixture::index_type; + const gko::size_type nrows = this->mtx2->get_size()[0]; + const gko::size_type ncols = this->mtx2->get_size()[1]; + const gko::size_type nvecs = 3; + auto x = get_some_vectors(this->exec, ncols, nvecs); + auto y = CVec::create(this->exec, gko::dim<2>{nrows, nvecs}); + auto yref = CVec::create(this->exec, gko::dim<2>{nrows, nvecs}); + + this->mtx2->apply(x.get(), y.get()); + this->ref2csrmtx->apply(x.get(), yref.get()); + + const double tolerance = + std::numeric_limits>::epsilon(); + GKO_ASSERT_MTX_NEAR(y, yref, tolerance); +} + + TYPED_TEST(Fbcsr, AppliesLinearCombinationToDenseVector) { using Vec = typename TestFixture::Vec; @@ -220,15 +231,8 @@ TYPED_TEST(Fbcsr, AppliesLinearCombinationToDenseVector) auto beta = gko::initialize({betav}, this->exec); const gko::size_type nrows = this->mtx2->get_size()[0]; const gko::size_type ncols = this->mtx2->get_size()[1]; - 
auto x = Vec::create(this->exec, gko::dim<2>{ncols, 1});
-    auto y = Vec::create(this->exec, gko::dim<2>{nrows, 1});
-    for (index_type i = 0; i < ncols; i++) {
-        x->at(i, 0) = (i + 1.0) * (i + 1.0);
-    }
-    for (index_type i = 0; i < nrows; i++) {
-        y->at(i, 0) = static_cast<T>(std::sin(2 * 3.14 * (i + 0.1) / nrows)) +
-                      get_some_number<T>();
-    }
+    auto x = get_some_vectors<T>(this->exec, ncols, 1);
+    auto y = get_some_vectors<T>(this->exec, nrows, 1);
     auto yref = y->clone();

     this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
@@ -252,21 +256,8 @@ TYPED_TEST(Fbcsr, AppliesLinearCombinationToDenseMatrix)
     const gko::size_type nrows = this->mtx2->get_size()[0];
     const gko::size_type ncols = this->mtx2->get_size()[1];
     const gko::size_type nvecs = 3;
-    auto x = Vec::create(this->exec, gko::dim<2>{ncols, nvecs});
-    auto y = Vec::create(this->exec, gko::dim<2>{nrows, nvecs});
-    for (index_type i = 0; i < ncols; i++) {
-        for (index_type j = 0; j < nvecs; j++) {
-            x->at(i, j) =
-                std::log(static_cast<T>(0.1 + static_cast<T>((i + 1) ^ 2)));
-        }
-    }
-    for (index_type i = 0; i < nrows; i++) {
-        for (index_type j = 0; j < nvecs; j++) {
-            y->at(i, j) =
-                static_cast<T>(std::sin(2 * 3.14 * (i + j + 0.1) / nrows)) +
-                get_some_number<T>();
-        }
-    }
+    auto x = get_some_vectors<T>(this->exec, ncols, nvecs);
+    auto y = get_some_vectors<T>(this->exec, nrows, nvecs);
     auto yref = y->clone();

     this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());