From 1e5af84d019a47759797a33a01c4a0b9ebe592c6 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Mon, 28 Feb 2022 15:27:13 +0100
Subject: [PATCH 01/17] fix OpenMP Fbcsr SpMV and move-read

---
 core/matrix/fbcsr.cpp              |  9 +++--
 omp/matrix/fbcsr_kernels.cpp       | 56 +++++++++++++++++++++++++-----
 reference/matrix/fbcsr_kernels.cpp | 15 ++++----
 test/matrix/matrix.cpp             | 11 +++---
 4 files changed, 66 insertions(+), 25 deletions(-)

diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp
index 3325638fbde..a651fa41cec 100644
--- a/core/matrix/fbcsr.cpp
+++ b/core/matrix/fbcsr.cpp
@@ -232,9 +232,12 @@ void Fbcsr<ValueType, IndexType>::read(device_mat_data&& data)
     this->set_size(data.get_size());
     row_ptrs_.resize_and_reset(row_blocks + 1);
     auto exec = this->get_executor();
-    auto local_data = make_temporary_clone(exec, &data);
-    exec->run(fbcsr::make_fill_in_matrix_data(*local_data, bs_, row_ptrs_,
-                                              col_idxs_, values_));
+    {
+        auto local_data = make_temporary_clone(exec, &data);
+        exec->run(fbcsr::make_fill_in_matrix_data(*local_data, bs_, row_ptrs_,
+                                                  col_idxs_, values_));
+    }
+    // this needs to happen after the temporary clone copy-back
     data.empty_out();
 }
 
diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp
index ce647f79935..ea01355cffa 100644
--- a/omp/matrix/fbcsr_kernels.cpp
+++ b/omp/matrix/fbcsr_kernels.cpp
@@ -86,9 +86,10 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
 
 #pragma omp parallel for
     for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) {
-        for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs;
-             ++i) {
-            c->get_values()[i] = zero<ValueType>();
+        for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) {
+            for (IndexType rhs = 0; rhs < nvecs; rhs++) {
+                c->at(row, rhs) = zero<ValueType>();
+            }
         }
         for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1];
              ++inz) {
@@ -130,10 +131,11 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
 
 #pragma omp parallel for
     for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) {
-        for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs;
-             ++i)
-            c->get_values()[i] *= vbeta;
-
+        for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) {
+            for (IndexType rhs = 0; rhs < nvecs; rhs++) {
+                c->at(row, rhs) *= vbeta;
+            }
+        }
         for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1];
              ++inz) {
             for (int ib = 0; ib < bs; ib++) {
@@ -164,7 +166,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     components::soa_to_aos(exec, data, block_ordered);
     const auto in_nnz = data.get_num_elems();
     auto block_ordered_ptr = block_ordered.get_data();
-    std::stable_sort(
+    std::sort(
         block_ordered_ptr, block_ordered_ptr + in_nnz,
         [block_size](auto a, auto b) {
            return std::make_tuple(a.row / block_size, a.column / block_size) <
@@ -248,7 +250,43 @@ template <typename ValueType, typename IndexType>
 void convert_to_csr(const std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Fbcsr<ValueType, IndexType>* const source,
                     matrix::Csr<ValueType, IndexType>* const result)
-    GKO_NOT_IMPLEMENTED;
+{
+    const auto nbrows = source->get_num_block_rows();
+    const auto bs = source->get_block_size();
+    const auto block_row_ptrs = source->get_const_row_ptrs();
+    const auto block_col_idxs = source->get_const_col_idxs();
+    const auto row_ptrs = result->get_row_ptrs();
+    const auto col_idxs = result->get_col_idxs();
+    const auto vals = result->get_values();
+    auto sizes =
+        gko::to_std_array<acc::size_type>(block_row_ptrs[nbrows], bs, bs);
+    const auto block_vals =
+        acc::range<acc::block_col_major<const ValueType, 3>>(
+            sizes, source->get_const_values());
+#pragma omp parallel for
+    for (IndexType block_row = 0; block_row < nbrows; block_row++) {
+        const auto block_row_begin = block_row_ptrs[block_row];
+        const auto block_row_end = block_row_ptrs[block_row + 1];
+        const auto block_row_size = block_row_end - block_row_begin;
+        const auto row_size = block_row_size * bs;
+        const auto row_base = block_row_begin * bs * bs;
+        for (int local_row = 0; local_row < bs; local_row++) {
+            const auto row = block_row * bs + local_row;
+            row_ptrs[row] = row_base + row_size * local_row;
+            for (auto block = block_row_begin; block < block_row_end; block++) {
+                const auto block_base =
+                    row_ptrs[row] + bs * (block - block_row_begin);
+                for (int local_col = 0; local_col < bs; local_col++) {
+                    const auto col = block_col_idxs[block] * bs + local_col;
+                    const auto out_idx = block_base + local_col;
+                    col_idxs[out_idx] = col;
+                    vals[out_idx] = block_vals(block, local_row, local_col);
+                }
+            }
+        }
+    }
+    row_ptrs[result->get_size()[0]] = source->get_num_stored_elements();
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp
index 9b093a97b68..deca89b976a 100644
--- a/reference/matrix/fbcsr_kernels.cpp
+++ b/reference/matrix/fbcsr_kernels.cpp
@@ -85,9 +85,10 @@ void spmv(const std::shared_ptr<const ReferenceExecutor>,
         to_std_array<acc::size_type>(nbnz, bs, bs), a->get_const_values()};
 
     for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) {
-        for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs;
-             ++i) {
-            c->get_values()[i] = zero<ValueType>();
+        for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) {
+            for (IndexType rhs = 0; rhs < nvecs; rhs++) {
+                c->at(row, rhs) = zero<ValueType>();
+            }
         }
         for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1];
              ++inz) {
@@ -128,9 +129,11 @@ void advanced_spmv(const std::shared_ptr<const ReferenceExecutor>,
         to_std_array<acc::size_type>(nbnz, bs, bs), a->get_const_values()};
 
     for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) {
-        for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs;
-             ++i)
-            c->get_values()[i] *= vbeta;
+        for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) {
+            for (IndexType rhs = 0; rhs < nvecs; rhs++) {
+                c->at(row, rhs) *= vbeta;
+            }
+        }
 
         for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1];
              ++inz) {
diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp
index c87874776c3..ed0bbc32c99 100644
--- a/test/matrix/matrix.cpp
+++ b/test/matrix/matrix.cpp
@@ -646,13 +646,10 @@ using MatrixTypes = ::testing::Types<
     CsrWithAutomaticalStrategy,
 #endif
     */
-    Ell,
-    // Fbcsr is slightly broken
-    /*FbcsrBlocksize1, FbcsrBlocksize2,*/
-    SellpDefaultParameters, Sellp32Factor2, HybridDefaultStrategy,
-    HybridColumnLimitStrategy, HybridImbalanceLimitStrategy,
-    HybridImbalanceBoundedLimitStrategy, HybridMinStorageStrategy,
-    HybridAutomaticStrategy, SparsityCsr>;
+    Ell, FbcsrBlocksize1, FbcsrBlocksize2, SellpDefaultParameters,
+    Sellp32Factor2, HybridDefaultStrategy, HybridColumnLimitStrategy,
+    HybridImbalanceLimitStrategy, HybridImbalanceBoundedLimitStrategy,
+    HybridMinStorageStrategy, HybridAutomaticStrategy, SparsityCsr>;
 
 
 TYPED_TEST_SUITE(Matrix, MatrixTypes, TypenameNameGenerator);

From 3b0030f201703db2a651c4ff8d44c178b43e3482 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Tue, 1 Mar 2022 15:17:38 +0100
Subject: [PATCH 02/17] add CUDA Fbcsr -> Dense conversion

---
 common/cuda_hip/matrix/fbcsr_kernels.hpp.inc | 32 ++++++++++++++++++++
 cuda/matrix/fbcsr_kernels.cu                 | 14 ++++++++-
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
index 51feaf82fe6..8af2d8eb27d 100644
---
a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -116,6 +116,38 @@ __global__ __launch_bounds__(default_block_size) void convert_to_csr( } +template +__global__ __launch_bounds__(default_block_size) void fill_in_dense( + const IndexType* block_row_ptrs, const IndexType* block_col_idxs, + const ValueType* blocks, ValueType* values, size_type stride, + size_type num_block_rows, int block_size) +{ + const auto block_row = thread::get_subwarp_id_flat(); + if (block_row >= num_block_rows) { + return; + } + const auto block_begin = block_row_ptrs[block_row]; + const auto block_end = block_row_ptrs[block_row + 1]; + const auto num_blocks = block_end - block_begin; + const auto block_row_begin = block_begin * block_size * block_size; + const auto num_entries = num_blocks * block_size * block_size; + const auto bs_sq = block_size * block_size; + const auto warp = + group::tiled_partition(group::this_thread_block()); + const auto lane = warp.thread_rank(); + for (IndexType nz = lane; nz < num_entries; nz += config::warp_size) { + const auto local_id = nz % bs_sq; + const auto block = nz / bs_sq; + const auto local_row = local_id % block_size; + const auto local_col = local_id / block_size; + const auto col = + block_col_idxs[block + block_begin] * block_size + local_col; + const auto row = block_row * block_size + local_row; + values[row * stride + col] = blocks[nz + block_row_begin]; + } +} + + } // namespace kernel diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index c9c4782d262..b4fe2fc1325 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -230,7 +230,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void fill_in_dense(std::shared_ptr exec, const matrix::Fbcsr* source, - matrix::Dense* result) GKO_NOT_IMPLEMENTED; + matrix::Dense* result) +{ + constexpr auto warps_per_block = default_block_size / config::warp_size; + const auto num_blocks = + ceildiv(source->get_num_block_rows(), warps_per_block); + if (num_blocks > 0) { + kernel::fill_in_dense<<>>( + source->get_const_row_ptrs(), source->get_const_col_idxs(), + as_cuda_type(source->get_const_values()), + as_cuda_type(result->get_values()), result->get_stride(), + source->get_num_block_rows(), source->get_block_size()); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); From 1f1f27688a4367d0ab8bff1d6bda071bfad316f3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 1 Mar 2022 19:08:59 +0100 Subject: [PATCH 03/17] fix (Fb)Csr strategies and add Fbcsr conversions --- common/cuda_hip/matrix/csr_common.hpp.inc | 111 ++++++++++ common/cuda_hip/matrix/csr_kernels.hpp.inc | 200 +++++++++--------- common/cuda_hip/matrix/dense_kernels.hpp.inc | 157 ++++++++++++++ common/cuda_hip/matrix/fbcsr_kernels.hpp.inc | 45 ++-- cuda/matrix/csr_kernels.cu | 37 ++-- cuda/matrix/dense_kernels.cu | 41 +++- cuda/matrix/fbcsr_kernels.cu | 88 ++++---- dpcpp/matrix/csr_kernels.dp.cpp | 4 - hip/base/hipsparse_block_bindings.hip.hpp | 164 +++++++++++++++ hip/matrix/csr_kernels.hip.cpp | 37 ++-- hip/matrix/dense_kernels.hip.cpp | 33 ++- hip/matrix/fbcsr_kernels.hip.cpp | 207 +++++++++++++++++-- test/matrix/matrix.cpp | 72 ++++--- 13 files changed, 934 insertions(+), 262 deletions(-) create mode 100644 common/cuda_hip/matrix/csr_common.hpp.inc create mode 100644 hip/base/hipsparse_block_bindings.hip.hpp diff --git a/common/cuda_hip/matrix/csr_common.hpp.inc b/common/cuda_hip/matrix/csr_common.hpp.inc new file 
mode 100644 index 00000000000..d22981c1edf --- /dev/null +++ b/common/cuda_hip/matrix/csr_common.hpp.inc @@ -0,0 +1,111 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void check_unsorted( + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, IndexType num_rows, bool* flag) +{ + __shared__ bool sh_flag; + auto block = group::this_thread_block(); + if (block.thread_rank() == 0) { + sh_flag = *flag; + } + block.sync(); + + auto row = thread::get_thread_id_flat(); + if (row >= num_rows) { + return; + } + + // fail early + if (sh_flag) { + for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1] - 1; ++nz) { + if (col_idxs[nz] > col_idxs[nz + 1]) { + *flag = false; + sh_flag = false; + return; + } + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void conjugate( + size_type num_nonzeros, ValueType* __restrict__ val) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx < num_nonzeros) { + val[tidx] = conj(val[tidx]); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void check_diagonal_entries( + const IndexType num_min_rows_cols, + const IndexType* const __restrict__ row_ptrs, + const IndexType* const __restrict__ col_idxs, + bool* const __restrict__ has_all_diags) +{ + constexpr int warp_size = config::warp_size; + auto tile_grp = + group::tiled_partition(group::this_thread_block()); + const auto row = thread::get_subwarp_id_flat(); + if (row < num_min_rows_cols) { + const auto tid_in_warp = tile_grp.thread_rank(); + const IndexType row_start = row_ptrs[row]; + const IndexType num_nz = row_ptrs[row + 1] - row_start; + bool row_has_diag_local{false}; + for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) { + if (col_idxs[iz + row_start] == row) { + row_has_diag_local = true; + break; + } + } + auto row_has_diag = 
static_cast(tile_grp.any(row_has_diag_local)); + if (!row_has_diag) { + if (tile_grp.thread_rank() == 0) { + *has_all_diags = false; + } + return; + } + } +} + + +} // namespace kernel diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 075e1607ace..4ed54310797 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -601,36 +601,6 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense( } -template -__global__ __launch_bounds__(default_block_size) void check_unsorted( - const IndexType* __restrict__ row_ptrs, - const IndexType* __restrict__ col_idxs, IndexType num_rows, bool* flag) -{ - __shared__ bool sh_flag; - auto block = group::this_thread_block(); - if (block.thread_rank() == 0) { - sh_flag = *flag; - } - block.sync(); - - auto row = thread::get_thread_id_flat(); - if (row >= num_rows) { - return; - } - - // fail early - if (sh_flag) { - for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1] - 1; ++nz) { - if (col_idxs[nz] > col_idxs[nz + 1]) { - *flag = false; - sh_flag = false; - return; - } - } - } -} - - template __global__ __launch_bounds__(default_block_size) void extract_diagonal( size_type diag_size, size_type nnz, @@ -657,29 +627,8 @@ __global__ __launch_bounds__(default_block_size) void extract_diagonal( } -} // namespace kernel - - -namespace { - - -template -__global__ __launch_bounds__(default_block_size) void conjugate_kernel( - size_type num_nonzeros, ValueType* __restrict__ val) -{ - const auto tidx = thread::get_thread_id_flat(); - - if (tidx < num_nonzeros) { - val[tidx] = conj(val[tidx]); - } -} - - -} // namespace - - template -__global__ __launch_bounds__(default_block_size) void row_ptr_permute_kernel( +__global__ __launch_bounds__(default_block_size) void row_ptr_permute( size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, IndexType* __restrict__ out_nnz) { @@ -694,11 +643,9 @@ __global__ __launch_bounds__(default_block_size) void row_ptr_permute_kernel( template -__global__ - __launch_bounds__(default_block_size) void inv_row_ptr_permute_kernel( - size_type num_rows, const IndexType* __restrict__ permutation, - const IndexType* __restrict__ in_row_ptrs, - IndexType* __restrict__ out_nnz) +__global__ __launch_bounds__(default_block_size) void inv_row_ptr_permute( + size_type num_rows, const IndexType* __restrict__ permutation, + const IndexType* __restrict__ in_row_ptrs, IndexType* __restrict__ out_nnz) { auto tid = thread::get_thread_id_flat(); if (tid >= num_rows) { @@ -711,7 +658,7 @@ __global__ template -__global__ __launch_bounds__(default_block_size) void row_permute_kernel( +__global__ __launch_bounds__(default_block_size) void row_permute( size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, const IndexType* __restrict__ in_cols, @@ -737,7 +684,7 @@ __global__ __launch_bounds__(default_block_size) void row_permute_kernel( template -__global__ __launch_bounds__(default_block_size) void inv_row_permute_kernel( +__global__ __launch_bounds__(default_block_size) void inv_row_permute( size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, const IndexType* __restrict__ in_cols, @@ -763,7 +710,7 @@ __global__ __launch_bounds__(default_block_size) void inv_row_permute_kernel( template -__global__ __launch_bounds__(default_block_size) void inv_symm_permute_kernel( +__global__ 
__launch_bounds__(default_block_size) void inv_symm_permute( size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, const IndexType* __restrict__ in_cols, @@ -788,9 +735,6 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute_kernel( } -namespace kernel { - - template __global__ __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals( @@ -869,37 +813,105 @@ __global__ __launch_bounds__(default_block_size) void add_scaled_identity( } -template -__global__ __launch_bounds__(default_block_size) void check_diagonal_entries( - const IndexType num_min_rows_cols, - const IndexType* const __restrict__ row_ptrs, - const IndexType* const __restrict__ col_idxs, - bool* const __restrict__ has_all_diags) -{ - constexpr int warp_size = config::warp_size; - auto tile_grp = - group::tiled_partition(group::this_thread_block()); - const auto row = thread::get_subwarp_id_flat(); - if (row < num_min_rows_cols) { - const auto tid_in_warp = tile_grp.thread_rank(); - const IndexType row_start = row_ptrs[row]; - const IndexType num_nz = row_ptrs[row + 1] - row_start; - bool row_has_diag_local{false}; - for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) { - if (col_idxs[iz + row_start] == row) { - row_has_diag_local = true; - break; - } - } - auto row_has_diag = static_cast(tile_grp.any(row_has_diag_local)); - if (!row_has_diag) { - if (tile_grp.thread_rank() == 0) { - *has_all_diags = false; - } - return; - } +} // namespace kernel + + +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Csr* source, int bs, + Array& block_row_ptr_array, + Array& block_col_idx_array, + Array& block_value_array) +{ + using tuple_type = thrust::tuple; + const auto nnz = source->get_num_stored_elements(); + Array in_row_idxs{exec, nnz}; + Array in_col_idxs{exec, nnz}; + Array in_values{exec, nnz}; + exec->copy(nnz, source->get_const_col_idxs(), in_col_idxs.get_data()); + exec->copy(nnz, source->get_const_values(), in_values.get_data()); + components::convert_ptrs_to_idxs(exec, source->get_const_row_ptrs(), + source->get_size()[0], + in_row_idxs.get_data()); + auto block_row_ptrs = block_row_ptr_array.get_data(); + auto num_block_rows = block_row_ptr_array.get_num_elems() - 1; + if (nnz == 0) { + components::fill_array(exec, block_row_ptrs, num_block_rows + 1, + IndexType{}); + block_col_idx_array.resize_and_reset(0); + block_value_array.resize_and_reset(0); + return; } + auto in_rows = in_row_idxs.get_data(); + auto in_cols = in_col_idxs.get_data(); + // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken + // due to overly generic assignment operator and constructor leading to + // ambiguities. So we need to use our own fake_complex type + auto in_vals = + reinterpret_cast*>(in_values.get_data()); + auto in_loc_it = thrust::make_zip_iterator( + thrust::make_tuple(thrust::device_pointer_cast(in_rows), + thrust::device_pointer_cast(in_cols))); + thrust::sort_by_key(thrust::device, in_loc_it, in_loc_it + nnz, + thrust::device_pointer_cast(in_vals), + [bs] __device__(tuple_type a, tuple_type b) { + return thrust::make_pair(thrust::get<0>(a) / bs, + thrust::get<1>(a) / bs) < + thrust::make_pair(thrust::get<0>(b) / bs, + thrust::get<1>(b) / bs); + }); + // build block pattern + auto adj_predicate = [bs, in_rows, in_cols, nnz] __device__(size_type i) { + const auto a_block_row = i > 0 ? in_rows[i - 1] / bs : -1; + const auto a_block_col = i > 0 ? 
in_cols[i - 1] / bs : -1; + const auto b_block_row = in_rows[i] / bs; + const auto b_block_col = in_cols[i] / bs; + return (a_block_row != b_block_row) || (a_block_col != b_block_col); + }; + auto iota = thrust::make_counting_iterator(size_type{}); + // count how many blocks we have by counting how often the block changes + auto num_blocks = static_cast( + thrust::count_if(thrust::device, iota, iota + nnz, adj_predicate)); + // allocate storage + Array block_row_idx_array{exec, num_blocks}; + Array block_ptr_array{exec, num_blocks}; + block_col_idx_array.resize_and_reset(num_blocks); + block_value_array.resize_and_reset(num_blocks * bs * bs); + auto row_idxs = block_row_idx_array.get_data(); + auto col_idxs = block_col_idx_array.get_data(); + auto values = as_device_type(block_value_array.get_data()); + auto block_ptrs = block_ptr_array.get_data(); + auto block_ptr_it = thrust::device_pointer_cast(block_ptrs); + // write (block_row, block_col, block_start_idx) tuples for each block + thrust::copy_if(thrust::device, iota, iota + nnz, block_ptr_it, + adj_predicate); + auto block_output_it = thrust::make_zip_iterator( + thrust::make_tuple(thrust::device_pointer_cast(row_idxs), + thrust::device_pointer_cast(col_idxs))); + thrust::transform( + thrust::device, block_ptr_it, block_ptr_it + num_blocks, + block_output_it, [bs, in_rows, in_cols] __device__(size_type i) { + return thrust::make_tuple(in_rows[i] / bs, in_cols[i] / bs); + }); + // build row pointers from row indices + components::convert_idxs_to_ptrs(exec, block_row_idx_array.get_const_data(), + block_row_idx_array.get_num_elems(), + num_block_rows, block_row_ptrs); + // fill in values + components::fill_array(exec, block_value_array.get_data(), + num_blocks * bs * bs, zero()); + thrust::for_each_n( + thrust::device, iota, num_blocks, + [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals, + values] __device__(size_type i) { + const auto block_begin = block_ptrs[i]; + const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz; + for (auto nz = block_begin; nz < block_end; nz++) { + values[i * bs * bs + (in_cols[nz] % bs) * bs + + (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]); + } + }); } - -} // namespace kernel +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index c4cdc214e76..9f0a5c0e4b7 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -33,6 +33,163 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace kernel { +template +__global__ + __launch_bounds__(default_block_size) void count_nonzero_blocks_per_row( + size_type num_block_rows, size_type num_block_cols, size_type stride, + int block_size, const ValueType* __restrict__ source, + IndexType* __restrict__ block_row_nnz) +{ + const auto brow = thread::get_subwarp_id_flat(); + + if (brow >= num_block_rows) { + return; + } + + const auto bs_sq = block_size * block_size; + const auto num_cols = num_block_cols * block_size; + auto warp = + group::tiled_partition(group::this_thread_block()); + const auto lane = static_cast(warp.thread_rank()); + const auto block_mask = + block_size >= config::warp_size + ? 
~config::lane_mask_type{} + : (config::lane_mask_type{1} << block_size) - 1u; + bool previous_block_nonzero = false; + IndexType block_count{}; + for (IndexType base_col = 0; base_col < num_cols; + base_col += config::warp_size) { + const auto col = base_col + lane; + const auto block_local_col = col % block_size; + // which is the first column in the current block? + const auto block_base_col = col - block_local_col; + // which lowest thread in the warp does this column belong to? + const auto block_base_thread = block_base_col - base_col; + // collect nonzero bitmask + config::lane_mask_type nonzero_mask{previous_block_nonzero ? 1u : 0u}; + for (int local_row = 0; local_row < block_size; local_row++) { + const auto row = local_row + brow * block_size; + nonzero_mask |= warp.ballot(col < num_cols && + is_nonzero(source[row * stride + col])); + } + const auto local_mask = + (block_base_thread < 0 ? nonzero_mask << -block_base_thread + : nonzero_mask >> block_base_thread) & + block_mask; + if (block_local_col == block_size - 1) { + block_count += local_mask ? 1 : 0; + } + // if we need to store something for the next iteration + if ((base_col + config::warp_size) % block_size != 0) { + // what's the first column of the last block in the warp? + const auto last_block_base_col = + (base_col + static_cast(config::warp_size) - 1) / + block_size * block_size; + // which thread does it belong to? + const auto last_block_base_thread = last_block_base_col - base_col; + previous_block_nonzero = + bool((last_block_base_thread < 0 + ? nonzero_mask << -last_block_base_thread + : nonzero_mask >> last_block_base_thread) & + block_mask); + } else { + previous_block_nonzero = false; + } + } + block_count = reduce(warp, block_count, + [](IndexType a, IndexType b) { return a + b; }); + if (lane == 0) { + block_row_nnz[brow] = block_count; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( + size_type num_block_rows, size_type num_block_cols, size_type stride, + int block_size, const ValueType* __restrict__ source, + const IndexType* __restrict__ block_row_ptrs, + IndexType* __restrict__ block_cols, ValueType* __restrict__ blocks) +{ + const auto brow = thread::get_subwarp_id_flat(); + + if (brow >= num_block_rows) { + return; + } + + const auto bs_sq = block_size * block_size; + const auto num_cols = num_block_cols * block_size; + auto warp = + group::tiled_partition(group::this_thread_block()); + const auto lane = static_cast(warp.thread_rank()); + auto lane_prefix_mask = + (config::lane_mask_type(1) << warp.thread_rank()) - 1; + const auto block_mask = + block_size >= config::warp_size + ? ~config::lane_mask_type{} + : (config::lane_mask_type{1} << block_size) - 1u; + bool previous_block_nonzero = false; + auto block_base_nz = block_row_ptrs[brow]; + for (IndexType base_col = 0; base_col < num_cols; + base_col += config::warp_size) { + const auto col = base_col + lane; + const auto block_local_col = col % block_size; + // which is the first column in the current block? + const auto block_base_col = col - block_local_col; + // which lowest thread in the warp does this column belong to? + const auto block_base_thread = block_base_col - base_col; + // collect nonzero bitmask + config::lane_mask_type nonzero_mask{previous_block_nonzero ? 
1u : 0u}; + for (int local_row = 0; local_row < block_size; local_row++) { + const auto row = local_row + brow * block_size; + nonzero_mask |= warp.ballot(col < num_cols && + is_nonzero(source[row * stride + col])); + } + const auto local_mask = + (block_base_thread < 0 ? nonzero_mask << -block_base_thread + : nonzero_mask >> block_base_thread) & + block_mask; + const auto block_nonzero_mask = + warp.ballot(local_mask && (block_local_col == block_size - 1)); + + // count how many blocks come before this block in the current block + const auto block_nz = + block_base_nz + popcnt(block_nonzero_mask & lane_prefix_mask); + // now in a second sweep, store the actual elements + if (local_mask) { + if (block_local_col == block_size - 1) { + block_cols[block_nz] = col / block_size; + } + if (col < num_cols) { + for (int local_row = 0; local_row < block_size; local_row++) { + const auto row = local_row + brow * block_size; + blocks[local_row + block_local_col * block_size + + block_nz * bs_sq] = source[row * stride + col]; + } + } + } + // if we need to store something for the next iteration + if ((base_col + config::warp_size) % block_size != 0) { + // what's the first column of the last block in the warp? + const auto last_block_base_col = + (base_col + static_cast(config::warp_size) - 1) / + block_size * block_size; + // which thread does it belong to? + const auto last_block_base_thread = last_block_base_col - base_col; + previous_block_nonzero = + bool((last_block_base_thread < 0 + ? nonzero_mask << -last_block_base_thread + : nonzero_mask >> last_block_base_thread) & + block_mask); + } else { + previous_block_nonzero = false; + } + // advance by the completed blocks + block_base_nz += popcnt(block_nonzero_mask); + } +} + + template __global__ __launch_bounds__(default_block_size) void fill_in_coo( size_type num_rows, size_type num_cols, size_type stride, diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index 8af2d8eb27d..66067d3ffa6 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -93,9 +93,9 @@ __global__ __launch_bounds__(default_block_size) void convert_to_csr( if (block_row == 0 && lane == 0) { row_ptrs[0] = 0; } - if (lane < block_size) { - row_ptrs[first_row + lane + 1] = - block_row_begin + num_blocks * block_size * (lane + 1); + for (auto i = lane; i < block_size; i += config::warp_size) { + row_ptrs[first_row + i + 1] = + block_row_begin + num_blocks * block_size * (i + 1); } for (IndexType i = lane; i < num_blocks * block_size * block_size; i += config::warp_size) { @@ -154,19 +154,20 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense( template void fill_in_matrix_data(std::shared_ptr exec, device_matrix_data& data, - int block_size, Array& row_ptr_array, - Array& col_idx_array, - Array& value_array) + int block_size, Array& block_row_ptr_array, + Array& block_col_idx_array, + Array& block_value_array) { using tuple_type = thrust::tuple; const auto nnz = data.get_num_elems(); const auto bs = block_size; - auto row_ptrs = row_ptr_array.get_data(); - auto num_rows = row_ptr_array.get_num_elems() - 1; + auto block_row_ptrs = block_row_ptr_array.get_data(); + auto num_block_rows = block_row_ptr_array.get_num_elems() - 1; if (nnz == 0) { - components::fill_array(exec, row_ptrs, num_rows + 1, IndexType{}); - col_idx_array.resize_and_reset(0); - value_array.resize_and_reset(0); + components::fill_array(exec, block_row_ptrs, num_block_rows + 1, + 
IndexType{}); + block_col_idx_array.resize_and_reset(0); + block_value_array.resize_and_reset(0); return; } auto in_rows = data.get_row_idxs(); @@ -200,13 +201,13 @@ void fill_in_matrix_data(std::shared_ptr exec, auto num_blocks = static_cast( thrust::count_if(thrust::device, iota, iota + nnz, adj_predicate)); // allocate storage - Array row_idx_array{exec, num_blocks}; + Array block_row_idx_array{exec, num_blocks}; Array block_ptr_array{exec, num_blocks}; - col_idx_array.resize_and_reset(num_blocks); - value_array.resize_and_reset(num_blocks * bs * bs); - auto row_idxs = row_idx_array.get_data(); - auto col_idxs = col_idx_array.get_data(); - auto values = as_device_type(value_array.get_data()); + block_col_idx_array.resize_and_reset(num_blocks); + block_value_array.resize_and_reset(num_blocks * bs * bs); + auto row_idxs = block_row_idx_array.get_data(); + auto col_idxs = block_col_idx_array.get_data(); + auto values = as_device_type(block_value_array.get_data()); auto block_ptrs = block_ptr_array.get_data(); auto block_ptr_it = thrust::device_pointer_cast(block_ptrs); // write (block_row, block_col, block_start_idx) tuples for each block @@ -221,12 +222,12 @@ void fill_in_matrix_data(std::shared_ptr exec, return thrust::make_tuple(in_rows[i] / bs, in_cols[i] / bs); }); // build row pointers from row indices - components::convert_idxs_to_ptrs(exec, row_idx_array.get_const_data(), - row_idx_array.get_num_elems(), num_rows, - row_ptrs); + components::convert_idxs_to_ptrs(exec, block_row_idx_array.get_const_data(), + block_row_idx_array.get_num_elems(), + num_block_rows, block_row_ptrs); // fill in values - components::fill_array(exec, value_array.get_data(), num_blocks * bs * bs, - zero()); + components::fill_array(exec, block_value_array.get_data(), + num_blocks * bs * bs, zero()); thrust::for_each_n( thrust::device, iota, num_blocks, [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals, diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index d60323c40c0..8afefc44972 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -36,6 +36,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include +#include +#include +#include +#include +#include + + #include #include #include @@ -47,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/dense_kernels.hpp" @@ -97,6 +107,7 @@ using spgeam_kernels = syn::value_list; +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/csr_kernels.hpp.inc" @@ -304,8 +315,6 @@ void load_balance_spmv(std::shared_ptr exec, as_cuda_type(c->get_stride())); } } - } else { - GKO_NOT_SUPPORTED(nwarps); } } @@ -897,16 +906,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -template -void convert_to_fbcsr(std::shared_ptr exec, - const matrix::Csr* source, int bs, - Array& row_ptrs, Array& col_idxs, - Array& values) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); - - template void transpose(std::shared_ptr exec, const matrix::Csr* orig, @@ -1006,7 +1005,7 @@ void conj_transpose(std::shared_ptr exec, idxBase, alg, buffer); #endif if (grid_size > 0) { - conjugate_kernel<<>>( + kernel::conjugate<<>>( trans->get_num_stored_elements(), as_cuda_type(trans->get_values())); } @@ -1028,7 +1027,7 @@ void inv_symm_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - inv_row_ptr_permute_kernel<<>>( + kernel::inv_row_ptr_permute<<>>( num_rows, perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs()); } @@ -1036,7 +1035,7 @@ void inv_symm_permute(std::shared_ptr exec, auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { - inv_symm_permute_kernel + kernel::inv_symm_permute <<>>( num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), @@ -1059,7 +1058,7 @@ void row_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - row_ptr_permute_kernel<<>>( + kernel::row_ptr_permute<<>>( num_rows, perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); } @@ -1067,7 +1066,7 @@ void row_permute(std::shared_ptr exec, auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { - row_permute_kernel + kernel::row_permute <<>>( num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), @@ -1090,7 +1089,7 @@ void inverse_row_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - inv_row_ptr_permute_kernel<<>>( + kernel::inv_row_ptr_permute<<>>( num_rows, perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); } @@ -1098,7 +1097,7 @@ void inverse_row_permute(std::shared_ptr exec, auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { - inv_row_permute_kernel + kernel::inv_row_permute <<>>( num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 9537d6595f0..bc55a60ac93 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include @@ -291,7 +292,18 @@ template void convert_to_fbcsr(std::shared_ptr exec, const matrix::Dense* source, matrix::Fbcsr* result) - GKO_NOT_IMPLEMENTED; +{ + const auto num_block_rows = result->get_num_block_rows(); + if (num_block_rows > 0) { + const auto num_blocks = + ceildiv(num_block_rows, default_block_size / config::warp_size); + kernel::convert_to_fbcsr<<>>( + num_block_rows, result->get_num_block_cols(), source->get_stride(), + result->get_block_size(), as_cuda_type(source->get_const_values()), + result->get_const_row_ptrs(), result->get_col_idxs(), + as_cuda_type(result->get_values())); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); @@ -300,8 +312,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void count_nonzero_blocks_per_row(std::shared_ptr exec, const matrix::Dense* source, - int bs, - IndexType* result) GKO_NOT_IMPLEMENTED; + int bs, IndexType* result) +{ + const auto num_block_rows = source->get_size()[0] / bs; + const auto num_block_cols = source->get_size()[1] / bs; + if (num_block_rows > 0) { + const auto num_blocks = + ceildiv(num_block_rows, default_block_size / config::warp_size); + kernel:: + count_nonzero_blocks_per_row<<>>( + num_block_rows, num_block_cols, source->get_stride(), bs, + as_cuda_type(source->get_const_values()), result); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); @@ -407,9 +430,9 @@ void transpose(std::shared_ptr exec, auto beta = zero(); cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, orig->get_size()[0], orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, - static_cast(nullptr), trans->get_size()[1], - trans->get_values(), trans->get_stride()); + orig->get_stride(), &beta, trans->get_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; @@ -432,9 +455,9 @@ void conj_transpose(std::shared_ptr exec, auto beta = zero(); cublas::geam(handle, CUBLAS_OP_C, CUBLAS_OP_N, orig->get_size()[0], orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, - static_cast(nullptr), trans->get_size()[1], - trans->get_values(), trans->get_stride()); + orig->get_stride(), &beta, trans->get_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index b4fe2fc1325..d0a73d67c2a 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -57,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" @@ -70,6 +71,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/components/merging.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" namespace gko { @@ -77,20 +79,6 @@ namespace kernels { namespace cuda { -namespace csr_reuse { - - -constexpr int warps_in_block = 4; -constexpr int spmv_block_size = warps_in_block * config::warp_size; -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/matrix/csr_kernels.hpp.inc" - - -} // namespace csr_reuse - - /** * @brief The fixed-size block compressed sparse row matrix format namespace. * @@ -102,7 +90,7 @@ namespace fbcsr { constexpr int default_block_size{512}; -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc" @@ -114,6 +102,9 @@ void dense_transpose(std::shared_ptr exec, const size_type orig_stride, const ValueType* const orig, const size_type trans_stride, ValueType* const trans) { + if (nrows == 0) { + return; + } if (cublas::is_supported::value) { auto handle = exec->get_cublas_handle(); { @@ -121,8 +112,7 @@ void dense_transpose(std::shared_ptr exec, auto alpha = one(); auto beta = zero(); cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, nrows, ncols, &alpha, - orig, orig_stride, &beta, - static_cast(nullptr), nrows, trans, + orig, orig_stride, &beta, trans, trans_stride, trans, trans_stride); } } else { @@ -139,6 +129,15 @@ void spmv(std::shared_ptr exec, const matrix::Dense* const b, matrix::Dense* const c) { + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { + // empty input: fill output with zero + dense::fill(exec, c, zero()); + return; + } if (cusparse::is_supported::value) { auto handle = exec->get_cusparse_handle(); cusparse::pointer_mode_guard pm_guard(handle); @@ -153,21 +152,24 @@ void spmv(std::shared_ptr exec, const IndexType nb = a->get_num_block_cols(); const auto nnzb = static_cast(a->get_num_stored_blocks()); const auto nrhs = static_cast(b->get_size()[1]); - assert(nrhs == c->get_size()[1]); - if (nrhs == 1) { + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), &beta, c->get_values()); } else { - auto trans_c = - Array(exec, c->get_size()[0] * c->get_size()[1]); + const auto trans_stride = nrows; + auto trans_c = Array(exec, nrows * nrhs); cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), nrhs, &beta, - trans_c.get_data(), mb * bs); - dense_transpose(exec, nrhs, mb * bs, mb * bs, trans_c.get_data(), - c->get_stride(), c->get_values()); + b->get_const_values(), in_stride, &beta, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); } cusparse::destroy(descr); } else { @@ -186,6 +188,15 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* const beta, matrix::Dense* const c) { + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0 
|| a->get_num_stored_blocks() == 0) { + // empty input: scale output + dense::scale(exec, beta, c); + return; + } if (cusparse::is_supported::value) { auto handle = exec->get_cusparse_handle(); const auto alphp = alpha->get_const_values(); @@ -199,23 +210,26 @@ void advanced_spmv(std::shared_ptr exec, const IndexType nb = a->get_num_block_cols(); const auto nnzb = static_cast(a->get_num_stored_blocks()); const auto nrhs = static_cast(b->get_size()[1]); - assert(nrhs == c->get_size()[1]); - if (nrhs == 1) { + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), betap, c->get_values()); } else { - auto trans_c = - Array(exec, c->get_size()[0] * c->get_size()[1]); - dense_transpose(exec, mb * bs, nrhs, c->get_stride(), - c->get_values(), mb * bs, trans_c.get_data()); + const auto trans_stride = nrows; + auto trans_c = Array(exec, nrows * nrhs); + dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), + trans_stride, trans_c.get_data()); cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), nrhs, betap, - trans_c.get_data(), mb * bs); - dense_transpose(exec, nrhs, mb * bs, mb * bs, trans_c.get_data(), - c->get_stride(), c->get_values()); + b->get_const_values(), in_stride, betap, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); } cusparse::destroy(descr); } else { @@ -341,7 +355,7 @@ void conj_transpose(std::shared_ptr exec, ceildiv(trans->get_num_stored_elements(), default_block_size); transpose(exec, orig, trans); if (grid_size > 0) { - csr_reuse::conjugate_kernel<<>>( + kernel::conjugate<<>>( trans->get_num_stored_elements(), as_cuda_type(trans->get_values())); } @@ -367,7 +381,7 @@ void is_sorted_by_column_index( static_cast(to_check->get_num_block_rows()); const auto num_blocks = ceildiv(num_brows, block_size); if (num_blocks > 0) { - csr_reuse::kernel::check_unsorted<<>>( + kernel::check_unsorted<<>>( to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), num_brows, gpu_array.get_data()); } diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index f108eacec05..c9989afe471 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1139,8 +1139,6 @@ void spmv(std::shared_ptr exec, a->get_const_col_idxs(), a->get_const_row_ptrs(), a->get_const_srow(), b->get_const_values(), b->get_stride(), c->get_values(), c->get_stride()); - } else { - GKO_NOT_SUPPORTED(nwarps); } } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = @@ -1235,8 +1233,6 @@ void advanced_spmv(std::shared_ptr exec, a->get_const_col_idxs(), a->get_const_row_ptrs(), a->get_const_srow(), b->get_const_values(), b->get_stride(), c->get_values(), c->get_stride()); - } else { - GKO_NOT_SUPPORTED(nwarps); } } else if (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse") { diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp new file mode 100644 index 00000000000..e92c329fba1 --- /dev/null +++ b/hip/base/hipsparse_block_bindings.hip.hpp 
@@ -0,0 +1,164 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ +#define GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ + + +#include + + +#include + + +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/types.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The HIPSPARSE namespace. 
+ * + * @ingroup hipsparse + */ +namespace hipsparse { + + +/// Default storage layout within each small dense block +constexpr hipsparseDirection_t blockDir = HIPSPARSE_DIRECTION_COLUMN; + + +#define GKO_BIND_HIPSPARSE32_BSRMV(ValueType, HipsparseName) \ + inline void bsrmv(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + int32 mb, int32 nb, int32 nnzb, const ValueType* alpha, \ + const hipsparseMatDescr_t descrA, const ValueType* valA, \ + const int32* rowPtrA, const int32* colIndA, \ + int block_size, const ValueType* x, \ + const ValueType* beta, ValueType* y) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, blockDir, transA, mb, nb, nnzb, as_hiplibs_type(alpha), \ + descrA, as_hiplibs_type(valA), rowPtrA, colIndA, block_size, \ + as_hiplibs_type(x), as_hiplibs_type(beta), as_hiplibs_type(y))); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE64_BSRMV(ValueType, HipsparseName) \ + inline void bsrmv(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + int64 mb, int64 nb, int64 nnzb, const ValueType* alpha, \ + const hipsparseMatDescr_t descrA, const ValueType* valA, \ + const int64* rowPtrA, const int64* colIndA, \ + int block_size, const ValueType* x, \ + const ValueType* beta, ValueType* y) \ + GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_BSRMV(float, hipsparseSbsrmv); +GKO_BIND_HIPSPARSE32_BSRMV(double, hipsparseDbsrmv); +GKO_BIND_HIPSPARSE32_BSRMV(std::complex, hipsparseCbsrmv); +GKO_BIND_HIPSPARSE32_BSRMV(std::complex, hipsparseZbsrmv); +GKO_BIND_HIPSPARSE64_BSRMV(float, hipsparseSbsrmv); +GKO_BIND_HIPSPARSE64_BSRMV(double, hipsparseDbsrmv); +GKO_BIND_HIPSPARSE64_BSRMV(std::complex, hipsparseCbsrmv); +GKO_BIND_HIPSPARSE64_BSRMV(std::complex, hipsparseZbsrmv); +template +GKO_BIND_HIPSPARSE32_BSRMV(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE64_BSRMV(ValueType, detail::not_implemented); + + +#undef GKO_BIND_HIPSPARSE32_BSRMV +#undef GKO_BIND_HIPSPARSE64_BSRMV + + +#define GKO_BIND_HIPSPARSE32_BSRMM(ValueType, HipsparseName) \ + inline void bsrmm(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + hipsparseOperation_t transB, int32 mb, int32 n, \ + int32 kb, int32 nnzb, const ValueType* alpha, \ + const hipsparseMatDescr_t descrA, const ValueType* valA, \ + const int32* rowPtrA, const int32* colIndA, \ + int block_size, const ValueType* B, int32 ldb, \ + const ValueType* beta, ValueType* C, int32 ldc) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, blockDir, transA, transB, mb, n, kb, nnzb, \ + as_hiplibs_type(alpha), descrA, as_hiplibs_type(valA), rowPtrA, \ + colIndA, block_size, as_hiplibs_type(B), ldb, \ + as_hiplibs_type(beta), as_hiplibs_type(C), ldc)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE64_BSRMM(ValueType, HipsparseName) \ + inline void bsrmm( \ + hipsparseHandle_t handle, hipsparseOperation_t transA, \ + hipsparseOperation_t transB, int64 mb, int64 n, int64 kb, int64 nnzb, \ + const ValueType* alpha, const hipsparseMatDescr_t descrA, \ + const ValueType* valA, const int64* rowPtrA, const int64* colIndA, \ + int block_size, const ValueType* B, int64 ldb, const ValueType* beta, \ + ValueType* C, int64 ldc) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used 
to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_BSRMM(float, hipsparseSbsrmm); +GKO_BIND_HIPSPARSE32_BSRMM(double, hipsparseDbsrmm); +GKO_BIND_HIPSPARSE32_BSRMM(std::complex, hipsparseCbsrmm); +GKO_BIND_HIPSPARSE32_BSRMM(std::complex, hipsparseZbsrmm); +GKO_BIND_HIPSPARSE64_BSRMM(float, hipsparseSbsrmm); +GKO_BIND_HIPSPARSE64_BSRMM(double, hipsparseDbsrmm); +GKO_BIND_HIPSPARSE64_BSRMM(std::complex, hipsparseCbsrmm); +GKO_BIND_HIPSPARSE64_BSRMM(std::complex, hipsparseZbsrmm); +template +GKO_BIND_HIPSPARSE32_BSRMM(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE64_BSRMM(ValueType, detail::not_implemented); + + +#undef GKO_BIND_HIPSPARSE32_BSRMM +#undef GKO_BIND_HIPSPARSE64_BSRMM + + +} // namespace hipsparse +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index a9fcb49e8e6..847eee34c3d 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -37,6 +37,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include +#include +#include +#include +#include +#include #include @@ -50,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/dense_kernels.hpp" @@ -99,6 +107,7 @@ using spgeam_kernels = syn::value_list; +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/csr_kernels.hpp.inc" @@ -296,8 +305,6 @@ void spmv(std::shared_ptr exec, as_hip_type(b->get_const_values()), as_hip_type(b->get_stride()), as_hip_type(c->get_values()), as_hip_type(c->get_stride())); - } else { - GKO_NOT_SUPPORTED(nwarps); } } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = @@ -392,8 +399,6 @@ void advanced_spmv(std::shared_ptr exec, as_hip_type(b->get_const_values()), as_hip_type(b->get_stride()), as_hip_type(c->get_values()), as_hip_type(c->get_stride())); - } else { - GKO_NOT_SUPPORTED(nwarps); } } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = @@ -730,16 +735,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -template -void convert_to_fbcsr(std::shared_ptr exec, - const matrix::Csr* source, int bs, - Array& row_ptrs, Array& col_idxs, - Array& values) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); - - template void transpose(std::shared_ptr exec, const matrix::Csr* orig, @@ -790,7 +785,7 @@ void conj_transpose(std::shared_ptr exec, trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase); if (grid_size > 0) { - hipLaunchKernelGGL(conjugate_kernel, grid_size, block_size, 0, 0, + hipLaunchKernelGGL(kernel::conjugate, grid_size, block_size, 0, 0, trans->get_num_stored_elements(), as_hip_type(trans->get_values())); } @@ -812,7 +807,7 @@ void inv_symm_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(inv_row_ptr_permute_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::inv_row_ptr_permute), count_num_blocks, default_block_size, 0, 0, num_rows, 
perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs()); @@ -822,7 +817,7 @@ void inv_symm_permute(std::shared_ptr exec, ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(inv_symm_permute_kernel), + HIP_KERNEL_NAME(kernel::inv_symm_permute), copy_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), as_hip_type(orig->get_const_values()), permuted->get_row_ptrs(), @@ -842,7 +837,7 @@ void row_permute(std::shared_ptr exec, const IndexType* perm, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(row_ptr_permute_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::row_ptr_permute), count_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); @@ -852,7 +847,7 @@ void row_permute(std::shared_ptr exec, const IndexType* perm, ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(row_permute_kernel), + HIP_KERNEL_NAME(kernel::row_permute), copy_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), as_hip_type(orig->get_const_values()), row_permuted->get_row_ptrs(), @@ -874,7 +869,7 @@ void inverse_row_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(inv_row_ptr_permute_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::inv_row_ptr_permute), count_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); @@ -884,7 +879,7 @@ void inverse_row_permute(std::shared_ptr exec, ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(inv_row_permute_kernel), + HIP_KERNEL_NAME(kernel::inv_row_permute), copy_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), as_hip_type(orig->get_const_values()), row_permuted->get_row_ptrs(), diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 6ace089e2ea..4b63be8f905 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include @@ -296,7 +297,18 @@ template void convert_to_fbcsr(std::shared_ptr exec, const matrix::Dense* source, matrix::Fbcsr* result) - GKO_NOT_IMPLEMENTED; +{ + const auto num_block_rows = result->get_num_block_rows(); + if (num_block_rows > 0) { + const auto num_blocks = + ceildiv(num_block_rows, default_block_size / config::warp_size); + kernel::convert_to_fbcsr<<>>( + num_block_rows, result->get_num_block_cols(), source->get_stride(), + result->get_block_size(), as_hip_type(source->get_const_values()), + result->get_const_row_ptrs(), result->get_col_idxs(), + as_hip_type(result->get_values())); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); @@ -305,8 +317,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void count_nonzero_blocks_per_row(std::shared_ptr exec, const matrix::Dense* source, - int bs, - IndexType* result) GKO_NOT_IMPLEMENTED; + int bs, IndexType* result) +{ + const auto num_block_rows = source->get_size()[0] / bs; + const auto num_block_cols = source->get_size()[1] / bs; + if (num_block_rows > 0) { + const auto num_blocks = + ceildiv(num_block_rows, default_block_size / config::warp_size); + kernel:: + count_nonzero_blocks_per_row<<>>( + num_block_rows, num_block_cols, source->get_stride(), bs, + as_hip_type(source->get_const_values()), result); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); @@ -417,7 +440,7 @@ void transpose(std::shared_ptr exec, hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, orig->get_size()[0], orig->get_size()[1], &alpha, orig->get_const_values(), orig->get_stride(), &beta, - orig->get_const_values(), trans->get_size()[1], + trans->get_const_values(), trans->get_stride(), trans->get_values(), trans->get_stride()); } } else { @@ -442,7 +465,7 @@ void conj_transpose(std::shared_ptr exec, hipblas::geam(handle, HIPBLAS_OP_C, HIPBLAS_OP_N, orig->get_size()[0], orig->get_size()[1], &alpha, orig->get_const_values(), orig->get_stride(), &beta, - orig->get_const_values(), trans->get_size()[1], + trans->get_values(), trans->get_stride(), trans->get_values(), trans->get_stride()); } } else { diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 33ed4a08e2e..096554cb9df 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -54,15 +54,31 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common/unified/base/kernel_launch.hpp" +#include "core/base/block_sizes.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" +#include "hip/base/hipblas_bindings.hip.hpp" +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/hipsparse_block_bindings.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/pointer_mode_guard.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/atomic.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/merging.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" namespace gko { namespace kernels { namespace hip { + + /** * @brief The fixed-size block compressed sparse row matrix format namespace. * @@ -74,25 +90,152 @@ namespace fbcsr { constexpr int default_block_size{512}; +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc" +namespace { + +template +void dense_transpose(std::shared_ptr exec, + const size_type nrows, const size_type ncols, + const size_type orig_stride, const ValueType* const orig, + const size_type trans_stride, ValueType* const trans) +{ + if (nrows == 0) { + return; + } + if (hipblas::is_supported::value) { + auto handle = exec->get_hipblas_handle(); + { + hipblas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, nrows, ncols, + &alpha, orig, orig_stride, &beta, trans, trans_stride, + trans, trans_stride); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +} // namespace + + template void spmv(std::shared_ptr exec, - const matrix::Fbcsr* a, - const matrix::Dense* b, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + matrix::Dense* const c) +{ + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0) { + // empty input: fill output with zero + dense::fill(exec, c, zero()); + return; + } + if (hipsparse::is_supported::value) { + auto handle = exec->get_hipsparse_handle(); + hipsparse::pointer_mode_guard pm_guard(handle); + const auto alpha = one(); + const auto beta = zero(); + auto descr = hipsparse::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { + hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, &alpha, descr, values, row_ptrs, col_idxs, + bs, b->get_const_values(), &beta, c->get_values()); + } else { + const auto trans_stride = nrows; + auto trans_c = Array(exec, nrows * nrhs); + hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + 
HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + &alpha, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, &beta, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); + } + hipsparse::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Fbcsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; + const matrix::Dense* const alpha, + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + const matrix::Dense* const beta, + matrix::Dense* const c) +{ + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0) { + // empty input: scale output + dense::scale(exec, beta, c); + return; + } + if (hipsparse::is_supported::value) { + auto handle = exec->get_hipsparse_handle(); + const auto alphp = alpha->get_const_values(); + const auto betap = beta->get_const_values(); + auto descr = hipsparse::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { + hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), betap, c->get_values()); + } else { + const auto trans_stride = nrows; + auto trans_c = Array(exec, nrows * nrhs); + dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), + trans_stride, trans_c.get_data()); + hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, betap, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); + } + hipsparse::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); @@ -101,7 +244,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void fill_in_dense(std::shared_ptr exec, const matrix::Fbcsr* source, - matrix::Dense* result) GKO_NOT_IMPLEMENTED; + matrix::Dense* result) +{ + constexpr auto warps_per_block = default_block_size / config::warp_size; + const auto num_blocks = + ceildiv(source->get_num_block_rows(), warps_per_block); + if (num_blocks > 0) { + kernel::fill_in_dense<<>>( + source->get_const_row_ptrs(), source->get_const_col_idxs(), + as_hip_type(source->get_const_values()), + as_hip_type(result->get_values()), result->get_stride(), + source->get_num_block_rows(), source->get_block_size()); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); @@ -116,10 +271,8 @@ void convert_to_csr(const std::shared_ptr exec, const auto 
num_blocks = ceildiv(source->get_num_block_rows(), warps_per_block); if (num_blocks > 0) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::convert_to_csr), num_blocks, - default_block_size, 0, 0, source->get_const_row_ptrs(), - source->get_const_col_idxs(), + kernel::convert_to_csr<<>>( + source->get_const_row_ptrs(), source->get_const_col_idxs(), as_hip_type(source->get_const_values()), result->get_row_ptrs(), result->get_col_idxs(), as_hip_type(result->get_values()), source->get_num_block_rows(), source->get_block_size()); @@ -131,9 +284,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void transpose(std::shared_ptr exec, - const matrix::Fbcsr* orig, - matrix::Fbcsr* trans) GKO_NOT_IMPLEMENTED; +void transpose(const std::shared_ptr exec, + const matrix::Fbcsr* const orig, + matrix::Fbcsr* const trans) + GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); @@ -152,8 +306,25 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void is_sorted_by_column_index( std::shared_ptr exec, - const matrix::Fbcsr* to_check, - bool* is_sorted) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const to_check, + bool* const is_sorted) +{ + *is_sorted = true; + auto gpu_array = Array(exec, 1); + // need to initialize the GPU value to true + exec->copy_from(exec->get_master().get(), 1, is_sorted, + gpu_array.get_data()); + auto block_size = default_block_size; + const auto num_brows = + static_cast(to_check->get_num_block_rows()); + const auto num_blocks = ceildiv(num_brows, block_size); + if (num_blocks > 0) { + kernel::check_unsorted<<>>( + to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), + num_brows, gpu_array.get_data()); + } + *is_sorted = exec->copy_val_to_host(gpu_array.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index ed0bbc32c99..29e6479a282 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -200,35 +200,27 @@ struct CsrWithAutomaticalStrategy struct Ell : SimpleMatrixTest> {}; -struct FbcsrBlocksize1 - : SimpleMatrixTest> { +template +struct Fbcsr : SimpleMatrixTest> { static bool preserves_zeros() { return false; } static std::unique_ptr create( std::shared_ptr exec, gko::dim<2> size) { - return matrix_type::create(exec, size, 0, 1); + size[0] = gko::ceildiv(size[0], block_size) * block_size; + size[1] = gko::ceildiv(size[1], block_size) * block_size; + return matrix_type::create(exec, size, 0, block_size); } static void check_property(const std::unique_ptr& mtx) { - ASSERT_EQ(mtx->get_block_size(), 1); + ASSERT_EQ(mtx->get_block_size(), block_size); } -}; - -struct FbcsrBlocksize2 - : SimpleMatrixTest> { - static bool preserves_zeros() { return false; } - static std::unique_ptr create( - std::shared_ptr exec, gko::dim<2> size) - { - return matrix_type::create(exec, size, 0, 2); - } - - static void check_property(const std::unique_ptr& mtx) + static void modify_data(gko::matrix_data& data) { - ASSERT_EQ(mtx->get_block_size(), 2); + data.size[0] = gko::ceildiv(data.size[0], block_size) * block_size; + data.size[1] = gko::ceildiv(data.size[1], block_size) * block_size; } }; @@ -484,9 +476,10 @@ class Matrix : public ::testing::Test { template void forall_matrix_data_scenarios(TestFunction fn) { - auto guarded_fn = [&](auto mtx) { + auto guarded_fn = [&](auto data) { try { - fn(std::move(mtx)); + Config::modify_data(data); + fn(std::move(data)); } catch (std::exception& e) { 
FAIL() << e.what(); } @@ -507,6 +500,14 @@ class Matrix : public ::testing::Test { SCOPED_TRACE("Zero matrix (200x100)"); guarded_fn(gen_mtx_data(200, 100, 0, 0)); } + { + SCOPED_TRACE("Small Sparse Matrix with variable row nnz (10x10)"); + guarded_fn(gen_mtx_data(10, 10, 1, 5)); + } + { + SCOPED_TRACE("Small Dense Matrix (10x10)"); + guarded_fn(gen_mtx_data(10, 10, 10, 10)); + } { SCOPED_TRACE("Sparse Matrix with some zeros rows (200x100)"); guarded_fn(gen_mtx_data(200, 100, 0, 50)); @@ -637,19 +638,25 @@ class Matrix : public ::testing::Test { using MatrixTypes = ::testing::Types< DenseWithDefaultStride, DenseWithCustomStride, Coo, CsrWithDefaultStrategy, - // The strategies have issues with zero rows - /* - #if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \ - defined(GKO_COMPILING_DPCPP) - CsrWithClassicalStrategy, CsrWithMergePathStrategy, - CsrWithSparselibStrategy, CsrWithLoadBalanceStrategy, - CsrWithAutomaticalStrategy, - #endif - */ - Ell, FbcsrBlocksize1, FbcsrBlocksize2, SellpDefaultParameters, - Sellp32Factor2, HybridDefaultStrategy, HybridColumnLimitStrategy, - HybridImbalanceLimitStrategy, HybridImbalanceBoundedLimitStrategy, - HybridMinStorageStrategy, HybridAutomaticStrategy, SparsityCsr>; +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \ + defined(GKO_COMPILING_DPCPP) + CsrWithClassicalStrategy, CsrWithMergePathStrategy, + CsrWithSparselibStrategy, CsrWithLoadBalanceStrategy, + CsrWithAutomaticalStrategy, +#endif + Ell, +#ifdef GKO_COMPILING_OMP + // CUDA doesn't allow blocksize 1 + Fbcsr<1>, +#endif +#if defined(GKO_COMPILING_OMP) || defined(GKO_COMPILING_CUDA) + // Fbcsr is broken on ROCm + Fbcsr<2>, Fbcsr<3>, +#endif + SellpDefaultParameters, Sellp32Factor2, HybridDefaultStrategy, + HybridColumnLimitStrategy, HybridImbalanceLimitStrategy, + HybridImbalanceBoundedLimitStrategy, HybridMinStorageStrategy, + HybridAutomaticStrategy, SparsityCsr>; TYPED_TEST_SUITE(Matrix, MatrixTypes, TypenameNameGenerator); @@ -810,7 +817,6 @@ TYPED_TEST(Matrix, ReadWriteRoundtrip) auto new_mtx = TestConfig::create(this->exec, data.size); gko::matrix_data out_data; - TestConfig::modify_data(data); new_mtx->read(data); new_mtx->write(out_data); From 20562f4aaa5478b73c4e96fee91bb8797df4b523 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Mar 2022 16:37:46 +0100 Subject: [PATCH 04/17] improve and fix Dense -> Fbcsr conversion --- common/cuda_hip/matrix/dense_kernels.hpp.inc | 116 ++++++++++--------- core/matrix/dense.cpp | 1 + 2 files changed, 60 insertions(+), 57 deletions(-) diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index 9f0a5c0e4b7..f60747ae190 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -40,7 +40,8 @@ __global__ int block_size, const ValueType* __restrict__ source, IndexType* __restrict__ block_row_nnz) { - const auto brow = thread::get_subwarp_id_flat(); + const auto brow = + thread::get_subwarp_id_flat(); if (brow >= num_block_rows) { return; @@ -51,11 +52,9 @@ __global__ auto warp = group::tiled_partition(group::this_thread_block()); const auto lane = static_cast(warp.thread_rank()); - const auto block_mask = - block_size >= config::warp_size - ? 
~config::lane_mask_type{} - : (config::lane_mask_type{1} << block_size) - 1u; - bool previous_block_nonzero = false; + constexpr auto full_mask = ~config::lane_mask_type{}; + constexpr auto one_mask = config::lane_mask_type{1}; + bool first_block_nonzero = false; IndexType block_count{}; for (IndexType base_col = 0; base_col < num_cols; base_col += config::warp_size) { @@ -63,37 +62,39 @@ __global__ const auto block_local_col = col % block_size; // which is the first column in the current block? const auto block_base_col = col - block_local_col; - // which lowest thread in the warp does this column belong to? - const auto block_base_thread = block_base_col - base_col; // collect nonzero bitmask - config::lane_mask_type nonzero_mask{previous_block_nonzero ? 1u : 0u}; + bool local_nonzero = false; for (int local_row = 0; local_row < block_size; local_row++) { const auto row = local_row + brow * block_size; - nonzero_mask |= warp.ballot(col < num_cols && - is_nonzero(source[row * stride + col])); + local_nonzero |= + col < num_cols && is_nonzero(source[row * stride + col]); } - const auto local_mask = - (block_base_thread < 0 ? nonzero_mask << -block_base_thread - : nonzero_mask >> block_base_thread) & - block_mask; + auto nonzero_mask = + warp.ballot(local_nonzero) | (first_block_nonzero ? 1u : 0u); + // only consider threads in the current block + const auto first_thread = block_base_col - base_col; + const auto last_thread = first_thread + block_size; + assert(first_thread < int(config::warp_size)); + assert(last_thread >= 0); + // mask off everything below first_thread + const auto lower_mask = + first_thread < 0 ? full_mask : ~((one_mask << first_thread) - 1u); + // mask off everything from last_thread + const auto upper_mask = last_thread >= config::warp_size + ? full_mask + : ((one_mask << last_thread) - 1u); + const auto block_mask = upper_mask & lower_mask; + const auto local_mask = nonzero_mask & block_mask; if (block_local_col == block_size - 1) { block_count += local_mask ? 1 : 0; } // if we need to store something for the next iteration if ((base_col + config::warp_size) % block_size != 0) { - // what's the first column of the last block in the warp? - const auto last_block_base_col = - (base_col + static_cast(config::warp_size) - 1) / - block_size * block_size; - // which thread does it belong to? - const auto last_block_base_thread = last_block_base_col - base_col; - previous_block_nonzero = - bool((last_block_base_thread < 0 - ? 
nonzero_mask << -last_block_base_thread - : nonzero_mask >> last_block_base_thread) & - block_mask); + // check whether the last block (incomplete) in this warp is nonzero + first_block_nonzero = + bool(warp.ballot(bool(local_mask)) >> (config::warp_size - 1)); } else { - previous_block_nonzero = false; + first_block_nonzero = false; } } block_count = reduce(warp, block_count, @@ -111,7 +112,8 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( const IndexType* __restrict__ block_row_ptrs, IndexType* __restrict__ block_cols, ValueType* __restrict__ blocks) { - const auto brow = thread::get_subwarp_id_flat(); + const auto brow = + thread::get_subwarp_id_flat(); if (brow >= num_block_rows) { return; @@ -122,13 +124,10 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( auto warp = group::tiled_partition(group::this_thread_block()); const auto lane = static_cast(warp.thread_rank()); - auto lane_prefix_mask = - (config::lane_mask_type(1) << warp.thread_rank()) - 1; - const auto block_mask = - block_size >= config::warp_size - ? ~config::lane_mask_type{} - : (config::lane_mask_type{1} << block_size) - 1u; - bool previous_block_nonzero = false; + constexpr auto full_mask = ~config::lane_mask_type{}; + constexpr auto one_mask = config::lane_mask_type{1}; + const auto lane_prefix_mask = (one_mask << warp.thread_rank()) - 1u; + bool first_block_nonzero = false; auto block_base_nz = block_row_ptrs[brow]; for (IndexType base_col = 0; base_col < num_cols; base_col += config::warp_size) { @@ -136,19 +135,29 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( const auto block_local_col = col % block_size; // which is the first column in the current block? const auto block_base_col = col - block_local_col; - // which lowest thread in the warp does this column belong to? - const auto block_base_thread = block_base_col - base_col; // collect nonzero bitmask - config::lane_mask_type nonzero_mask{previous_block_nonzero ? 1u : 0u}; + bool local_nonzero = false; for (int local_row = 0; local_row < block_size; local_row++) { const auto row = local_row + brow * block_size; - nonzero_mask |= warp.ballot(col < num_cols && - is_nonzero(source[row * stride + col])); + local_nonzero |= + col < num_cols && is_nonzero(source[row * stride + col]); } - const auto local_mask = - (block_base_thread < 0 ? nonzero_mask << -block_base_thread - : nonzero_mask >> block_base_thread) & - block_mask; + auto nonzero_mask = + warp.ballot(local_nonzero) | (first_block_nonzero ? 1u : 0u); + // only consider threads in the current block + const auto first_thread = block_base_col - base_col; + const auto last_thread = first_thread + block_size; + assert(first_thread < int(config::warp_size)); + assert(last_thread >= 0); + // mask off everything below first_thread + const auto lower_mask = + first_thread < 0 ? full_mask : ~((one_mask << first_thread) - 1u); + // mask off everything from last_thread + const auto upper_mask = last_thread >= config::warp_size + ? 
full_mask + : ((one_mask << last_thread) - 1u); + const auto block_mask = upper_mask & lower_mask; + const auto local_mask = nonzero_mask & block_mask; const auto block_nonzero_mask = warp.ballot(local_mask && (block_local_col == block_size - 1)); @@ -160,7 +169,8 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( if (block_local_col == block_size - 1) { block_cols[block_nz] = col / block_size; } - if (col < num_cols) { + // only if we encountered elements in this column + if (local_nonzero) { for (int local_row = 0; local_row < block_size; local_row++) { const auto row = local_row + brow * block_size; blocks[local_row + block_local_col * block_size + @@ -170,19 +180,11 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( } // if we need to store something for the next iteration if ((base_col + config::warp_size) % block_size != 0) { - // what's the first column of the last block in the warp? - const auto last_block_base_col = - (base_col + static_cast(config::warp_size) - 1) / - block_size * block_size; - // which thread does it belong to? - const auto last_block_base_thread = last_block_base_col - base_col; - previous_block_nonzero = - bool((last_block_base_thread < 0 - ? nonzero_mask << -last_block_base_thread - : nonzero_mask >> last_block_base_thread) & - block_mask); + // check whether the last block (incomplete) in this warp is nonzero + first_block_nonzero = + bool(warp.ballot(bool(local_mask)) >> (config::warp_size - 1)); } else { - previous_block_nonzero = false; + first_block_nonzero = false; } // advance by the completed blocks block_base_nz += popcnt(block_nonzero_mask); diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index b2a92cbbe95..817233997a3 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -628,6 +628,7 @@ void Dense::convert_impl(Fbcsr* result) const exec->copy_val_to_host(tmp->get_const_row_ptrs() + row_blocks); tmp->col_idxs_.resize_and_reset(nnz_blocks); tmp->values_.resize_and_reset(nnz_blocks * bs * bs); + tmp->values_.fill(zero()); tmp->set_size(this->get_size()); exec->run(dense::make_convert_to_fbcsr(this, tmp.get())); } From 8ae8a9472441e59f4cc79da8bfcd1e639f3269f1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 3 Mar 2022 10:54:25 +0100 Subject: [PATCH 05/17] fix HIP Fbcsr SpMV It's still broken for strided accesses though --- hip/matrix/fbcsr_kernels.hip.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 096554cb9df..37b136e18d6 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -133,7 +133,7 @@ void spmv(std::shared_ptr exec, // empty output: nothing to do return; } - if (b->get_size()[0] == 0) { + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { // empty input: fill output with zero dense::fill(exec, c, zero()); return; @@ -192,7 +192,7 @@ void advanced_spmv(std::shared_ptr exec, // empty output: nothing to do return; } - if (b->get_size()[0] == 0) { + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { // empty input: scale output dense::scale(exec, beta, c); return; From 5dbb787a4520d9dc3910c6566e31efc978a7139a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 3 Mar 2022 11:09:58 +0100 Subject: [PATCH 06/17] review updates Co-authored-by: Yuhsiang Tsai --- common/cuda_hip/matrix/dense_kernels.hpp.inc | 6 +++--- common/cuda_hip/matrix/fbcsr_kernels.hpp.inc | 17 +++++++++-------- 
hip/matrix/fbcsr_kernels.hip.cpp | 2 ++ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index f60747ae190..b40d650a3a5 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -85,9 +85,9 @@ __global__ : ((one_mask << last_thread) - 1u); const auto block_mask = upper_mask & lower_mask; const auto local_mask = nonzero_mask & block_mask; - if (block_local_col == block_size - 1) { - block_count += local_mask ? 1 : 0; - } + // last column in the block increments the counter + block_count += + (block_local_col == block_size - 1 && local_mask) ? 1 : 0; // if we need to store something for the next iteration if ((base_col + config::warp_size) % block_size != 0) { // check whether the last block (incomplete) in this warp is nonzero diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index 66067d3ffa6..b703fc783ce 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -205,17 +205,17 @@ void fill_in_matrix_data(std::shared_ptr exec, Array block_ptr_array{exec, num_blocks}; block_col_idx_array.resize_and_reset(num_blocks); block_value_array.resize_and_reset(num_blocks * bs * bs); - auto row_idxs = block_row_idx_array.get_data(); - auto col_idxs = block_col_idx_array.get_data(); - auto values = as_device_type(block_value_array.get_data()); + auto block_row_idxs = block_row_idx_array.get_data(); + auto block_col_idxs = block_col_idx_array.get_data(); + auto block_values = as_device_type(block_value_array.get_data()); auto block_ptrs = block_ptr_array.get_data(); auto block_ptr_it = thrust::device_pointer_cast(block_ptrs); // write (block_row, block_col, block_start_idx) tuples for each block thrust::copy_if(thrust::device, iota, iota + nnz, block_ptr_it, adj_predicate); auto block_output_it = thrust::make_zip_iterator( - thrust::make_tuple(thrust::device_pointer_cast(row_idxs), - thrust::device_pointer_cast(col_idxs))); + thrust::make_tuple(thrust::device_pointer_cast(block_row_idxs), + thrust::device_pointer_cast(block_col_idxs))); thrust::transform( thrust::device, block_ptr_it, block_ptr_it + num_blocks, block_output_it, [bs, in_rows, in_cols] __device__(size_type i) { @@ -231,12 +231,13 @@ void fill_in_matrix_data(std::shared_ptr exec, thrust::for_each_n( thrust::device, iota, num_blocks, [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals, - values] __device__(size_type i) { + block_values] __device__(size_type i) { const auto block_begin = block_ptrs[i]; const auto block_end = i < num_blocks - 1 ? 
block_ptrs[i + 1] : nnz; for (auto nz = block_begin; nz < block_end; nz++) { - values[i * bs * bs + (in_cols[nz] % bs) * bs + - (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]); + block_values[i * bs * bs + (in_cols[nz] % bs) * bs + + (in_rows[nz] % bs)] = + fake_complex_unpack(in_vals[nz]); } }); } diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 37b136e18d6..0e021bce36b 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -96,6 +96,7 @@ constexpr int default_block_size{512}; namespace { + template void dense_transpose(std::shared_ptr exec, const size_type nrows, const size_type ncols, @@ -120,6 +121,7 @@ void dense_transpose(std::shared_ptr exec, } } + } // namespace From b57106a0615c104ea0e543cbd29c3817f7c6f08b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 3 Mar 2022 16:46:46 +0100 Subject: [PATCH 07/17] replace Array<...>::view by make_array_view --- core/base/composition.cpp | 7 ++- core/factorization/par_ic.cpp | 6 +-- core/factorization/par_ict.cpp | 4 +- core/factorization/par_ilut.cpp | 8 +-- core/matrix/dense.cpp | 4 +- core/preconditioner/jacobi.cpp | 6 +-- core/test/base/array.cpp | 22 ++++---- core/test/matrix/coo.cpp | 6 +-- core/test/matrix/csr.cpp | 6 +-- core/test/matrix/dense.cpp | 2 +- core/test/matrix/diagonal.cpp | 2 +- core/test/matrix/ell.cpp | 4 +- core/test/matrix/fbcsr.cpp | 6 +-- core/test/matrix/permutation.cpp | 8 +-- core/test/matrix/row_gatherer.cpp | 4 +- core/test/matrix/sparsity_csr.cpp | 4 +- core/test/utils/assertions_test.cpp | 2 +- .../par_ilut_approx_filter_kernel.cu | 4 +- cuda/factorization/par_ilut_filter_kernel.cu | 4 +- cuda/factorization/par_ilut_select_kernel.cu | 2 +- cuda/matrix/csr_kernels.cu | 2 +- .../par_ilut_approx_filter_kernel.hip.cpp | 4 +- .../par_ilut_filter_kernel.hip.cpp | 4 +- .../par_ilut_select_kernel.hip.cpp | 2 +- hip/matrix/csr_kernels.hip.cpp | 2 +- include/ginkgo/core/matrix/dense.hpp | 16 +++--- omp/factorization/par_ilut_kernels.cpp | 4 +- reference/factorization/par_ilut_kernels.cpp | 4 +- .../precision_conversion_kernels.cpp | 2 +- reference/test/matrix/permutation.cpp | 54 +++++++------------ 30 files changed, 91 insertions(+), 114 deletions(-) diff --git a/core/base/composition.cpp b/core/base/composition.cpp index 21140fe37e9..8f927fc8baf 100644 --- a/core/base/composition.cpp +++ b/core/base/composition.cpp @@ -82,8 +82,8 @@ std::unique_ptr apply_inner_operators( auto op_size = operators.back()->get_size(); auto out_dim = gko::dim<2>{op_size[0], num_rhs}; auto out_size = out_dim[0] * num_rhs; - auto out = Dense::create( - exec, out_dim, Array::view(exec, out_size, data), num_rhs); + auto out = Dense::create(exec, out_dim, + make_array_view(exec, out_size, data), num_rhs); // for operators with initial guess: set initial guess if (operators.back()->apply_uses_initial_guess()) { if (op_size[0] == op_size[1]) { @@ -111,8 +111,7 @@ std::unique_ptr apply_inner_operators( data + (reversed_storage ? 
storage_size - out_size : size_type{}); reversed_storage = !reversed_storage; out = Dense::create(exec, out_dim, - Array::view(exec, out_size, out_data), - num_rhs); + make_array_view(exec, out_size, out_data), num_rhs); // for operators with initial guess: set initial guess if (operators[i]->apply_uses_initial_guess()) { if (op_size[0] == op_size[1]) { diff --git a/core/factorization/par_ic.cpp b/core/factorization/par_ic.cpp index e54538ad9e3..a2d10271aff 100644 --- a/core/factorization/par_ic.cpp +++ b/core/factorization/par_ic.cpp @@ -122,12 +122,10 @@ std::unique_ptr> ParIc::generate( // build COO representation of lower factor Array l_row_idxs{exec, l_nnz}; // copy values from l_factor, which are the lower triangular values of A - auto l_vals_view = - Array::view(exec, l_nnz, l_factor->get_values()); + auto l_vals_view = make_array_view(exec, l_nnz, l_factor->get_values()); auto a_vals = Array{exec, l_vals_view}; auto a_row_idxs = Array{exec, l_nnz}; - auto a_col_idxs = - Array::view(exec, l_nnz, l_factor->get_col_idxs()); + auto a_col_idxs = make_array_view(exec, l_nnz, l_factor->get_col_idxs()); auto a_lower_coo = CooMatrix::create(exec, matrix_size, std::move(a_vals), std::move(a_col_idxs), std::move(a_row_idxs)); diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp index cfbdfeee88a..742c8b9811f 100644 --- a/core/factorization/par_ict.cpp +++ b/core/factorization/par_ict.cpp @@ -243,9 +243,9 @@ void ParIctState::iterate() l_builder.get_row_idx_array().resize_and_reset(l_nnz); // update arrays that will be aliased l_builder.get_col_idx_array() = - Array::view(exec, l_nnz, l_new->get_col_idxs()); + make_array_view(exec, l_nnz, l_new->get_col_idxs()); l_builder.get_value_array() = - Array::view(exec, l_nnz, l_new->get_values()); + make_array_view(exec, l_nnz, l_new->get_values()); } // convert L into COO format diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp index e54eac93af5..8f2c0edbfa1 100644 --- a/core/factorization/par_ilut.cpp +++ b/core/factorization/par_ilut.cpp @@ -277,13 +277,13 @@ void ParIlutState::iterate() u_csc_builder.get_value_array().resize_and_reset(u_nnz); // update arrays that will be aliased l_builder.get_col_idx_array() = - Array::view(exec, l_nnz, l_new->get_col_idxs()); + make_array_view(exec, l_nnz, l_new->get_col_idxs()); u_builder.get_col_idx_array() = - Array::view(exec, u_nnz, u_new->get_col_idxs()); + make_array_view(exec, u_nnz, u_new->get_col_idxs()); l_builder.get_value_array() = - Array::view(exec, l_nnz, l_new->get_values()); + make_array_view(exec, l_nnz, l_new->get_values()); u_builder.get_value_array() = - Array::view(exec, u_nnz, u_new->get_values()); + make_array_view(exec, u_nnz, u_new->get_values()); } // convert U' into CSC format diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 817233997a3..06fef2b96c4 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -471,8 +471,8 @@ void Dense::convert_to(Dense* result) const // matrix on the array to avoid special-casing cross-executor copies auto tmp_result = Dense{exec, result->get_size(), - Array::view(exec, result_array->get_num_elems(), - result_array->get_data()), + make_array_view(exec, result_array->get_num_elems(), + result_array->get_data()), result->get_stride()}; exec->run(dense::make_copy(this, &tmp_result)); } else { diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index e9f9352bf7b..22c7b61012e 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp 
@@ -273,9 +273,9 @@ void Jacobi::generate(const LinOp* system_matrix, if (!diag_vt) { GKO_NOT_SUPPORTED(system_matrix); } - auto temp = Array::view(diag_vt->get_executor(), - diag_vt->get_size()[0], - diag_vt->get_values()); + auto temp = + make_array_view(diag_vt->get_executor(), diag_vt->get_size()[0], + diag_vt->get_values()); this->blocks_ = Array(exec, temp.get_num_elems()); exec->run(jacobi::make_invert_diagonal(temp, this->blocks_)); this->num_blocks_ = diag_vt->get_size()[0]; diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp index d0acb5c68cf..d48bd0bea7e 100644 --- a/core/test/base/array.cpp +++ b/core/test/base/array.cpp @@ -356,7 +356,7 @@ TYPED_TEST(Array, CanBeResized) TYPED_TEST(Array, ViewCannotBeResized) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); EXPECT_THROW(view.resize_and_reset(1), gko::NotSupported); EXPECT_EQ(view.get_num_elems(), 3); @@ -416,7 +416,7 @@ TYPED_TEST(Array, ChangesExecutors) TYPED_TEST(Array, ViewModifiesOriginalData) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); TypeParam new_data[] = {5, 4, 2}; std::copy(new_data, new_data + 3, view.get_data()); @@ -448,11 +448,11 @@ TYPED_TEST(Array, CopyArrayToArray) TYPED_TEST(Array, CopyViewToView) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); TypeParam data2[] = {5, 4, 2}; - auto view2 = gko::Array::view(this->exec, 3, data2); + auto view2 = gko::make_array_view(this->exec, 3, data2); TypeParam data_size4[] = {5, 4, 2, 1}; - auto view_size4 = gko::Array::view(this->exec, 4, data_size4); + auto view_size4 = gko::make_array_view(this->exec, 4, data_size4); view = view2; view2.get_data()[0] = 2; @@ -470,7 +470,7 @@ TYPED_TEST(Array, CopyViewToView) TYPED_TEST(Array, CopyViewToArray) { TypeParam data[] = {1, 2, 3, 4}; - auto view = gko::Array::view(this->exec, 4, data); + auto view = gko::make_array_view(this->exec, 4, data); gko::Array array(this->exec, {5, 4, 2}); array = view; @@ -488,7 +488,7 @@ TYPED_TEST(Array, CopyViewToArray) TYPED_TEST(Array, CopyArrayToView) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); gko::Array array_size2(this->exec, {5, 4}); gko::Array array_size4(this->exec, {5, 4, 2, 1}); @@ -525,9 +525,9 @@ TYPED_TEST(Array, MoveArrayToArray) TYPED_TEST(Array, MoveViewToView) { TypeParam data[] = {1, 2, 3, 4}; - auto view = gko::Array::view(this->exec, 4, data); + auto view = gko::make_array_view(this->exec, 4, data); TypeParam data2[] = {5, 4, 2}; - auto view2 = gko::Array::view(this->exec, 3, data2); + auto view2 = gko::make_array_view(this->exec, 3, data2); view = std::move(view2); @@ -550,7 +550,7 @@ TYPED_TEST(Array, MoveViewToArray) { TypeParam data[] = {1, 2, 3, 4}; gko::Array array(this->exec, {5, 4, 2}); - auto view = gko::Array::view(this->exec, 4, data); + auto view = gko::make_array_view(this->exec, 4, data); array = std::move(view); @@ -572,7 +572,7 @@ TYPED_TEST(Array, MoveViewToArray) TYPED_TEST(Array, MoveArrayToView) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); gko::Array array_size2(this->exec, {5, 4}); gko::Array array_size4(this->exec, {5, 4, 2, 1}); auto size2_ptr = array_size2.get_data(); 
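For context on the API change in this patch: gko::make_array_view is a free function that deduces the value type from the data pointer, while gko::Array<T>::view requires spelling out T explicitly; both produce a non-owning view of existing memory. A minimal sketch of the two spellings — the executor, buffer, and size below are illustrative only and not taken from the patch:

    double data[] = {1.0, 2.0, 3.0};
    // old spelling: the value type is named explicitly
    auto view_old = gko::Array<double>::view(exec, 3, data);
    // new spelling used throughout this patch: the value type is deduced
    auto view_new = gko::make_array_view(exec, 3, data);
    // both views alias `data`, so writes through either are visible in the other
    view_new.get_data()[0] = 5.0;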
diff --git a/core/test/matrix/coo.cpp b/core/test/matrix/coo.cpp index 18438a6dd68..4cc17b145b9 100644 --- a/core/test/matrix/coo.cpp +++ b/core/test/matrix/coo.cpp @@ -142,9 +142,9 @@ TYPED_TEST(Coo, CanBeCreatedFromExistingData) auto mtx = gko::matrix::Coo::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 4, values), - gko::Array::view(this->exec, 4, col_idxs), - gko::Array::view(this->exec, 4, row_idxs)); + gko::make_array_view(this->exec, 4, values), + gko::make_array_view(this->exec, 4, col_idxs), + gko::make_array_view(this->exec, 4, row_idxs)); ASSERT_EQ(mtx->get_const_values(), values); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); diff --git a/core/test/matrix/csr.cpp b/core/test/matrix/csr.cpp index 166b1adb86b..85163b95e01 100644 --- a/core/test/matrix/csr.cpp +++ b/core/test/matrix/csr.cpp @@ -147,9 +147,9 @@ TYPED_TEST(Csr, CanBeCreatedFromExistingData) auto mtx = gko::matrix::Csr::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 4, values), - gko::Array::view(this->exec, 4, col_idxs), - gko::Array::view(this->exec, 4, row_ptrs), + gko::make_array_view(this->exec, 4, values), + gko::make_array_view(this->exec, 4, col_idxs), + gko::make_array_view(this->exec, 4, row_ptrs), std::make_shared(2)); ASSERT_EQ(mtx->get_num_srow_elements(), 1); diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp index 2c670fe8862..44b235f804d 100644 --- a/core/test/matrix/dense.cpp +++ b/core/test/matrix/dense.cpp @@ -131,7 +131,7 @@ TYPED_TEST(Dense, CanBeConstructedFromExistingData) auto m = gko::matrix::Dense::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 9, data), 3); + gko::make_array_view(this->exec, 9, data), 3); ASSERT_EQ(m->get_const_values(), data); ASSERT_EQ(m->at(2, 1), value_type{6.0}); diff --git a/core/test/matrix/diagonal.cpp b/core/test/matrix/diagonal.cpp index a9fca07aa37..bf8bcc221bf 100644 --- a/core/test/matrix/diagonal.cpp +++ b/core/test/matrix/diagonal.cpp @@ -108,7 +108,7 @@ TYPED_TEST(Diagonal, CanBeCreatedFromExistingData) value_type values[] = {1.0, 2.0, 3.0}; auto diag = gko::matrix::Diagonal::create( - this->exec, 3, gko::Array::view(this->exec, 3, values)); + this->exec, 3, gko::make_array_view(this->exec, 3, values)); ASSERT_EQ(diag->get_const_values(), values); } diff --git a/core/test/matrix/ell.cpp b/core/test/matrix/ell.cpp index e7c9fd622e0..d792e00d302 100644 --- a/core/test/matrix/ell.cpp +++ b/core/test/matrix/ell.cpp @@ -146,8 +146,8 @@ TYPED_TEST(Ell, CanBeCreatedFromExistingData) auto mtx = gko::matrix::Ell::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 8, values), - gko::Array::view(this->exec, 8, col_idxs), 2, 4); + gko::make_array_view(this->exec, 8, values), + gko::make_array_view(this->exec, 8, col_idxs), 2, 4); ASSERT_EQ(mtx->get_const_values(), values); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); diff --git a/core/test/matrix/fbcsr.cpp b/core/test/matrix/fbcsr.cpp index eddf6d4b753..4dc15755f48 100644 --- a/core/test/matrix/fbcsr.cpp +++ b/core/test/matrix/fbcsr.cpp @@ -398,9 +398,9 @@ TYPED_TEST(Fbcsr, CanBeCreatedFromExistingData) auto mtx = gko::matrix::Fbcsr::create( this->exec, gko::dim<2>{nbrows * bs, nbcols * bs}, bs, - gko::Array::view(this->exec, bnnz * bs * bs, values), - gko::Array::view(this->exec, bnnz, col_idxs), - gko::Array::view(this->exec, nbrows + 1, row_ptrs)); + gko::make_array_view(this->exec, bnnz * bs * bs, values), + gko::make_array_view(this->exec, bnnz, col_idxs), + gko::make_array_view(this->exec, nbrows + 1, 
row_ptrs)); ASSERT_EQ(mtx->get_const_values(), values); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp index a5d21ba1782..5214f439ac9 100644 --- a/core/test/matrix/permutation.cpp +++ b/core/test/matrix/permutation.cpp @@ -136,7 +136,7 @@ TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingData) auto m = gko::matrix::Permutation::create( this->exec, gko::dim<2>{3, 5}, - gko::Array::view(this->exec, 3, data)); + gko::make_array_view(this->exec, 3, data)); ASSERT_EQ(m->get_const_permutation(), data); } @@ -191,7 +191,7 @@ TYPED_TEST(Permutation, PermutationThrowsforWrongRowPermDimensions) ASSERT_THROW(gko::matrix::Permutation::create( this->exec, gko::dim<2>{4, 2}, - gko::Array::view(this->exec, 3, data)), + gko::make_array_view(this->exec, 3, data)), gko::ValueMismatch); } @@ -203,7 +203,7 @@ TYPED_TEST(Permutation, SettingMaskDoesNotModifyData) auto m = gko::matrix::Permutation::create( this->exec, gko::dim<2>{3, 5}, - gko::Array::view(this->exec, 3, data)); + gko::make_array_view(this->exec, 3, data)); auto mask = m->get_permute_mask(); ASSERT_EQ(m->get_const_permutation(), data); @@ -225,7 +225,7 @@ TYPED_TEST(Permutation, PermutationThrowsforWrongColPermDimensions) ASSERT_THROW(gko::matrix::Permutation::create( this->exec, gko::dim<2>{3, 4}, - gko::Array::view(this->exec, 3, data), + gko::make_array_view(this->exec, 3, data), gko::matrix::column_permute), gko::ValueMismatch); } diff --git a/core/test/matrix/row_gatherer.cpp b/core/test/matrix/row_gatherer.cpp index 51096a94893..68c3142ca0c 100644 --- a/core/test/matrix/row_gatherer.cpp +++ b/core/test/matrix/row_gatherer.cpp @@ -127,7 +127,7 @@ TYPED_TEST(RowGatherer, RowGathererCanBeConstructedFromExistingData) auto m = gko::matrix::RowGatherer::create( this->exec, gko::dim<2>{3, 5}, - gko::Array::view(this->exec, 3, data)); + gko::make_array_view(this->exec, 3, data)); ASSERT_EQ(m->get_const_row_idxs(), data); } @@ -140,7 +140,7 @@ TYPED_TEST(RowGatherer, RowGathererThrowsforWrongRowPermDimensions) ASSERT_THROW(gko::matrix::RowGatherer::create( this->exec, gko::dim<2>{4, 2}, - gko::Array::view(this->exec, 3, data)), + gko::make_array_view(this->exec, 3, data)), gko::ValueMismatch); } diff --git a/core/test/matrix/sparsity_csr.cpp b/core/test/matrix/sparsity_csr.cpp index ec59661dfe0..d14a8897d19 100644 --- a/core/test/matrix/sparsity_csr.cpp +++ b/core/test/matrix/sparsity_csr.cpp @@ -154,8 +154,8 @@ TYPED_TEST(SparsityCsr, CanBeCreatedFromExistingData) auto mtx = gko::matrix::SparsityCsr::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 4, col_idxs), - gko::Array::view(this->exec, 4, row_ptrs), 2.0); + gko::make_array_view(this->exec, 4, col_idxs), + gko::make_array_view(this->exec, 4, row_ptrs), 2.0); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); ASSERT_EQ(mtx->get_const_row_ptrs(), row_ptrs); diff --git a/core/test/utils/assertions_test.cpp b/core/test/utils/assertions_test.cpp index 7c8609573f0..5f8d454fac7 100644 --- a/core/test/utils/assertions_test.cpp +++ b/core/test/utils/assertions_test.cpp @@ -54,7 +54,7 @@ class MatricesNear : public ::testing::Test { template gko::Array make_view(std::array& array) { - return gko::Array::view(exec, size, array.data()); + return gko::make_array_view(exec, size, array.data()); } MatricesNear() diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernel.cu index be86588a909..e79042b8eec 100644 --- 
a/cuda/factorization/par_ilut_approx_filter_kernel.cu +++ b/cuda/factorization/par_ilut_approx_filter_kernel.cu @@ -161,9 +161,9 @@ void threshold_filter_approx(syn::value_list, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } if (num_blocks > 0) { diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernel.cu index 6dd83c41835..0451582187f 100644 --- a/cuda/factorization/par_ilut_filter_kernel.cu +++ b/cuda/factorization/par_ilut_filter_kernel.cu @@ -117,9 +117,9 @@ void threshold_filter(syn::value_list, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } if (num_blocks > 0) { diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernel.cu index 6655db78296..92bd9e28ab8 100644 --- a/cuda/factorization/par_ilut_select_kernel.cu +++ b/cuda/factorization/par_ilut_select_kernel.cu @@ -159,7 +159,7 @@ void threshold_select(std::shared_ptr exec, if (step > 5) { Array cpu_out_array{ exec->get_master(), - Array::view(exec, bucket.size, tmp_out)}; + make_array_view(exec, bucket.size, tmp_out)}; auto begin = cpu_out_array.get_data(); auto end = begin + bucket.size; auto middle = begin + rank; diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 8afefc44972..2b83c4fdc22 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -1246,7 +1246,7 @@ void is_sorted_by_column_index( const matrix::Csr* to_check, bool* is_sorted) { *is_sorted = true; - auto cpu_array = Array::view(exec->get_master(), 1, is_sorted); + auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); auto gpu_array = Array{exec, cpu_array}; auto block_size = default_block_size; auto num_rows = static_cast(to_check->get_size()[0]); diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp index d484af93d2a..f86bed4fa76 100644 --- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp @@ -165,9 +165,9 @@ void threshold_filter_approx(syn::value_list, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } if (num_blocks > 0) { diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernel.hip.cpp index cf691f58f8b..7b335c08175 100644 --- a/hip/factorization/par_ilut_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp @@ -120,9 +120,9 @@ void threshold_filter(syn::value_list, matrix::CooBuilder coo_builder{m_out_coo}; 
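        // Annotation (not part of the original diff): the builder swaps the
        // COO output's column-index and value arrays for non-owning views of
        // the CSR output's arrays, so both formats share storage; only the
        // row-index array below gets its own allocation, which the filter
        // kernel then fills.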
coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } if (num_blocks > 0) { diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernel.hip.cpp index 8130a76cb18..7e38a6c67a0 100644 --- a/hip/factorization/par_ilut_select_kernel.hip.cpp +++ b/hip/factorization/par_ilut_select_kernel.hip.cpp @@ -163,7 +163,7 @@ void threshold_select(std::shared_ptr exec, if (step > 5) { Array cpu_out_array{ exec->get_master(), - Array::view(exec, bucket.size, tmp_out)}; + make_array_view(exec, bucket.size, tmp_out)}; auto begin = cpu_out_array.get_data(); auto end = begin + bucket.size; auto middle = begin + rank; diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index 847eee34c3d..1b2c8ae6a51 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -1022,7 +1022,7 @@ void is_sorted_by_column_index( const matrix::Csr* to_check, bool* is_sorted) { *is_sorted = true; - auto cpu_array = Array::view(exec->get_master(), 1, is_sorted); + auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); auto gpu_array = Array{exec, cpu_array}; auto block_size = default_block_size; auto num_rows = static_cast(to_check->get_size()[0]); diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index bcaba840fdf..d5b4811032d 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -866,10 +866,9 @@ class Dense return Dense>::create( this->get_executor(), dim<2>{num_rows, num_cols}, - Array>::view( - this->get_executor(), num_rows * stride, - reinterpret_cast*>( - this->get_values())), + make_array_view(this->get_executor(), num_rows * stride, + reinterpret_cast*>( + this->get_values())), stride); } @@ -888,7 +887,7 @@ class Dense return Dense>::create( this->get_executor(), dim<2>{num_rows, num_cols}, - Array>::view( + make_array_view( this->get_executor(), num_rows * stride, const_cast*>( reinterpret_cast*>( @@ -1114,10 +1113,9 @@ class Dense return Dense::create( this->get_executor(), dim<2>{range_result.length(0), range_result.length(1)}, - Array::view( - this->get_executor(), - range_result.length(0) * range_this.length(1) - columns.begin, - range_result->data), + make_array_view(this->get_executor(), + range_result.length(0) * this->get_stride(), + range_result->data), stride); } diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp index 8f0b597686d..90183ec37df 100644 --- a/omp/factorization/par_ilut_kernels.cpp +++ b/omp/factorization/par_ilut_kernels.cpp @@ -136,9 +136,9 @@ void abstract_filter(std::shared_ptr exec, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp index 56cebc91366..691de096660 100644 --- a/reference/factorization/par_ilut_kernels.cpp +++ 
b/reference/factorization/par_ilut_kernels.cpp @@ -137,9 +137,9 @@ void abstract_filter(std::shared_ptr exec, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } diff --git a/reference/test/components/precision_conversion_kernels.cpp b/reference/test/components/precision_conversion_kernels.cpp index a82b9a2ae5c..4ff17753098 100644 --- a/reference/test/components/precision_conversion_kernels.cpp +++ b/reference/test/components/precision_conversion_kernels.cpp @@ -128,7 +128,7 @@ TEST_F(PrecisionConversion, ConvertsRealFromView) gko::Array tmp{ref}; gko::Array out{ref}; - tmp = gko::Array::view(ref, vals.get_num_elems(), vals.get_data()); + tmp = gko::make_array_view(ref, vals.get_num_elems(), vals.get_data()); out = tmp; GKO_ASSERT_ARRAY_EQ(vals, out); diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp index bdb0f30059d..40773359229 100644 --- a/reference/test/matrix/permutation.cpp +++ b/reference/test/matrix/permutation.cpp @@ -81,8 +81,7 @@ TYPED_TEST(Permutation, AppliesRowPermutationToDense) i_type rdata[] = {1, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, rdata)); + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata)); perm->apply(x.get(), y.get()); // clang-format off @@ -108,8 +107,7 @@ TYPED_TEST(Permutation, AppliesColPermutationToDense) i_type rdata[] = {1, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, rdata), + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata), gko::matrix::column_permute); perm->apply(x.get(), y.get()); @@ -138,11 +136,9 @@ TYPED_TEST(Permutation, AppliesRowAndColPermutationToDense) i_type rdata[] = {1, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, rdata)); + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata)); auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, cdata), + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, cdata), gko::matrix::column_permute); rperm->apply(x.get(), y1.get()); @@ -170,8 +166,7 @@ TYPED_TEST(Permutation, AppliesRowAndColPermutationToDenseWithOneArray) i_type data[] = {1, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, data), + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, data), gko::matrix::row_permute | gko::matrix::column_permute); perm->apply(x.get(), y1.get()); @@ -200,12 +195,10 @@ TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDense) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), gko::matrix::row_permute | gko::matrix::inverse_permute); auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::inverse_permute | 
gko::matrix::column_permute); rperm->apply(x.get(), y1.get()); @@ -234,8 +227,7 @@ TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDenseWithOneArray) i_type data[] = {1, 2, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, data), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, data), gko::matrix::column_permute | gko::matrix::row_permute | gko::matrix::inverse_permute); @@ -264,8 +256,7 @@ TYPED_TEST(Permutation, AppliesInverseRowPermutationToDense) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), gko::matrix::row_permute | gko::matrix::inverse_permute); rperm->apply(x.get(), y.get()); @@ -293,8 +284,7 @@ TYPED_TEST(Permutation, AppliesInverseColPermutationToDense) i_type cdata[] = {1, 2, 0}; auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::inverse_permute | gko::matrix::column_permute); cperm->apply(x.get(), y.get()); @@ -323,8 +313,7 @@ TYPED_TEST(Permutation, AppliesRowPermutationToCsr) i_type rdata[] = {1, 2, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata)); + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata)); perm->apply(x.get(), y.get()); // clang-format off @@ -352,8 +341,7 @@ TYPED_TEST(Permutation, AppliesColPermutationToCsr) i_type cdata[] = {1, 2, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::column_permute); perm->apply(x.get(), y.get()); @@ -384,11 +372,9 @@ TYPED_TEST(Permutation, AppliesRowAndColPermutationToCsr) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata)); + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata)); auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::column_permute); rperm->apply(x.get(), y1.get()); @@ -417,8 +403,7 @@ TYPED_TEST(Permutation, AppliesInverseRowPermutationToCsr) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), gko::matrix::row_permute | gko::matrix::inverse_permute); rperm->apply(x.get(), y.get()); @@ -446,8 +431,7 @@ TYPED_TEST(Permutation, AppliesInverseColPermutationToCsr) i_type cdata[] = {1, 2, 0}; auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::inverse_permute | gko::matrix::column_permute); cperm->apply(x.get(), y.get()); @@ -477,12 +461,10 @@ TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToCsr) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata), + this->exec, 
gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), gko::matrix::row_permute | gko::matrix::inverse_permute); auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::inverse_permute | gko::matrix::column_permute); rperm->apply(x.get(), y1.get()); From f7e327dd63ff68da55349387c6edffea681f0f9f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 3 Mar 2022 16:48:45 +0100 Subject: [PATCH 08/17] add stride to convert_to(Dense) tests This also changes the Dense convert_to(Dense) behavior so that it preserves strides only if no reallocation is necessary, or on moves --- core/matrix/dense.cpp | 93 ++++++++++++++++++++++------ core/test/matrix/dense.cpp | 11 ++-- include/ginkgo/core/matrix/dense.hpp | 56 ++--------------- test/matrix/matrix.cpp | 49 +++++++++++---- 4 files changed, 124 insertions(+), 85 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 06fef2b96c4..d7e4a0f428d 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -462,31 +462,32 @@ void Dense::compute_norm1_impl(LinOp* result) const template void Dense::convert_to(Dense* result) const { - if (this->get_size() && result->get_size() == this->get_size()) { - // we need to create a executor-local clone of the target data, that - // will be copied back later. - auto exec = this->get_executor(); - auto result_array = make_temporary_output_clone(exec, &result->values_); - // create a (value, not pointer to avoid allocation overhead) view - // matrix on the array to avoid special-casing cross-executor copies - auto tmp_result = - Dense{exec, result->get_size(), - make_array_view(exec, result_array->get_num_elems(), - result_array->get_data()), - result->get_stride()}; - exec->run(dense::make_copy(this, &tmp_result)); - } else { - result->values_ = this->values_; - result->stride_ = this->stride_; - result->set_size(this->get_size()); + if (result->get_size() != this->get_size()) { + result->resize(this->get_size()); } + // we need to create a clone of the local data, so the target executor can + // copy it. + auto result_exec = result->get_executor(); + auto result_input = make_temporary_clone(result_exec, &values_); + // create a (value, not pointer to avoid allocation overhead) view + // matrix on the array to avoid special-casing cross-executor copies + auto tmp_this = Dense{ + result_exec, this->get_size(), + gko::detail::array_const_cast(gko::detail::ConstArrayView( + result_exec, result_input->get_num_elems(), + result_input->get_const_data())), + this->get_stride()}; + result_exec->run(dense::make_copy(&tmp_this, result)); } template void Dense::move_to(Dense* result) { - this->convert_to(result); + result->values_ = std::move(this->values_); + result->stride_ = std::exchange(this->stride_, 0); + result->set_size(this->get_size()); + this->resize(gko::dim<2>{}); } @@ -1585,6 +1586,62 @@ void Dense::add_scaled_identity_impl(const LinOp* const a, } +template +std::unique_ptr::real_type> +Dense::create_real_view() +{ + const auto num_rows = this->get_size()[0]; + const bool complex = is_complex(); + const auto num_cols = + complex ? 2 * this->get_size()[1] : this->get_size()[1]; + const auto stride = complex ? 
2 * this->get_stride() : this->get_stride(); + + return Dense>::create( + this->get_executor(), dim<2>{num_rows, num_cols}, + make_array_view( + this->get_executor(), num_rows * stride, + reinterpret_cast*>(this->get_values())), + stride); +} + + +template +std::unique_ptr::real_type> +Dense::create_real_view() const +{ + const auto num_rows = this->get_size()[0]; + const bool complex = is_complex(); + const auto num_cols = + complex ? 2 * this->get_size()[1] : this->get_size()[1]; + const auto stride = complex ? 2 * this->get_stride() : this->get_stride(); + + return Dense>::create( + this->get_executor(), dim<2>{num_rows, num_cols}, + make_array_view(this->get_executor(), num_rows * stride, + const_cast*>( + reinterpret_cast*>( + this->get_const_values()))), + stride); +} + + +template +std::unique_ptr> Dense::create_submatrix_impl( + const span& rows, const span& columns, const size_type stride) +{ + row_major_range range_this{this->get_values(), this->get_size()[0], + this->get_size()[1], this->get_stride()}; + auto range_result = range_this(rows, columns); + return Dense::create( + this->get_executor(), + dim<2>{range_result.length(0), range_result.length(1)}, + make_array_view(this->get_executor(), + range_result.length(0) * this->get_stride(), + range_result->data), + stride); +} + + #define GKO_DECLARE_DENSE_MATRIX(_type) class Dense<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_MATRIX); diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp index 44b235f804d..cbb94a5c605 100644 --- a/core/test/matrix/dense.cpp +++ b/core/test/matrix/dense.cpp @@ -60,8 +60,7 @@ class Dense : public ::testing::Test { static void assert_equal_to_original_mtx(gko::matrix::Dense* m) { ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_stride(), 4); - ASSERT_EQ(m->get_num_stored_elements(), 2 * 4); + ASSERT_EQ(m->get_num_stored_elements(), 2 * m->get_stride()); EXPECT_EQ(m->at(0, 0), value_type{1.0}); EXPECT_EQ(m->at(0, 1), value_type{2.0}); EXPECT_EQ(m->at(0, 2), value_type{3.0}); @@ -172,6 +171,7 @@ TYPED_TEST(Dense, CreateWithSameConfigKeepsStride) TYPED_TEST(Dense, KnowsItsSizeAndValues) { this->assert_equal_to_original_mtx(this->mtx.get()); + ASSERT_EQ(this->mtx->get_stride(), 4); } @@ -241,6 +241,8 @@ TYPED_TEST(Dense, CanBeCopied) this->assert_equal_to_original_mtx(this->mtx.get()); this->mtx->at(0) = 7; this->assert_equal_to_original_mtx(mtx_copy.get()); + ASSERT_EQ(this->mtx->get_stride(), 4); + ASSERT_EQ(mtx_copy->get_stride(), 3); } @@ -249,14 +251,15 @@ TYPED_TEST(Dense, CanBeMoved) auto mtx_copy = gko::matrix::Dense::create(this->exec); mtx_copy->copy_from(std::move(this->mtx)); this->assert_equal_to_original_mtx(mtx_copy.get()); + ASSERT_EQ(mtx_copy->get_stride(), 4); } TYPED_TEST(Dense, CanBeCloned) { auto mtx_clone = this->mtx->clone(); - this->assert_equal_to_original_mtx( - dynamic_castmtx.get())>(mtx_clone.get())); + this->assert_equal_to_original_mtx(mtx_clone.get()); + ASSERT_EQ(mtx_clone->get_stride(), 3); } diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index d5b4811032d..20479295ad7 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -855,45 +855,12 @@ class Dense * real with a reinterpret_cast with twice the number of columns and * double the stride. */ - std::unique_ptr>> create_real_view() - { - const auto num_rows = this->get_size()[0]; - const bool complex = is_complex(); - const auto num_cols = - complex ? 
2 * this->get_size()[1] : this->get_size()[1]; - const auto stride = - complex ? 2 * this->get_stride() : this->get_stride(); - - return Dense>::create( - this->get_executor(), dim<2>{num_rows, num_cols}, - make_array_view(this->get_executor(), num_rows * stride, - reinterpret_cast*>( - this->get_values())), - stride); - } + std::unique_ptr create_real_view(); /** * @copydoc create_real_view() */ - std::unique_ptr>> create_real_view() - const - { - const auto num_rows = this->get_size()[0]; - const bool complex = is_complex(); - const auto num_cols = - complex ? 2 * this->get_size()[1] : this->get_size()[1]; - const auto stride = - complex ? 2 * this->get_stride() : this->get_stride(); - - return Dense>::create( - this->get_executor(), dim<2>{num_rows, num_cols}, - make_array_view( - this->get_executor(), num_rows * stride, - const_cast*>( - reinterpret_cast*>( - this->get_const_values()))), - stride); - } + std::unique_ptr create_real_view() const; /** * Creates a constant (immutable) Dense matrix from a constant array. @@ -1101,23 +1068,8 @@ class Dense * instead of create_submatrix(const span, const span, const * size_type). */ - virtual std::unique_ptr create_submatrix_impl(const span& rows, - const span& columns, - const size_type stride) - { - row_major_range range_this{this->get_values(), this->get_size()[0], - this->get_size()[1], this->get_stride()}; - auto range_result = range_this(rows, columns); - // TODO: can result in HUGE padding - which will be copied with the - // vector - return Dense::create( - this->get_executor(), - dim<2>{range_result.length(0), range_result.length(1)}, - make_array_view(this->get_executor(), - range_result.length(0) * this->get_stride(), - range_result->data), - stride); - } + virtual std::unique_ptr create_submatrix_impl( + const span& rows, const span& columns, const size_type stride); void apply_impl(const LinOp* b, LinOp* x) const override; diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 29e6479a282..475f8e84fb8 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -81,6 +81,8 @@ struct SimpleMatrixTest { {} static void check_property(const std::unique_ptr&) {} + + static bool supports_strides() { return true; } }; struct DenseWithDefaultStride @@ -222,6 +224,11 @@ struct Fbcsr : SimpleMatrixTest> { data.size[0] = gko::ceildiv(data.size[0], block_size) * block_size; data.size[1] = gko::ceildiv(data.size[1], block_size) * block_size; } + +#ifdef GKO_COMPILING_HIP + // Fbcsr support in rocSPARSE is buggy w.r.t. 
strides + static bool supports_strides() { return false; } +#endif }; @@ -588,7 +595,7 @@ class Matrix : public ::testing::Test { guarded_fn(gen_in_vec(mtx, 1, 1), gen_out_vec(mtx, 1, 1)); } - { + if (Config::supports_strides()) { SCOPED_TRACE("Single strided vector"); guarded_fn(gen_in_vec(mtx, 1, 2), gen_out_vec(mtx, 1, 3)); @@ -597,12 +604,12 @@ class Matrix : public ::testing::Test { // check application of real matrix to complex vector // viewed as interleaved real/imag vector using complex_vec = gko::to_complex; - { + if (Config::supports_strides()) { SCOPED_TRACE("Single strided complex vector"); guarded_fn(gen_in_vec(mtx, 1, 2), gen_out_vec(mtx, 1, 3)); } - { + if (Config::supports_strides()) { SCOPED_TRACE("Strided complex multivector with 2 columns"); guarded_fn(gen_in_vec(mtx, 2, 3), gen_out_vec(mtx, 2, 4)); @@ -613,7 +620,7 @@ class Matrix : public ::testing::Test { guarded_fn(gen_in_vec(mtx, 2, 2), gen_out_vec(mtx, 2, 2)); } - { + if (Config::supports_strides()) { SCOPED_TRACE("Strided multivector with 2 columns"); guarded_fn(gen_in_vec(mtx, 2, 3), gen_out_vec(mtx, 2, 4)); @@ -623,7 +630,7 @@ class Matrix : public ::testing::Test { guarded_fn(gen_in_vec(mtx, 40, 40), gen_out_vec(mtx, 40, 40)); } - { + if (Config::supports_strides()) { SCOPED_TRACE("Strided multivector with 40 columns"); guarded_fn(gen_in_vec(mtx, 40, 43), gen_out_vec(mtx, 40, 45)); @@ -649,8 +656,8 @@ using MatrixTypes = ::testing::Types< // CUDA doesn't allow blocksize 1 Fbcsr<1>, #endif -#if defined(GKO_COMPILING_OMP) || defined(GKO_COMPILING_CUDA) - // Fbcsr is broken on ROCm +#if defined(GKO_COMPILING_OMP) || defined(GKO_COMPILING_CUDA) || \ + defined(GKO_COMPILING_HIP) Fbcsr<2>, Fbcsr<3>, #endif SellpDefaultParameters, Sellp32Factor2, HybridDefaultStrategy, @@ -775,13 +782,30 @@ TYPED_TEST(Matrix, ConvertToDenseIsEquivalentToRef) using Mtx = typename TestFixture::Mtx; using Dense = gko::matrix::Dense; this->forall_matrix_scenarios([&](auto mtx) { - auto ref_result = Dense::create(this->ref); - auto dev_result = Dense::create(this->exec); + const auto size = mtx.ref->get_size(); + const auto stride = size[1] + 5; + const auto padded_size = gko::dim<2>{size[0], stride}; + auto ref_padded = Dense::create(this->ref, padded_size); + auto dev_padded = Dense::create(this->exec, padded_size); + ref_padded->fill(12345); + dev_padded->fill(12345); + const auto rows = gko::span{0, size[0]}; + const auto cols = gko::span{0, size[1]}; + const auto pad_cols = gko::span{size[1], stride}; + auto ref_result = ref_padded->create_submatrix(rows, cols); + auto dev_result = dev_padded->create_submatrix(rows, cols); + auto ref_padding = ref_padded->create_submatrix(rows, pad_cols); + auto dev_padding = dev_padded->create_submatrix(rows, pad_cols); + auto orig_padding = ref_padding->clone(); mtx.ref->convert_to(ref_result.get()); mtx.dev->convert_to(dev_result.get()); GKO_ASSERT_MTX_NEAR(ref_result, dev_result, 0.0); + ASSERT_EQ(ref_result->get_stride(), stride); + ASSERT_EQ(dev_result->get_stride(), stride); + GKO_ASSERT_MTX_NEAR(ref_padding, orig_padding, 0.0); + GKO_ASSERT_MTX_NEAR(dev_padding, orig_padding, 0.0); }); } @@ -792,10 +816,13 @@ TYPED_TEST(Matrix, ConvertFromDenseIsEquivalentToRef) using Mtx = typename TestFixture::Mtx; using Dense = gko::matrix::Dense; this->forall_matrix_data_scenarios([&](auto data) { - auto ref_src = Dense::create(this->ref); - auto dev_src = Dense::create(this->exec); + const auto stride = data.size[0] + 2; + auto ref_src = Dense::create(this->ref, data.size, stride); + auto dev_src = 
Dense::create(this->exec, data.size, stride); ref_src->read(data); dev_src->read(data); + ASSERT_EQ(ref_src->get_stride(), stride); + ASSERT_EQ(dev_src->get_stride(), stride); auto ref_result = TestConfig::create(this->ref, data.size); auto dev_result = TestConfig::create(this->exec, data.size); From bcd48e5f9e96ff1f0665aa20e71572fa9f6fa936 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 3 Mar 2022 17:15:56 +0100 Subject: [PATCH 09/17] fix submatrix generation for empty and full rows --- core/matrix/dense.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index d7e4a0f428d..89833e5b514 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -1632,12 +1632,14 @@ std::unique_ptr> Dense::create_submatrix_impl( row_major_range range_this{this->get_values(), this->get_size()[0], this->get_size()[1], this->get_stride()}; auto range_result = range_this(rows, columns); + size_type storage_size = + rows.length() > 0 + ? range_result.length(0) * this->get_stride() - columns.begin + : 0; return Dense::create( this->get_executor(), dim<2>{range_result.length(0), range_result.length(1)}, - make_array_view(this->get_executor(), - range_result.length(0) * this->get_stride(), - range_result->data), + make_array_view(this->get_executor(), storage_size, range_result->data), stride); } From 5d798ea125f166f000adab66fbb71ca94e5e344d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 3 Mar 2022 18:45:11 +0100 Subject: [PATCH 10/17] test and fix SpMV interaction with stride/padding --- cuda/matrix/ell_kernels.cu | 4 +- dpcpp/matrix/csr_kernels.dp.cpp | 3 +- dpcpp/matrix/ell_kernels.dp.cpp | 4 +- hip/matrix/csr_kernels.hip.cpp | 3 +- hip/matrix/ell_kernels.hip.cpp | 4 +- test/matrix/matrix.cpp | 69 +++++++++++++++++++++------------ 6 files changed, 50 insertions(+), 37 deletions(-) diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index c906bb7fd1a..58c59af2cc2 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -242,9 +242,7 @@ void spmv(std::shared_ptr exec, */ const int info = (!atomic) * num_thread_per_worker; if (atomic) { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), - zero()); + dense::fill(exec, c, zero()); } select_abstract_spmv( compiled_kernels(), diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index c9989afe471..86e6784203f 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1126,8 +1126,7 @@ void spmv(std::shared_ptr exec, if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do } else if (a->get_strategy()->get_name() == "load_balance") { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), zero()); + dense::fill(exec, c, zero()); const IndexType nwarps = a->get_num_srow_elements(); if (nwarps > 0) { const dim3 csr_block(config::warp_size, warps_in_block, 1); diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index fbd1063bf13..8be02fb7988 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -429,9 +429,7 @@ void spmv(std::shared_ptr exec, */ const int info = (!atomic) * num_thread_per_worker; if (atomic) { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), - zero()); + dense::fill(exec, c, zero()); } select_abstract_spmv( compiled_kernels(), diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp 
index 1b2c8ae6a51..5bac002f63c 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -289,8 +289,7 @@ void spmv(std::shared_ptr exec, if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do } else if (a->get_strategy()->get_name() == "load_balance") { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), zero()); + dense::fill(exec, c, zero()); const IndexType nwarps = a->get_num_srow_elements(); if (nwarps > 0) { const dim3 csr_block(config::warp_size, warps_in_block, 1); diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 2486a666ab0..a56b9b96d13 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -246,9 +246,7 @@ void spmv(std::shared_ptr exec, */ const int info = (!atomic) * num_thread_per_worker; if (atomic) { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), - zero()); + dense::fill(exec, c, zero()); } select_abstract_spmv( compiled_kernels(), diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 475f8e84fb8..30563438c65 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -439,12 +439,11 @@ class Matrix : public ::testing::Test { } template - test_pair gen_in_vec(const test_pair& mtx, int nrhs, - int stride) + test_pair gen_in_vec(const test_pair& mtx, int nrhs) { auto size = gko::dim<2>{mtx.ref->get_size()[1], static_cast(nrhs)}; - auto result = VecType::create(ref, size, stride); + auto result = VecType::create(ref, size); result->read(gen_dense_data(size)); return {std::move(result), exec}; @@ -465,12 +464,11 @@ class Matrix : public ::testing::Test { } template - test_pair gen_out_vec(const test_pair& mtx, int nrhs, - int stride) + test_pair gen_out_vec(const test_pair& mtx, int nrhs) { auto size = gko::dim<2>{mtx.ref->get_size()[0], static_cast(nrhs)}; - auto result = VecType::create(ref, size, stride); + auto result = VecType::create(ref, size); result->read(gen_dense_data(size)); return {std::move(result), exec}; @@ -575,6 +573,34 @@ class Matrix : public ::testing::Test { }); } + template + void run_strided(const test_pair& mtx, int rhs, int in_stride, + int out_stride, TestFunction fn) + { + // create slightly bigger vectors + auto in_padded = gen_in_vec(mtx, in_stride); + auto out_padded = gen_out_vec(mtx, out_stride); + const auto in_rows = gko::span(0, mtx.ref->get_size()[1]); + const auto out_rows = gko::span(0, mtx.ref->get_size()[0]); + const auto cols = gko::span(0, rhs); + const auto out_pad_cols = gko::span(rhs, out_stride); + // create views of the padding and in/out vectors + auto out_padding = test_pair{ + out_padded.ref->create_submatrix(out_rows, out_pad_cols), + out_padded.dev->create_submatrix(out_rows, out_pad_cols)}; + auto orig_padding = out_padding.ref->clone(); + auto in = + test_pair{in_padded.ref->create_submatrix(in_rows, cols), + in_padded.dev->create_submatrix(in_rows, cols)}; + auto out = test_pair{ + out_padded.ref->create_submatrix(out_rows, cols), + out_padded.dev->create_submatrix(out_rows, cols)}; + fn(std::move(in), std::move(out)); + // check that padding was unmodified + GKO_ASSERT_MTX_NEAR(out_padding.ref, orig_padding, 0.0); + GKO_ASSERT_MTX_NEAR(out_padding.dev, orig_padding, 0.0); + } + template void forall_vector_scenarios(const test_pair& mtx, TestFunction fn) { @@ -587,18 +613,17 @@ class Matrix : public ::testing::Test { }; { SCOPED_TRACE("Multivector with 0 columns"); - guarded_fn(gen_in_vec(mtx, 0, 0), - gen_out_vec(mtx, 0, 
0)); + guarded_fn(gen_in_vec(mtx, 0), + gen_out_vec(mtx, 0)); } { SCOPED_TRACE("Single vector"); - guarded_fn(gen_in_vec(mtx, 1, 1), - gen_out_vec(mtx, 1, 1)); + guarded_fn(gen_in_vec(mtx, 1), + gen_out_vec(mtx, 1)); } if (Config::supports_strides()) { SCOPED_TRACE("Single strided vector"); - guarded_fn(gen_in_vec(mtx, 1, 2), - gen_out_vec(mtx, 1, 3)); + run_strided(mtx, 1, 2, 3, guarded_fn); } if (!gko::is_complex()) { // check application of real matrix to complex vector @@ -606,34 +631,30 @@ class Matrix : public ::testing::Test { using complex_vec = gko::to_complex; if (Config::supports_strides()) { SCOPED_TRACE("Single strided complex vector"); - guarded_fn(gen_in_vec(mtx, 1, 2), - gen_out_vec(mtx, 1, 3)); + run_strided(mtx, 1, 2, 3, guarded_fn); } if (Config::supports_strides()) { SCOPED_TRACE("Strided complex multivector with 2 columns"); - guarded_fn(gen_in_vec(mtx, 2, 3), - gen_out_vec(mtx, 2, 4)); + run_strided(mtx, 2, 3, 4, guarded_fn); } } { SCOPED_TRACE("Multivector with 2 columns"); - guarded_fn(gen_in_vec(mtx, 2, 2), - gen_out_vec(mtx, 2, 2)); + guarded_fn(gen_in_vec(mtx, 2), + gen_out_vec(mtx, 2)); } if (Config::supports_strides()) { SCOPED_TRACE("Strided multivector with 2 columns"); - guarded_fn(gen_in_vec(mtx, 2, 3), - gen_out_vec(mtx, 2, 4)); + run_strided(mtx, 2, 3, 4, guarded_fn); } { SCOPED_TRACE("Multivector with 40 columns"); - guarded_fn(gen_in_vec(mtx, 40, 40), - gen_out_vec(mtx, 40, 40)); + guarded_fn(gen_in_vec(mtx, 40), + gen_out_vec(mtx, 40)); } if (Config::supports_strides()) { SCOPED_TRACE("Strided multivector with 40 columns"); - guarded_fn(gen_in_vec(mtx, 40, 43), - gen_out_vec(mtx, 40, 45)); + run_strided(mtx, 40, 43, 45, guarded_fn); } } From 592d93fe27db166bcc06a8c1b2407208097e7e92 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 4 Mar 2022 00:19:10 +0100 Subject: [PATCH 11/17] fix DPC++ Csr merge_path SpMV segfault --- dpcpp/matrix/csr_kernels.dp.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 86e6784203f..ef686807f97 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1125,7 +1125,14 @@ void spmv(std::shared_ptr exec, { if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do - } else if (a->get_strategy()->get_name() == "load_balance") { + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_elements() == 0) { + // empty input: zero output + dense::fill(exec, c, zero()); + return; + } + if (a->get_strategy()->get_name() == "load_balance") { dense::fill(exec, c, zero()); const IndexType nwarps = a->get_num_srow_elements(); if (nwarps > 0) { @@ -1216,7 +1223,14 @@ void advanced_spmv(std::shared_ptr exec, { if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do - } else if (a->get_strategy()->get_name() == "load_balance") { + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_elements() == 0) { + // empty input: scale output + dense::scale(exec, beta, c); + return; + } + if (a->get_strategy()->get_name() == "load_balance") { dense::scale(exec, beta, c); const IndexType nwarps = a->get_num_srow_elements(); From 0b74a6513a4a7cf43ceb23b0436deb22c065a26a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 4 Mar 2022 14:47:43 +0100 Subject: [PATCH 12/17] initiate Dense cross-executor copy from source This means that padding in the target will be copied and written back after the copy has finished. 
This fixes issues in builds without OpenMP executor and reference/test/matrix/dense_kernels --- core/matrix/dense.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 89833e5b514..616cd9f482f 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -465,19 +465,17 @@ void Dense::convert_to(Dense* result) const if (result->get_size() != this->get_size()) { result->resize(this->get_size()); } - // we need to create a clone of the local data, so the target executor can - // copy it. - auto result_exec = result->get_executor(); - auto result_input = make_temporary_clone(result_exec, &values_); + // we need to create a temporary clone of the target data to write to + auto exec = this->get_executor(); + auto result_output = make_temporary_clone(exec, &result->values_); // create a (value, not pointer to avoid allocation overhead) view // matrix on the array to avoid special-casing cross-executor copies - auto tmp_this = Dense{ - result_exec, this->get_size(), - gko::detail::array_const_cast(gko::detail::ConstArrayView( - result_exec, result_input->get_num_elems(), - result_input->get_const_data())), - this->get_stride()}; - result_exec->run(dense::make_copy(&tmp_this, result)); + auto tmp_result = + Dense{exec, this->get_size(), + make_array_view(exec, result_output->get_num_elems(), + result_output->get_data()), + result->get_stride()}; + exec->run(dense::make_copy(this, &tmp_result)); } From 3dd7aa5c85d6630372ce1ede6c562c5a36b3d3d0 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 4 Mar 2022 17:23:23 +0100 Subject: [PATCH 13/17] fix dpcpp Csr classical SpMV --- dpcpp/matrix/csr_kernels.dp.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index ef686807f97..7c503cfd972 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -719,12 +719,14 @@ void abstract_classical_spmv(dim3 grid, dim3 block, ValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - abstract_classical_spmv( - num_rows, alpha, val, col_idxs, row_ptrs, b, - b_stride, beta, c, c_stride, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, b_stride, beta, + c, c_stride, item_ct1); + }); }); } From 11737d76a23c1c25c2ea413446afed3093ebb694 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 4 Mar 2022 18:27:50 +0100 Subject: [PATCH 14/17] remove assertion causing hip-nvcc to emit traps This is only relevant for really old HIP/CUDA, but still causes test failures. 
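A possible longer-term alternative (sketched here only, not part of this patch) is to route device-side checks through an explicit opt-in macro instead of plain assert(), so they can be compiled out entirely regardless of how the toolchain treats NDEBUG. GKO_DEVICE_ASSERTIONS and GKO_DEVICE_ASSERT below are hypothetical names, not existing Ginkgo options:

    #include <cassert>

    // compiled out unless the (hypothetical) GKO_DEVICE_ASSERTIONS flag is
    // defined, avoiding the unconditional trap emitted by older hip-nvcc in
    // Release builds
    #ifdef GKO_DEVICE_ASSERTIONS
    #define GKO_DEVICE_ASSERT(cond) assert(cond)
    #else
    #define GKO_DEVICE_ASSERT(cond) ((void)0)
    #endif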
--- common/cuda_hip/matrix/dense_kernels.hpp.inc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index b40d650a3a5..4002aefbbbb 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -74,8 +74,9 @@ __global__ // only consider threads in the current block const auto first_thread = block_base_col - base_col; const auto last_thread = first_thread + block_size; - assert(first_thread < int(config::warp_size)); - assert(last_thread >= 0); + // HIP compiles these assertions in Release, traps unconditionally + // assert(first_thread < int(config::warp_size)); + // assert(last_thread >= 0); // mask off everything below first_thread const auto lower_mask = first_thread < 0 ? full_mask : ~((one_mask << first_thread) - 1u); @@ -147,8 +148,9 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( // only consider threads in the current block const auto first_thread = block_base_col - base_col; const auto last_thread = first_thread + block_size; - assert(first_thread < int(config::warp_size)); - assert(last_thread >= 0); + // HIP compiles these assertions in Release, traps unconditionally + // assert(first_thread < int(config::warp_size)); + // assert(last_thread >= 0); // mask off everything below first_thread const auto lower_mask = first_thread < 0 ? full_mask : ~((one_mask << first_thread) - 1u); From 480bbb51fc9616e8728056f0b37b6a3e835c2162 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 7 Mar 2022 14:12:31 +0100 Subject: [PATCH 15/17] fix dpcpp Csr classical SpMV with subgroup_size 1 --- dpcpp/matrix/csr_kernels.dp.cpp | 62 ++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 7c503cfd972..fb8066dbb61 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -676,16 +676,27 @@ void abstract_classical_spmv(dim3 grid, dim3 block, const size_type b_stride, ValueType* c, const size_type c_stride) { - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - subgroup_size)]] { - abstract_classical_spmv(num_rows, val, col_idxs, - row_ptrs, b, b_stride, c, - c_stride, item_ct1); - }); - }); + if (subgroup_size > 1) { + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, b_stride, c, + c_stride, item_ct1); + }); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + abstract_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, + b_stride, c, c_stride, item_ct1); + }); + }); + } } @@ -718,16 +729,27 @@ void abstract_classical_spmv(dim3 grid, dim3 block, const size_type b_stride, const ValueType* beta, ValueType* c, const size_type c_stride) { - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - subgroup_size)]] { - abstract_classical_spmv( - num_rows, alpha, val, col_idxs, row_ptrs, b, b_stride, beta, - c, c_stride, item_ct1); - }); - }); + if (subgroup_size > 1) { + 
queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, b_stride, + beta, c, c_stride, item_ct1); + }); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, + b, b_stride, beta, c, c_stride, item_ct1); + }); + }); + } } From 1d7ac95a17b465e4977ec60bcc6f050adb9f986d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 7 Apr 2022 18:49:40 +0200 Subject: [PATCH 16/17] review updates * use ConstArrayView for Dense::create_real_view * improve readability of Dense -> Fbcsr conversion * update documentation * remove unnecessary template parameters * more constexpr * fix generic matrix test formatting Co-authored-by: Aditya Kashi Co-authored-by: Yuhsiang M. Tsai --- common/cuda_hip/matrix/dense_kernels.hpp.inc | 15 ++++++++++----- core/matrix/dense.cpp | 14 +++++++------- cuda/matrix/fbcsr_kernels.cu | 4 ++-- hip/matrix/fbcsr_kernels.hip.cpp | 4 ++-- include/ginkgo/core/base/array.hpp | 19 +++++++++++++++++++ 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index 4002aefbbbb..acb148db555 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -92,8 +92,10 @@ __global__ // if we need to store something for the next iteration if ((base_col + config::warp_size) % block_size != 0) { // check whether the last block (incomplete) in this warp is nonzero - first_block_nonzero = - bool(warp.ballot(bool(local_mask)) >> (config::warp_size - 1)); + auto local_block_nonzero_mask = warp.ballot(local_mask != 0u); + bool last_block_nonzero = + (local_block_nonzero_mask >> (config::warp_size - 1)) != 0u; + first_block_nonzero = last_block_nonzero; } else { first_block_nonzero = false; } @@ -163,7 +165,8 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( const auto block_nonzero_mask = warp.ballot(local_mask && (block_local_col == block_size - 1)); - // count how many blocks come before this block in the current block + // count how many Fbcsr blocks come before the Fbcsr block handled by + // the local group of threads const auto block_nz = block_base_nz + popcnt(block_nonzero_mask & lane_prefix_mask); // now in a second sweep, store the actual elements @@ -183,8 +186,10 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( // if we need to store something for the next iteration if ((base_col + config::warp_size) % block_size != 0) { // check whether the last block (incomplete) in this warp is nonzero - first_block_nonzero = - bool(warp.ballot(bool(local_mask)) >> (config::warp_size - 1)); + auto local_block_nonzero_mask = warp.ballot(local_mask != 0u); + bool last_block_nonzero = + (local_block_nonzero_mask >> (config::warp_size - 1)) != 0u; + first_block_nonzero = last_block_nonzero; } else { first_block_nonzero = false; } diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 616cd9f482f..94cb35c42c9 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -1589,7 +1589,7 @@ std::unique_ptr::real_type> Dense::create_real_view() { const auto num_rows = this->get_size()[0]; - const bool complex = is_complex(); + constexpr bool complex = 
is_complex(); const auto num_cols = complex ? 2 * this->get_size()[1] : this->get_size()[1]; const auto stride = complex ? 2 * this->get_stride() : this->get_stride(); @@ -1608,17 +1608,17 @@ std::unique_ptr::real_type> Dense::create_real_view() const { const auto num_rows = this->get_size()[0]; - const bool complex = is_complex(); + constexpr bool complex = is_complex(); const auto num_cols = complex ? 2 * this->get_size()[1] : this->get_size()[1]; const auto stride = complex ? 2 * this->get_stride() : this->get_stride(); - return Dense>::create( + return Dense>::create_const( this->get_executor(), dim<2>{num_rows, num_cols}, - make_array_view(this->get_executor(), num_rows * stride, - const_cast*>( - reinterpret_cast*>( - this->get_const_values()))), + make_const_array_view( + this->get_executor(), num_rows * stride, + reinterpret_cast*>( + this->get_const_values())), stride); } diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index d0a73d67c2a..bec6ce29807 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -374,8 +374,8 @@ void is_sorted_by_column_index( *is_sorted = true; auto gpu_array = Array(exec, 1); // need to initialize the GPU value to true - exec->copy_from(exec->get_master().get(), 1, is_sorted, - gpu_array.get_data()); + exec->copy_from(exec->get_master().get(), 1, is_sorted, + gpu_array.get_data()); auto block_size = default_block_size; const auto num_brows = static_cast(to_check->get_num_block_rows()); diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 0e021bce36b..bf707859b02 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -314,8 +314,8 @@ void is_sorted_by_column_index( *is_sorted = true; auto gpu_array = Array(exec, 1); // need to initialize the GPU value to true - exec->copy_from(exec->get_master().get(), 1, is_sorted, - gpu_array.get_data()); + exec->copy_from(exec->get_master().get(), 1, is_sorted, + gpu_array.get_data()); auto block_size = default_block_size; const auto num_brows = static_cast(to_check->get_num_block_rows()); diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp index 2f95d5a4da7..35ed7985ca2 100644 --- a/include/ginkgo/core/base/array.hpp +++ b/include/ginkgo/core/base/array.hpp @@ -755,6 +755,25 @@ Array make_array_view(std::shared_ptr exec, } +/** + * Helper function to create a const array view deducing the value type. + * + * @param exec the executor on which the array resides + * @param size the number of elements for the array + * @param data the pointer to the array we create a view on. 
+ * + * @tparam ValueType the type of the array elements + * + * @return `Array::const_view(exec, size, data)` + */ +template +detail::ConstArrayView make_const_array_view( + std::shared_ptr exec, size_type size, const ValueType* data) +{ + return Array::const_view(exec, size, data); +} + + namespace detail { From 9ba1ee94d7c92099ae9571a177710489d65c97cb Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 22 Apr 2022 16:11:02 +0200 Subject: [PATCH 17/17] use ROCm 4.0 instead of 3.5 --- .gitlab-ci.yml | 10 +++++----- .gitlab/image.yml | 4 ++-- CHANGELOG.md | 2 +- README.md | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 282eac6f039..b3ee2f02a35 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -530,12 +530,12 @@ build/cuda114/nompi/gcc/cuda/debug/shared: EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" CUDA_ARCH: 61 -# ROCm 3.5 and friends -build/amd/nompi/gcc/rocm35/debug/shared: +# ROCm 4.0 and friends +build/amd/nompi/gcc/rocm40/debug/shared: <<: *default_build_with_test extends: - .quick_test_condition - - .use_gko-rocm35-openmpi-gnu5-llvm50 + - .use_gko-rocm40-openmpi-gnu5-llvm50 variables: <<: *default_variables BUILD_OMP: "ON" @@ -544,11 +544,11 @@ build/amd/nompi/gcc/rocm35/debug/shared: BUILD_TYPE: "Debug" FAST_TESTS: "ON" -build/amd/openmpi/clang/rocm35/release/static: +build/amd/openmpi/clang/rocm40/release/static: <<: *default_build_with_test extends: - .full_test_condition - - .use_gko-rocm35-openmpi-gnu5-llvm50 + - .use_gko-rocm40-openmpi-gnu5-llvm50 variables: <<: *default_variables C_COMPILER: "clang" diff --git a/.gitlab/image.yml b/.gitlab/image.yml index e64b2cfeb60..a8a60d062ad 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -72,8 +72,8 @@ - private_ci - nvidia-gpu -.use_gko-rocm35-openmpi-gnu5-llvm50: - image: ginkgohub/rocm:35-openmpi-gnu5-llvm50 +.use_gko-rocm40-openmpi-gnu5-llvm50: + image: ginkgohub/rocm:40-openmpi-gnu5-llvm50 tags: - private_ci - amdci diff --git a/CHANGELOG.md b/CHANGELOG.md index 238f8921921..e1e411bea44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,7 +37,7 @@ Supported systems and requirements: + Intel compiler: 2018+ + Apple LLVM: 8.0+ + CUDA module: CUDA 9.0+ - + HIP module: ROCm 3.5+ + + HIP module: ROCm 4.0+ + DPC++ module: Intel OneAPI 2021.3. Set the CXX compiler to `dpcpp`. + Windows + MinGW and Cygwin: gcc 5.3+, 6.3+, 7.3+, all versions after 8.1+ diff --git a/README.md b/README.md index 53c096a3a7e..f234291e16d 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ The Ginkgo CUDA module has the following __additional__ requirements: The Ginkgo HIP module has the following __additional__ requirements: -* _ROCm 3.5+_ +* _ROCm 4.0+_ * the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either: * _AMD_ backend (using the `clang` compiler) * _9.2 <= CUDA < 11_ backend
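A minimal usage sketch for the make_const_array_view helper added above, assuming <ginkgo/ginkgo.hpp> is included and mirroring the create_const pattern used in Dense::create_real_view; the executor choice and concrete values are only illustrative:

    auto exec = gko::ReferenceExecutor::create();
    const double host_values[] = {1.0, 2.0, 3.0};
    // wrap existing read-only data without copying, then hand the view to a
    // constant 3x1 Dense vector with unit stride
    auto vec = gko::matrix::Dense<double>::create_const(
        exec, gko::dim<2>{3, 1},
        gko::make_const_array_view(exec, 3, host_values), 1);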