diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 282eac6f039..b3ee2f02a35 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -530,12 +530,12 @@ build/cuda114/nompi/gcc/cuda/debug/shared: EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" CUDA_ARCH: 61 -# ROCm 3.5 and friends -build/amd/nompi/gcc/rocm35/debug/shared: +# ROCm 4.0 and friends +build/amd/nompi/gcc/rocm40/debug/shared: <<: *default_build_with_test extends: - .quick_test_condition - - .use_gko-rocm35-openmpi-gnu5-llvm50 + - .use_gko-rocm40-openmpi-gnu5-llvm50 variables: <<: *default_variables BUILD_OMP: "ON" @@ -544,11 +544,11 @@ build/amd/nompi/gcc/rocm35/debug/shared: BUILD_TYPE: "Debug" FAST_TESTS: "ON" -build/amd/openmpi/clang/rocm35/release/static: +build/amd/openmpi/clang/rocm40/release/static: <<: *default_build_with_test extends: - .full_test_condition - - .use_gko-rocm35-openmpi-gnu5-llvm50 + - .use_gko-rocm40-openmpi-gnu5-llvm50 variables: <<: *default_variables C_COMPILER: "clang" diff --git a/.gitlab/image.yml b/.gitlab/image.yml index e64b2cfeb60..a8a60d062ad 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -72,8 +72,8 @@ - private_ci - nvidia-gpu -.use_gko-rocm35-openmpi-gnu5-llvm50: - image: ginkgohub/rocm:35-openmpi-gnu5-llvm50 +.use_gko-rocm40-openmpi-gnu5-llvm50: + image: ginkgohub/rocm:40-openmpi-gnu5-llvm50 tags: - private_ci - amdci diff --git a/CHANGELOG.md b/CHANGELOG.md index 238f8921921..e1e411bea44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,7 +37,7 @@ Supported systems and requirements: + Intel compiler: 2018+ + Apple LLVM: 8.0+ + CUDA module: CUDA 9.0+ - + HIP module: ROCm 3.5+ + + HIP module: ROCm 4.0+ + DPC++ module: Intel OneAPI 2021.3. Set the CXX compiler to `dpcpp`. + Windows + MinGW and Cygwin: gcc 5.3+, 6.3+, 7.3+, all versions after 8.1+ diff --git a/README.md b/README.md index 53c096a3a7e..f234291e16d 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ The Ginkgo CUDA module has the following __additional__ requirements: The Ginkgo HIP module has the following __additional__ requirements: -* _ROCm 3.5+_ +* _ROCm 4.0+_ * the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either: * _AMD_ backend (using the `clang` compiler) * _9.2 <= CUDA < 11_ backend diff --git a/common/cuda_hip/matrix/csr_common.hpp.inc b/common/cuda_hip/matrix/csr_common.hpp.inc new file mode 100644 index 00000000000..d22981c1edf --- /dev/null +++ b/common/cuda_hip/matrix/csr_common.hpp.inc @@ -0,0 +1,111 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void check_unsorted( + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, IndexType num_rows, bool* flag) +{ + __shared__ bool sh_flag; + auto block = group::this_thread_block(); + if (block.thread_rank() == 0) { + sh_flag = *flag; + } + block.sync(); + + auto row = thread::get_thread_id_flat(); + if (row >= num_rows) { + return; + } + + // fail early + if (sh_flag) { + for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1] - 1; ++nz) { + if (col_idxs[nz] > col_idxs[nz + 1]) { + *flag = false; + sh_flag = false; + return; + } + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void conjugate( + size_type num_nonzeros, ValueType* __restrict__ val) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx < num_nonzeros) { + val[tidx] = conj(val[tidx]); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void check_diagonal_entries( + const IndexType num_min_rows_cols, + const IndexType* const __restrict__ row_ptrs, + const IndexType* const __restrict__ col_idxs, + bool* const __restrict__ has_all_diags) +{ + constexpr int warp_size = config::warp_size; + auto tile_grp = + group::tiled_partition(group::this_thread_block()); + const auto row = thread::get_subwarp_id_flat(); + if (row < num_min_rows_cols) { + const auto tid_in_warp = tile_grp.thread_rank(); + const IndexType row_start = row_ptrs[row]; + const IndexType num_nz = row_ptrs[row + 1] - row_start; + bool row_has_diag_local{false}; + for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) { + if (col_idxs[iz + row_start] == row) { + row_has_diag_local = true; + break; + } + } + auto row_has_diag = static_cast(tile_grp.any(row_has_diag_local)); + if (!row_has_diag) { + if (tile_grp.thread_rank() == 0) { + *has_all_diags = false; + } + return; + } + } +} + + +} // namespace kernel diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 075e1607ace..4ed54310797 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -601,36 +601,6 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense( } -template -__global__ __launch_bounds__(default_block_size) void check_unsorted( - const IndexType* __restrict__ row_ptrs, - const IndexType* __restrict__ col_idxs, IndexType num_rows, bool* flag) -{ - __shared__ bool sh_flag; - auto block = group::this_thread_block(); - if (block.thread_rank() == 0) { - sh_flag = *flag; - } - block.sync(); - - auto row = thread::get_thread_id_flat(); - if (row >= num_rows) { - return; - } - - // fail early - if (sh_flag) { - for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1] - 1; ++nz) { - if (col_idxs[nz] > col_idxs[nz + 1]) { - *flag = false; - sh_flag = false; - return; - } - } - } -} - - template __global__ __launch_bounds__(default_block_size) void extract_diagonal( size_type diag_size, size_type nnz, @@ -657,29 +627,8 @@ __global__ __launch_bounds__(default_block_size) void extract_diagonal( } -} // namespace kernel - - -namespace { - - -template -__global__ __launch_bounds__(default_block_size) void conjugate_kernel( - size_type num_nonzeros, ValueType* __restrict__ val) -{ - const auto tidx = thread::get_thread_id_flat(); - - if (tidx < num_nonzeros) { - val[tidx] = conj(val[tidx]); - } -} - - -} // namespace - - template -__global__ __launch_bounds__(default_block_size) void row_ptr_permute_kernel( +__global__ __launch_bounds__(default_block_size) void row_ptr_permute( size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, IndexType* __restrict__ out_nnz) { @@ -694,11 +643,9 @@ __global__ __launch_bounds__(default_block_size) void row_ptr_permute_kernel( template -__global__ - __launch_bounds__(default_block_size) void inv_row_ptr_permute_kernel( - size_type num_rows, const IndexType* __restrict__ permutation, - const IndexType* __restrict__ in_row_ptrs, - IndexType* __restrict__ out_nnz) +__global__ __launch_bounds__(default_block_size) void inv_row_ptr_permute( + size_type num_rows, const IndexType* __restrict__ permutation, + const IndexType* __restrict__ in_row_ptrs, IndexType* __restrict__ out_nnz) { auto tid = thread::get_thread_id_flat(); if (tid >= num_rows) { @@ -711,7 +658,7 @@ __global__ template -__global__ __launch_bounds__(default_block_size) void row_permute_kernel( +__global__ __launch_bounds__(default_block_size) void row_permute( size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, const IndexType* __restrict__ in_cols, @@ -737,7 +684,7 @@ __global__ __launch_bounds__(default_block_size) void row_permute_kernel( template -__global__ __launch_bounds__(default_block_size) void inv_row_permute_kernel( +__global__ __launch_bounds__(default_block_size) void inv_row_permute( size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, const IndexType* __restrict__ in_cols, @@ -763,7 +710,7 @@ __global__ __launch_bounds__(default_block_size) void inv_row_permute_kernel( template -__global__ __launch_bounds__(default_block_size) void inv_symm_permute_kernel( +__global__ __launch_bounds__(default_block_size) void inv_symm_permute( size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, const IndexType* __restrict__ in_cols, @@ -788,9 +735,6 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute_kernel( } -namespace kernel { - - template __global__ __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals( @@ -869,37 +813,105 @@ __global__ __launch_bounds__(default_block_size) void add_scaled_identity( } -template -__global__ __launch_bounds__(default_block_size) void check_diagonal_entries( - const IndexType num_min_rows_cols, - const IndexType* const __restrict__ row_ptrs, - const IndexType* const __restrict__ col_idxs, - bool* const __restrict__ has_all_diags) -{ - constexpr int warp_size = config::warp_size; - auto tile_grp = - group::tiled_partition(group::this_thread_block()); - const auto row = thread::get_subwarp_id_flat(); - if (row < num_min_rows_cols) { - const auto tid_in_warp = tile_grp.thread_rank(); - const IndexType row_start = row_ptrs[row]; - const IndexType num_nz = row_ptrs[row + 1] - row_start; - bool row_has_diag_local{false}; - for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) { - if (col_idxs[iz + row_start] == row) { - row_has_diag_local = true; - break; - } - } - auto row_has_diag = static_cast(tile_grp.any(row_has_diag_local)); - if (!row_has_diag) { - if (tile_grp.thread_rank() == 0) { - *has_all_diags = false; - } - return; - } +} // namespace kernel + + +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Csr* source, int bs, + Array& block_row_ptr_array, + Array& block_col_idx_array, + Array& block_value_array) +{ + using tuple_type = thrust::tuple; + const auto nnz = source->get_num_stored_elements(); + Array in_row_idxs{exec, nnz}; + Array in_col_idxs{exec, nnz}; + Array in_values{exec, nnz}; + exec->copy(nnz, source->get_const_col_idxs(), in_col_idxs.get_data()); + exec->copy(nnz, source->get_const_values(), in_values.get_data()); + components::convert_ptrs_to_idxs(exec, source->get_const_row_ptrs(), + source->get_size()[0], + in_row_idxs.get_data()); + auto block_row_ptrs = block_row_ptr_array.get_data(); + auto num_block_rows = block_row_ptr_array.get_num_elems() - 1; + if (nnz == 0) { + components::fill_array(exec, block_row_ptrs, num_block_rows + 1, + IndexType{}); + block_col_idx_array.resize_and_reset(0); + block_value_array.resize_and_reset(0); + return; } + auto in_rows = in_row_idxs.get_data(); + auto in_cols = in_col_idxs.get_data(); + // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken + // due to overly generic assignment operator and constructor leading to + // ambiguities. So we need to use our own fake_complex type + auto in_vals = + reinterpret_cast*>(in_values.get_data()); + auto in_loc_it = thrust::make_zip_iterator( + thrust::make_tuple(thrust::device_pointer_cast(in_rows), + thrust::device_pointer_cast(in_cols))); + thrust::sort_by_key(thrust::device, in_loc_it, in_loc_it + nnz, + thrust::device_pointer_cast(in_vals), + [bs] __device__(tuple_type a, tuple_type b) { + return thrust::make_pair(thrust::get<0>(a) / bs, + thrust::get<1>(a) / bs) < + thrust::make_pair(thrust::get<0>(b) / bs, + thrust::get<1>(b) / bs); + }); + // build block pattern + auto adj_predicate = [bs, in_rows, in_cols, nnz] __device__(size_type i) { + const auto a_block_row = i > 0 ? in_rows[i - 1] / bs : -1; + const auto a_block_col = i > 0 ? in_cols[i - 1] / bs : -1; + const auto b_block_row = in_rows[i] / bs; + const auto b_block_col = in_cols[i] / bs; + return (a_block_row != b_block_row) || (a_block_col != b_block_col); + }; + auto iota = thrust::make_counting_iterator(size_type{}); + // count how many blocks we have by counting how often the block changes + auto num_blocks = static_cast( + thrust::count_if(thrust::device, iota, iota + nnz, adj_predicate)); + // allocate storage + Array block_row_idx_array{exec, num_blocks}; + Array block_ptr_array{exec, num_blocks}; + block_col_idx_array.resize_and_reset(num_blocks); + block_value_array.resize_and_reset(num_blocks * bs * bs); + auto row_idxs = block_row_idx_array.get_data(); + auto col_idxs = block_col_idx_array.get_data(); + auto values = as_device_type(block_value_array.get_data()); + auto block_ptrs = block_ptr_array.get_data(); + auto block_ptr_it = thrust::device_pointer_cast(block_ptrs); + // write (block_row, block_col, block_start_idx) tuples for each block + thrust::copy_if(thrust::device, iota, iota + nnz, block_ptr_it, + adj_predicate); + auto block_output_it = thrust::make_zip_iterator( + thrust::make_tuple(thrust::device_pointer_cast(row_idxs), + thrust::device_pointer_cast(col_idxs))); + thrust::transform( + thrust::device, block_ptr_it, block_ptr_it + num_blocks, + block_output_it, [bs, in_rows, in_cols] __device__(size_type i) { + return thrust::make_tuple(in_rows[i] / bs, in_cols[i] / bs); + }); + // build row pointers from row indices + components::convert_idxs_to_ptrs(exec, block_row_idx_array.get_const_data(), + block_row_idx_array.get_num_elems(), + num_block_rows, block_row_ptrs); + // fill in values + components::fill_array(exec, block_value_array.get_data(), + num_blocks * bs * bs, zero()); + thrust::for_each_n( + thrust::device, iota, num_blocks, + [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals, + values] __device__(size_type i) { + const auto block_begin = block_ptrs[i]; + const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz; + for (auto nz = block_begin; nz < block_end; nz++) { + values[i * bs * bs + (in_cols[nz] % bs) * bs + + (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]); + } + }); } - -} // namespace kernel +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index c4cdc214e76..acb148db555 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -33,6 +33,172 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace kernel { +template +__global__ + __launch_bounds__(default_block_size) void count_nonzero_blocks_per_row( + size_type num_block_rows, size_type num_block_cols, size_type stride, + int block_size, const ValueType* __restrict__ source, + IndexType* __restrict__ block_row_nnz) +{ + const auto brow = + thread::get_subwarp_id_flat(); + + if (brow >= num_block_rows) { + return; + } + + const auto bs_sq = block_size * block_size; + const auto num_cols = num_block_cols * block_size; + auto warp = + group::tiled_partition(group::this_thread_block()); + const auto lane = static_cast(warp.thread_rank()); + constexpr auto full_mask = ~config::lane_mask_type{}; + constexpr auto one_mask = config::lane_mask_type{1}; + bool first_block_nonzero = false; + IndexType block_count{}; + for (IndexType base_col = 0; base_col < num_cols; + base_col += config::warp_size) { + const auto col = base_col + lane; + const auto block_local_col = col % block_size; + // which is the first column in the current block? + const auto block_base_col = col - block_local_col; + // collect nonzero bitmask + bool local_nonzero = false; + for (int local_row = 0; local_row < block_size; local_row++) { + const auto row = local_row + brow * block_size; + local_nonzero |= + col < num_cols && is_nonzero(source[row * stride + col]); + } + auto nonzero_mask = + warp.ballot(local_nonzero) | (first_block_nonzero ? 1u : 0u); + // only consider threads in the current block + const auto first_thread = block_base_col - base_col; + const auto last_thread = first_thread + block_size; + // HIP compiles these assertions in Release, traps unconditionally + // assert(first_thread < int(config::warp_size)); + // assert(last_thread >= 0); + // mask off everything below first_thread + const auto lower_mask = + first_thread < 0 ? full_mask : ~((one_mask << first_thread) - 1u); + // mask off everything from last_thread + const auto upper_mask = last_thread >= config::warp_size + ? full_mask + : ((one_mask << last_thread) - 1u); + const auto block_mask = upper_mask & lower_mask; + const auto local_mask = nonzero_mask & block_mask; + // last column in the block increments the counter + block_count += + (block_local_col == block_size - 1 && local_mask) ? 1 : 0; + // if we need to store something for the next iteration + if ((base_col + config::warp_size) % block_size != 0) { + // check whether the last block (incomplete) in this warp is nonzero + auto local_block_nonzero_mask = warp.ballot(local_mask != 0u); + bool last_block_nonzero = + (local_block_nonzero_mask >> (config::warp_size - 1)) != 0u; + first_block_nonzero = last_block_nonzero; + } else { + first_block_nonzero = false; + } + } + block_count = reduce(warp, block_count, + [](IndexType a, IndexType b) { return a + b; }); + if (lane == 0) { + block_row_nnz[brow] = block_count; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( + size_type num_block_rows, size_type num_block_cols, size_type stride, + int block_size, const ValueType* __restrict__ source, + const IndexType* __restrict__ block_row_ptrs, + IndexType* __restrict__ block_cols, ValueType* __restrict__ blocks) +{ + const auto brow = + thread::get_subwarp_id_flat(); + + if (brow >= num_block_rows) { + return; + } + + const auto bs_sq = block_size * block_size; + const auto num_cols = num_block_cols * block_size; + auto warp = + group::tiled_partition(group::this_thread_block()); + const auto lane = static_cast(warp.thread_rank()); + constexpr auto full_mask = ~config::lane_mask_type{}; + constexpr auto one_mask = config::lane_mask_type{1}; + const auto lane_prefix_mask = (one_mask << warp.thread_rank()) - 1u; + bool first_block_nonzero = false; + auto block_base_nz = block_row_ptrs[brow]; + for (IndexType base_col = 0; base_col < num_cols; + base_col += config::warp_size) { + const auto col = base_col + lane; + const auto block_local_col = col % block_size; + // which is the first column in the current block? + const auto block_base_col = col - block_local_col; + // collect nonzero bitmask + bool local_nonzero = false; + for (int local_row = 0; local_row < block_size; local_row++) { + const auto row = local_row + brow * block_size; + local_nonzero |= + col < num_cols && is_nonzero(source[row * stride + col]); + } + auto nonzero_mask = + warp.ballot(local_nonzero) | (first_block_nonzero ? 1u : 0u); + // only consider threads in the current block + const auto first_thread = block_base_col - base_col; + const auto last_thread = first_thread + block_size; + // HIP compiles these assertions in Release, traps unconditionally + // assert(first_thread < int(config::warp_size)); + // assert(last_thread >= 0); + // mask off everything below first_thread + const auto lower_mask = + first_thread < 0 ? full_mask : ~((one_mask << first_thread) - 1u); + // mask off everything from last_thread + const auto upper_mask = last_thread >= config::warp_size + ? full_mask + : ((one_mask << last_thread) - 1u); + const auto block_mask = upper_mask & lower_mask; + const auto local_mask = nonzero_mask & block_mask; + const auto block_nonzero_mask = + warp.ballot(local_mask && (block_local_col == block_size - 1)); + + // count how many Fbcsr blocks come before the Fbcsr block handled by + // the local group of threads + const auto block_nz = + block_base_nz + popcnt(block_nonzero_mask & lane_prefix_mask); + // now in a second sweep, store the actual elements + if (local_mask) { + if (block_local_col == block_size - 1) { + block_cols[block_nz] = col / block_size; + } + // only if we encountered elements in this column + if (local_nonzero) { + for (int local_row = 0; local_row < block_size; local_row++) { + const auto row = local_row + brow * block_size; + blocks[local_row + block_local_col * block_size + + block_nz * bs_sq] = source[row * stride + col]; + } + } + } + // if we need to store something for the next iteration + if ((base_col + config::warp_size) % block_size != 0) { + // check whether the last block (incomplete) in this warp is nonzero + auto local_block_nonzero_mask = warp.ballot(local_mask != 0u); + bool last_block_nonzero = + (local_block_nonzero_mask >> (config::warp_size - 1)) != 0u; + first_block_nonzero = last_block_nonzero; + } else { + first_block_nonzero = false; + } + // advance by the completed blocks + block_base_nz += popcnt(block_nonzero_mask); + } +} + + template __global__ __launch_bounds__(default_block_size) void fill_in_coo( size_type num_rows, size_type num_cols, size_type stride, diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index 51feaf82fe6..b703fc783ce 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -93,9 +93,9 @@ __global__ __launch_bounds__(default_block_size) void convert_to_csr( if (block_row == 0 && lane == 0) { row_ptrs[0] = 0; } - if (lane < block_size) { - row_ptrs[first_row + lane + 1] = - block_row_begin + num_blocks * block_size * (lane + 1); + for (auto i = lane; i < block_size; i += config::warp_size) { + row_ptrs[first_row + i + 1] = + block_row_begin + num_blocks * block_size * (i + 1); } for (IndexType i = lane; i < num_blocks * block_size * block_size; i += config::warp_size) { @@ -116,25 +116,58 @@ __global__ __launch_bounds__(default_block_size) void convert_to_csr( } +template +__global__ __launch_bounds__(default_block_size) void fill_in_dense( + const IndexType* block_row_ptrs, const IndexType* block_col_idxs, + const ValueType* blocks, ValueType* values, size_type stride, + size_type num_block_rows, int block_size) +{ + const auto block_row = thread::get_subwarp_id_flat(); + if (block_row >= num_block_rows) { + return; + } + const auto block_begin = block_row_ptrs[block_row]; + const auto block_end = block_row_ptrs[block_row + 1]; + const auto num_blocks = block_end - block_begin; + const auto block_row_begin = block_begin * block_size * block_size; + const auto num_entries = num_blocks * block_size * block_size; + const auto bs_sq = block_size * block_size; + const auto warp = + group::tiled_partition(group::this_thread_block()); + const auto lane = warp.thread_rank(); + for (IndexType nz = lane; nz < num_entries; nz += config::warp_size) { + const auto local_id = nz % bs_sq; + const auto block = nz / bs_sq; + const auto local_row = local_id % block_size; + const auto local_col = local_id / block_size; + const auto col = + block_col_idxs[block + block_begin] * block_size + local_col; + const auto row = block_row * block_size + local_row; + values[row * stride + col] = blocks[nz + block_row_begin]; + } +} + + } // namespace kernel template void fill_in_matrix_data(std::shared_ptr exec, device_matrix_data& data, - int block_size, Array& row_ptr_array, - Array& col_idx_array, - Array& value_array) + int block_size, Array& block_row_ptr_array, + Array& block_col_idx_array, + Array& block_value_array) { using tuple_type = thrust::tuple; const auto nnz = data.get_num_elems(); const auto bs = block_size; - auto row_ptrs = row_ptr_array.get_data(); - auto num_rows = row_ptr_array.get_num_elems() - 1; + auto block_row_ptrs = block_row_ptr_array.get_data(); + auto num_block_rows = block_row_ptr_array.get_num_elems() - 1; if (nnz == 0) { - components::fill_array(exec, row_ptrs, num_rows + 1, IndexType{}); - col_idx_array.resize_and_reset(0); - value_array.resize_and_reset(0); + components::fill_array(exec, block_row_ptrs, num_block_rows + 1, + IndexType{}); + block_col_idx_array.resize_and_reset(0); + block_value_array.resize_and_reset(0); return; } auto in_rows = data.get_row_idxs(); @@ -168,42 +201,43 @@ void fill_in_matrix_data(std::shared_ptr exec, auto num_blocks = static_cast( thrust::count_if(thrust::device, iota, iota + nnz, adj_predicate)); // allocate storage - Array row_idx_array{exec, num_blocks}; + Array block_row_idx_array{exec, num_blocks}; Array block_ptr_array{exec, num_blocks}; - col_idx_array.resize_and_reset(num_blocks); - value_array.resize_and_reset(num_blocks * bs * bs); - auto row_idxs = row_idx_array.get_data(); - auto col_idxs = col_idx_array.get_data(); - auto values = as_device_type(value_array.get_data()); + block_col_idx_array.resize_and_reset(num_blocks); + block_value_array.resize_and_reset(num_blocks * bs * bs); + auto block_row_idxs = block_row_idx_array.get_data(); + auto block_col_idxs = block_col_idx_array.get_data(); + auto block_values = as_device_type(block_value_array.get_data()); auto block_ptrs = block_ptr_array.get_data(); auto block_ptr_it = thrust::device_pointer_cast(block_ptrs); // write (block_row, block_col, block_start_idx) tuples for each block thrust::copy_if(thrust::device, iota, iota + nnz, block_ptr_it, adj_predicate); auto block_output_it = thrust::make_zip_iterator( - thrust::make_tuple(thrust::device_pointer_cast(row_idxs), - thrust::device_pointer_cast(col_idxs))); + thrust::make_tuple(thrust::device_pointer_cast(block_row_idxs), + thrust::device_pointer_cast(block_col_idxs))); thrust::transform( thrust::device, block_ptr_it, block_ptr_it + num_blocks, block_output_it, [bs, in_rows, in_cols] __device__(size_type i) { return thrust::make_tuple(in_rows[i] / bs, in_cols[i] / bs); }); // build row pointers from row indices - components::convert_idxs_to_ptrs(exec, row_idx_array.get_const_data(), - row_idx_array.get_num_elems(), num_rows, - row_ptrs); + components::convert_idxs_to_ptrs(exec, block_row_idx_array.get_const_data(), + block_row_idx_array.get_num_elems(), + num_block_rows, block_row_ptrs); // fill in values - components::fill_array(exec, value_array.get_data(), num_blocks * bs * bs, - zero()); + components::fill_array(exec, block_value_array.get_data(), + num_blocks * bs * bs, zero()); thrust::for_each_n( thrust::device, iota, num_blocks, [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals, - values] __device__(size_type i) { + block_values] __device__(size_type i) { const auto block_begin = block_ptrs[i]; const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz; for (auto nz = block_begin; nz < block_end; nz++) { - values[i * bs * bs + (in_cols[nz] % bs) * bs + - (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]); + block_values[i * bs * bs + (in_cols[nz] % bs) * bs + + (in_rows[nz] % bs)] = + fake_complex_unpack(in_vals[nz]); } }); } diff --git a/core/base/composition.cpp b/core/base/composition.cpp index 21140fe37e9..8f927fc8baf 100644 --- a/core/base/composition.cpp +++ b/core/base/composition.cpp @@ -82,8 +82,8 @@ std::unique_ptr apply_inner_operators( auto op_size = operators.back()->get_size(); auto out_dim = gko::dim<2>{op_size[0], num_rhs}; auto out_size = out_dim[0] * num_rhs; - auto out = Dense::create( - exec, out_dim, Array::view(exec, out_size, data), num_rhs); + auto out = Dense::create(exec, out_dim, + make_array_view(exec, out_size, data), num_rhs); // for operators with initial guess: set initial guess if (operators.back()->apply_uses_initial_guess()) { if (op_size[0] == op_size[1]) { @@ -111,8 +111,7 @@ std::unique_ptr apply_inner_operators( data + (reversed_storage ? storage_size - out_size : size_type{}); reversed_storage = !reversed_storage; out = Dense::create(exec, out_dim, - Array::view(exec, out_size, out_data), - num_rhs); + make_array_view(exec, out_size, out_data), num_rhs); // for operators with initial guess: set initial guess if (operators[i]->apply_uses_initial_guess()) { if (op_size[0] == op_size[1]) { diff --git a/core/factorization/par_ic.cpp b/core/factorization/par_ic.cpp index e54538ad9e3..a2d10271aff 100644 --- a/core/factorization/par_ic.cpp +++ b/core/factorization/par_ic.cpp @@ -122,12 +122,10 @@ std::unique_ptr> ParIc::generate( // build COO representation of lower factor Array l_row_idxs{exec, l_nnz}; // copy values from l_factor, which are the lower triangular values of A - auto l_vals_view = - Array::view(exec, l_nnz, l_factor->get_values()); + auto l_vals_view = make_array_view(exec, l_nnz, l_factor->get_values()); auto a_vals = Array{exec, l_vals_view}; auto a_row_idxs = Array{exec, l_nnz}; - auto a_col_idxs = - Array::view(exec, l_nnz, l_factor->get_col_idxs()); + auto a_col_idxs = make_array_view(exec, l_nnz, l_factor->get_col_idxs()); auto a_lower_coo = CooMatrix::create(exec, matrix_size, std::move(a_vals), std::move(a_col_idxs), std::move(a_row_idxs)); diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp index cfbdfeee88a..742c8b9811f 100644 --- a/core/factorization/par_ict.cpp +++ b/core/factorization/par_ict.cpp @@ -243,9 +243,9 @@ void ParIctState::iterate() l_builder.get_row_idx_array().resize_and_reset(l_nnz); // update arrays that will be aliased l_builder.get_col_idx_array() = - Array::view(exec, l_nnz, l_new->get_col_idxs()); + make_array_view(exec, l_nnz, l_new->get_col_idxs()); l_builder.get_value_array() = - Array::view(exec, l_nnz, l_new->get_values()); + make_array_view(exec, l_nnz, l_new->get_values()); } // convert L into COO format diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp index e54eac93af5..8f2c0edbfa1 100644 --- a/core/factorization/par_ilut.cpp +++ b/core/factorization/par_ilut.cpp @@ -277,13 +277,13 @@ void ParIlutState::iterate() u_csc_builder.get_value_array().resize_and_reset(u_nnz); // update arrays that will be aliased l_builder.get_col_idx_array() = - Array::view(exec, l_nnz, l_new->get_col_idxs()); + make_array_view(exec, l_nnz, l_new->get_col_idxs()); u_builder.get_col_idx_array() = - Array::view(exec, u_nnz, u_new->get_col_idxs()); + make_array_view(exec, u_nnz, u_new->get_col_idxs()); l_builder.get_value_array() = - Array::view(exec, l_nnz, l_new->get_values()); + make_array_view(exec, l_nnz, l_new->get_values()); u_builder.get_value_array() = - Array::view(exec, u_nnz, u_new->get_values()); + make_array_view(exec, u_nnz, u_new->get_values()); } // convert U' into CSC format diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index b2a92cbbe95..94cb35c42c9 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -462,31 +462,30 @@ void Dense::compute_norm1_impl(LinOp* result) const template void Dense::convert_to(Dense* result) const { - if (this->get_size() && result->get_size() == this->get_size()) { - // we need to create a executor-local clone of the target data, that - // will be copied back later. - auto exec = this->get_executor(); - auto result_array = make_temporary_output_clone(exec, &result->values_); - // create a (value, not pointer to avoid allocation overhead) view - // matrix on the array to avoid special-casing cross-executor copies - auto tmp_result = - Dense{exec, result->get_size(), - Array::view(exec, result_array->get_num_elems(), - result_array->get_data()), - result->get_stride()}; - exec->run(dense::make_copy(this, &tmp_result)); - } else { - result->values_ = this->values_; - result->stride_ = this->stride_; - result->set_size(this->get_size()); + if (result->get_size() != this->get_size()) { + result->resize(this->get_size()); } + // we need to create a temporary clone of the target data to write to + auto exec = this->get_executor(); + auto result_output = make_temporary_clone(exec, &result->values_); + // create a (value, not pointer to avoid allocation overhead) view + // matrix on the array to avoid special-casing cross-executor copies + auto tmp_result = + Dense{exec, this->get_size(), + make_array_view(exec, result_output->get_num_elems(), + result_output->get_data()), + result->get_stride()}; + exec->run(dense::make_copy(this, &tmp_result)); } template void Dense::move_to(Dense* result) { - this->convert_to(result); + result->values_ = std::move(this->values_); + result->stride_ = std::exchange(this->stride_, 0); + result->set_size(this->get_size()); + this->resize(gko::dim<2>{}); } @@ -628,6 +627,7 @@ void Dense::convert_impl(Fbcsr* result) const exec->copy_val_to_host(tmp->get_const_row_ptrs() + row_blocks); tmp->col_idxs_.resize_and_reset(nnz_blocks); tmp->values_.resize_and_reset(nnz_blocks * bs * bs); + tmp->values_.fill(zero()); tmp->set_size(this->get_size()); exec->run(dense::make_convert_to_fbcsr(this, tmp.get())); } @@ -1584,6 +1584,64 @@ void Dense::add_scaled_identity_impl(const LinOp* const a, } +template +std::unique_ptr::real_type> +Dense::create_real_view() +{ + const auto num_rows = this->get_size()[0]; + constexpr bool complex = is_complex(); + const auto num_cols = + complex ? 2 * this->get_size()[1] : this->get_size()[1]; + const auto stride = complex ? 2 * this->get_stride() : this->get_stride(); + + return Dense>::create( + this->get_executor(), dim<2>{num_rows, num_cols}, + make_array_view( + this->get_executor(), num_rows * stride, + reinterpret_cast*>(this->get_values())), + stride); +} + + +template +std::unique_ptr::real_type> +Dense::create_real_view() const +{ + const auto num_rows = this->get_size()[0]; + constexpr bool complex = is_complex(); + const auto num_cols = + complex ? 2 * this->get_size()[1] : this->get_size()[1]; + const auto stride = complex ? 2 * this->get_stride() : this->get_stride(); + + return Dense>::create_const( + this->get_executor(), dim<2>{num_rows, num_cols}, + make_const_array_view( + this->get_executor(), num_rows * stride, + reinterpret_cast*>( + this->get_const_values())), + stride); +} + + +template +std::unique_ptr> Dense::create_submatrix_impl( + const span& rows, const span& columns, const size_type stride) +{ + row_major_range range_this{this->get_values(), this->get_size()[0], + this->get_size()[1], this->get_stride()}; + auto range_result = range_this(rows, columns); + size_type storage_size = + rows.length() > 0 + ? range_result.length(0) * this->get_stride() - columns.begin + : 0; + return Dense::create( + this->get_executor(), + dim<2>{range_result.length(0), range_result.length(1)}, + make_array_view(this->get_executor(), storage_size, range_result->data), + stride); +} + + #define GKO_DECLARE_DENSE_MATRIX(_type) class Dense<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_MATRIX); diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index 3325638fbde..a651fa41cec 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -232,9 +232,12 @@ void Fbcsr::read(device_mat_data&& data) this->set_size(data.get_size()); row_ptrs_.resize_and_reset(row_blocks + 1); auto exec = this->get_executor(); - auto local_data = make_temporary_clone(exec, &data); - exec->run(fbcsr::make_fill_in_matrix_data(*local_data, bs_, row_ptrs_, - col_idxs_, values_)); + { + auto local_data = make_temporary_clone(exec, &data); + exec->run(fbcsr::make_fill_in_matrix_data(*local_data, bs_, row_ptrs_, + col_idxs_, values_)); + } + // this needs to happen after the temporary clone copy-back data.empty_out(); } diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index e9f9352bf7b..22c7b61012e 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -273,9 +273,9 @@ void Jacobi::generate(const LinOp* system_matrix, if (!diag_vt) { GKO_NOT_SUPPORTED(system_matrix); } - auto temp = Array::view(diag_vt->get_executor(), - diag_vt->get_size()[0], - diag_vt->get_values()); + auto temp = + make_array_view(diag_vt->get_executor(), diag_vt->get_size()[0], + diag_vt->get_values()); this->blocks_ = Array(exec, temp.get_num_elems()); exec->run(jacobi::make_invert_diagonal(temp, this->blocks_)); this->num_blocks_ = diag_vt->get_size()[0]; diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp index d0acb5c68cf..d48bd0bea7e 100644 --- a/core/test/base/array.cpp +++ b/core/test/base/array.cpp @@ -356,7 +356,7 @@ TYPED_TEST(Array, CanBeResized) TYPED_TEST(Array, ViewCannotBeResized) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); EXPECT_THROW(view.resize_and_reset(1), gko::NotSupported); EXPECT_EQ(view.get_num_elems(), 3); @@ -416,7 +416,7 @@ TYPED_TEST(Array, ChangesExecutors) TYPED_TEST(Array, ViewModifiesOriginalData) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); TypeParam new_data[] = {5, 4, 2}; std::copy(new_data, new_data + 3, view.get_data()); @@ -448,11 +448,11 @@ TYPED_TEST(Array, CopyArrayToArray) TYPED_TEST(Array, CopyViewToView) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); TypeParam data2[] = {5, 4, 2}; - auto view2 = gko::Array::view(this->exec, 3, data2); + auto view2 = gko::make_array_view(this->exec, 3, data2); TypeParam data_size4[] = {5, 4, 2, 1}; - auto view_size4 = gko::Array::view(this->exec, 4, data_size4); + auto view_size4 = gko::make_array_view(this->exec, 4, data_size4); view = view2; view2.get_data()[0] = 2; @@ -470,7 +470,7 @@ TYPED_TEST(Array, CopyViewToView) TYPED_TEST(Array, CopyViewToArray) { TypeParam data[] = {1, 2, 3, 4}; - auto view = gko::Array::view(this->exec, 4, data); + auto view = gko::make_array_view(this->exec, 4, data); gko::Array array(this->exec, {5, 4, 2}); array = view; @@ -488,7 +488,7 @@ TYPED_TEST(Array, CopyViewToArray) TYPED_TEST(Array, CopyArrayToView) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); gko::Array array_size2(this->exec, {5, 4}); gko::Array array_size4(this->exec, {5, 4, 2, 1}); @@ -525,9 +525,9 @@ TYPED_TEST(Array, MoveArrayToArray) TYPED_TEST(Array, MoveViewToView) { TypeParam data[] = {1, 2, 3, 4}; - auto view = gko::Array::view(this->exec, 4, data); + auto view = gko::make_array_view(this->exec, 4, data); TypeParam data2[] = {5, 4, 2}; - auto view2 = gko::Array::view(this->exec, 3, data2); + auto view2 = gko::make_array_view(this->exec, 3, data2); view = std::move(view2); @@ -550,7 +550,7 @@ TYPED_TEST(Array, MoveViewToArray) { TypeParam data[] = {1, 2, 3, 4}; gko::Array array(this->exec, {5, 4, 2}); - auto view = gko::Array::view(this->exec, 4, data); + auto view = gko::make_array_view(this->exec, 4, data); array = std::move(view); @@ -572,7 +572,7 @@ TYPED_TEST(Array, MoveViewToArray) TYPED_TEST(Array, MoveArrayToView) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); gko::Array array_size2(this->exec, {5, 4}); gko::Array array_size4(this->exec, {5, 4, 2, 1}); auto size2_ptr = array_size2.get_data(); diff --git a/core/test/matrix/coo.cpp b/core/test/matrix/coo.cpp index 18438a6dd68..4cc17b145b9 100644 --- a/core/test/matrix/coo.cpp +++ b/core/test/matrix/coo.cpp @@ -142,9 +142,9 @@ TYPED_TEST(Coo, CanBeCreatedFromExistingData) auto mtx = gko::matrix::Coo::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 4, values), - gko::Array::view(this->exec, 4, col_idxs), - gko::Array::view(this->exec, 4, row_idxs)); + gko::make_array_view(this->exec, 4, values), + gko::make_array_view(this->exec, 4, col_idxs), + gko::make_array_view(this->exec, 4, row_idxs)); ASSERT_EQ(mtx->get_const_values(), values); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); diff --git a/core/test/matrix/csr.cpp b/core/test/matrix/csr.cpp index 166b1adb86b..85163b95e01 100644 --- a/core/test/matrix/csr.cpp +++ b/core/test/matrix/csr.cpp @@ -147,9 +147,9 @@ TYPED_TEST(Csr, CanBeCreatedFromExistingData) auto mtx = gko::matrix::Csr::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 4, values), - gko::Array::view(this->exec, 4, col_idxs), - gko::Array::view(this->exec, 4, row_ptrs), + gko::make_array_view(this->exec, 4, values), + gko::make_array_view(this->exec, 4, col_idxs), + gko::make_array_view(this->exec, 4, row_ptrs), std::make_shared(2)); ASSERT_EQ(mtx->get_num_srow_elements(), 1); diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp index 2c670fe8862..cbb94a5c605 100644 --- a/core/test/matrix/dense.cpp +++ b/core/test/matrix/dense.cpp @@ -60,8 +60,7 @@ class Dense : public ::testing::Test { static void assert_equal_to_original_mtx(gko::matrix::Dense* m) { ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_stride(), 4); - ASSERT_EQ(m->get_num_stored_elements(), 2 * 4); + ASSERT_EQ(m->get_num_stored_elements(), 2 * m->get_stride()); EXPECT_EQ(m->at(0, 0), value_type{1.0}); EXPECT_EQ(m->at(0, 1), value_type{2.0}); EXPECT_EQ(m->at(0, 2), value_type{3.0}); @@ -131,7 +130,7 @@ TYPED_TEST(Dense, CanBeConstructedFromExistingData) auto m = gko::matrix::Dense::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 9, data), 3); + gko::make_array_view(this->exec, 9, data), 3); ASSERT_EQ(m->get_const_values(), data); ASSERT_EQ(m->at(2, 1), value_type{6.0}); @@ -172,6 +171,7 @@ TYPED_TEST(Dense, CreateWithSameConfigKeepsStride) TYPED_TEST(Dense, KnowsItsSizeAndValues) { this->assert_equal_to_original_mtx(this->mtx.get()); + ASSERT_EQ(this->mtx->get_stride(), 4); } @@ -241,6 +241,8 @@ TYPED_TEST(Dense, CanBeCopied) this->assert_equal_to_original_mtx(this->mtx.get()); this->mtx->at(0) = 7; this->assert_equal_to_original_mtx(mtx_copy.get()); + ASSERT_EQ(this->mtx->get_stride(), 4); + ASSERT_EQ(mtx_copy->get_stride(), 3); } @@ -249,14 +251,15 @@ TYPED_TEST(Dense, CanBeMoved) auto mtx_copy = gko::matrix::Dense::create(this->exec); mtx_copy->copy_from(std::move(this->mtx)); this->assert_equal_to_original_mtx(mtx_copy.get()); + ASSERT_EQ(mtx_copy->get_stride(), 4); } TYPED_TEST(Dense, CanBeCloned) { auto mtx_clone = this->mtx->clone(); - this->assert_equal_to_original_mtx( - dynamic_castmtx.get())>(mtx_clone.get())); + this->assert_equal_to_original_mtx(mtx_clone.get()); + ASSERT_EQ(mtx_clone->get_stride(), 3); } diff --git a/core/test/matrix/diagonal.cpp b/core/test/matrix/diagonal.cpp index a9fca07aa37..bf8bcc221bf 100644 --- a/core/test/matrix/diagonal.cpp +++ b/core/test/matrix/diagonal.cpp @@ -108,7 +108,7 @@ TYPED_TEST(Diagonal, CanBeCreatedFromExistingData) value_type values[] = {1.0, 2.0, 3.0}; auto diag = gko::matrix::Diagonal::create( - this->exec, 3, gko::Array::view(this->exec, 3, values)); + this->exec, 3, gko::make_array_view(this->exec, 3, values)); ASSERT_EQ(diag->get_const_values(), values); } diff --git a/core/test/matrix/ell.cpp b/core/test/matrix/ell.cpp index e7c9fd622e0..d792e00d302 100644 --- a/core/test/matrix/ell.cpp +++ b/core/test/matrix/ell.cpp @@ -146,8 +146,8 @@ TYPED_TEST(Ell, CanBeCreatedFromExistingData) auto mtx = gko::matrix::Ell::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 8, values), - gko::Array::view(this->exec, 8, col_idxs), 2, 4); + gko::make_array_view(this->exec, 8, values), + gko::make_array_view(this->exec, 8, col_idxs), 2, 4); ASSERT_EQ(mtx->get_const_values(), values); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); diff --git a/core/test/matrix/fbcsr.cpp b/core/test/matrix/fbcsr.cpp index eddf6d4b753..4dc15755f48 100644 --- a/core/test/matrix/fbcsr.cpp +++ b/core/test/matrix/fbcsr.cpp @@ -398,9 +398,9 @@ TYPED_TEST(Fbcsr, CanBeCreatedFromExistingData) auto mtx = gko::matrix::Fbcsr::create( this->exec, gko::dim<2>{nbrows * bs, nbcols * bs}, bs, - gko::Array::view(this->exec, bnnz * bs * bs, values), - gko::Array::view(this->exec, bnnz, col_idxs), - gko::Array::view(this->exec, nbrows + 1, row_ptrs)); + gko::make_array_view(this->exec, bnnz * bs * bs, values), + gko::make_array_view(this->exec, bnnz, col_idxs), + gko::make_array_view(this->exec, nbrows + 1, row_ptrs)); ASSERT_EQ(mtx->get_const_values(), values); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp index a5d21ba1782..5214f439ac9 100644 --- a/core/test/matrix/permutation.cpp +++ b/core/test/matrix/permutation.cpp @@ -136,7 +136,7 @@ TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingData) auto m = gko::matrix::Permutation::create( this->exec, gko::dim<2>{3, 5}, - gko::Array::view(this->exec, 3, data)); + gko::make_array_view(this->exec, 3, data)); ASSERT_EQ(m->get_const_permutation(), data); } @@ -191,7 +191,7 @@ TYPED_TEST(Permutation, PermutationThrowsforWrongRowPermDimensions) ASSERT_THROW(gko::matrix::Permutation::create( this->exec, gko::dim<2>{4, 2}, - gko::Array::view(this->exec, 3, data)), + gko::make_array_view(this->exec, 3, data)), gko::ValueMismatch); } @@ -203,7 +203,7 @@ TYPED_TEST(Permutation, SettingMaskDoesNotModifyData) auto m = gko::matrix::Permutation::create( this->exec, gko::dim<2>{3, 5}, - gko::Array::view(this->exec, 3, data)); + gko::make_array_view(this->exec, 3, data)); auto mask = m->get_permute_mask(); ASSERT_EQ(m->get_const_permutation(), data); @@ -225,7 +225,7 @@ TYPED_TEST(Permutation, PermutationThrowsforWrongColPermDimensions) ASSERT_THROW(gko::matrix::Permutation::create( this->exec, gko::dim<2>{3, 4}, - gko::Array::view(this->exec, 3, data), + gko::make_array_view(this->exec, 3, data), gko::matrix::column_permute), gko::ValueMismatch); } diff --git a/core/test/matrix/row_gatherer.cpp b/core/test/matrix/row_gatherer.cpp index 51096a94893..68c3142ca0c 100644 --- a/core/test/matrix/row_gatherer.cpp +++ b/core/test/matrix/row_gatherer.cpp @@ -127,7 +127,7 @@ TYPED_TEST(RowGatherer, RowGathererCanBeConstructedFromExistingData) auto m = gko::matrix::RowGatherer::create( this->exec, gko::dim<2>{3, 5}, - gko::Array::view(this->exec, 3, data)); + gko::make_array_view(this->exec, 3, data)); ASSERT_EQ(m->get_const_row_idxs(), data); } @@ -140,7 +140,7 @@ TYPED_TEST(RowGatherer, RowGathererThrowsforWrongRowPermDimensions) ASSERT_THROW(gko::matrix::RowGatherer::create( this->exec, gko::dim<2>{4, 2}, - gko::Array::view(this->exec, 3, data)), + gko::make_array_view(this->exec, 3, data)), gko::ValueMismatch); } diff --git a/core/test/matrix/sparsity_csr.cpp b/core/test/matrix/sparsity_csr.cpp index ec59661dfe0..d14a8897d19 100644 --- a/core/test/matrix/sparsity_csr.cpp +++ b/core/test/matrix/sparsity_csr.cpp @@ -154,8 +154,8 @@ TYPED_TEST(SparsityCsr, CanBeCreatedFromExistingData) auto mtx = gko::matrix::SparsityCsr::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 4, col_idxs), - gko::Array::view(this->exec, 4, row_ptrs), 2.0); + gko::make_array_view(this->exec, 4, col_idxs), + gko::make_array_view(this->exec, 4, row_ptrs), 2.0); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); ASSERT_EQ(mtx->get_const_row_ptrs(), row_ptrs); diff --git a/core/test/utils/assertions_test.cpp b/core/test/utils/assertions_test.cpp index 7c8609573f0..5f8d454fac7 100644 --- a/core/test/utils/assertions_test.cpp +++ b/core/test/utils/assertions_test.cpp @@ -54,7 +54,7 @@ class MatricesNear : public ::testing::Test { template gko::Array make_view(std::array& array) { - return gko::Array::view(exec, size, array.data()); + return gko::make_array_view(exec, size, array.data()); } MatricesNear() diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernel.cu index be86588a909..e79042b8eec 100644 --- a/cuda/factorization/par_ilut_approx_filter_kernel.cu +++ b/cuda/factorization/par_ilut_approx_filter_kernel.cu @@ -161,9 +161,9 @@ void threshold_filter_approx(syn::value_list, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } if (num_blocks > 0) { diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernel.cu index 6dd83c41835..0451582187f 100644 --- a/cuda/factorization/par_ilut_filter_kernel.cu +++ b/cuda/factorization/par_ilut_filter_kernel.cu @@ -117,9 +117,9 @@ void threshold_filter(syn::value_list, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } if (num_blocks > 0) { diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernel.cu index 6655db78296..92bd9e28ab8 100644 --- a/cuda/factorization/par_ilut_select_kernel.cu +++ b/cuda/factorization/par_ilut_select_kernel.cu @@ -159,7 +159,7 @@ void threshold_select(std::shared_ptr exec, if (step > 5) { Array cpu_out_array{ exec->get_master(), - Array::view(exec, bucket.size, tmp_out)}; + make_array_view(exec, bucket.size, tmp_out)}; auto begin = cpu_out_array.get_data(); auto end = begin + bucket.size; auto middle = begin + rank; diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index d60323c40c0..2b83c4fdc22 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -36,6 +36,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include +#include +#include +#include +#include +#include + + #include #include #include @@ -47,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/dense_kernels.hpp" @@ -97,6 +107,7 @@ using spgeam_kernels = syn::value_list; +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/csr_kernels.hpp.inc" @@ -304,8 +315,6 @@ void load_balance_spmv(std::shared_ptr exec, as_cuda_type(c->get_stride())); } } - } else { - GKO_NOT_SUPPORTED(nwarps); } } @@ -897,16 +906,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -template -void convert_to_fbcsr(std::shared_ptr exec, - const matrix::Csr* source, int bs, - Array& row_ptrs, Array& col_idxs, - Array& values) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); - - template void transpose(std::shared_ptr exec, const matrix::Csr* orig, @@ -1006,7 +1005,7 @@ void conj_transpose(std::shared_ptr exec, idxBase, alg, buffer); #endif if (grid_size > 0) { - conjugate_kernel<<>>( + kernel::conjugate<<>>( trans->get_num_stored_elements(), as_cuda_type(trans->get_values())); } @@ -1028,7 +1027,7 @@ void inv_symm_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - inv_row_ptr_permute_kernel<<>>( + kernel::inv_row_ptr_permute<<>>( num_rows, perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs()); } @@ -1036,7 +1035,7 @@ void inv_symm_permute(std::shared_ptr exec, auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { - inv_symm_permute_kernel + kernel::inv_symm_permute <<>>( num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), @@ -1059,7 +1058,7 @@ void row_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - row_ptr_permute_kernel<<>>( + kernel::row_ptr_permute<<>>( num_rows, perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); } @@ -1067,7 +1066,7 @@ void row_permute(std::shared_ptr exec, auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { - row_permute_kernel + kernel::row_permute <<>>( num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), @@ -1090,7 +1089,7 @@ void inverse_row_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - inv_row_ptr_permute_kernel<<>>( + kernel::inv_row_ptr_permute<<>>( num_rows, perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); } @@ -1098,7 +1097,7 @@ void inverse_row_permute(std::shared_ptr exec, auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { - inv_row_permute_kernel + kernel::inv_row_permute <<>>( num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), @@ -1247,7 +1246,7 @@ void is_sorted_by_column_index( const matrix::Csr* to_check, bool* is_sorted) { *is_sorted = true; - auto cpu_array = Array::view(exec->get_master(), 1, is_sorted); + auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); auto gpu_array = Array{exec, cpu_array}; auto block_size = default_block_size; auto num_rows = static_cast(to_check->get_size()[0]); diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 9537d6595f0..bc55a60ac93 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -291,7 +292,18 @@ template void convert_to_fbcsr(std::shared_ptr exec, const matrix::Dense* source, matrix::Fbcsr* result) - GKO_NOT_IMPLEMENTED; +{ + const auto num_block_rows = result->get_num_block_rows(); + if (num_block_rows > 0) { + const auto num_blocks = + ceildiv(num_block_rows, default_block_size / config::warp_size); + kernel::convert_to_fbcsr<<>>( + num_block_rows, result->get_num_block_cols(), source->get_stride(), + result->get_block_size(), as_cuda_type(source->get_const_values()), + result->get_const_row_ptrs(), result->get_col_idxs(), + as_cuda_type(result->get_values())); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); @@ -300,8 +312,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void count_nonzero_blocks_per_row(std::shared_ptr exec, const matrix::Dense* source, - int bs, - IndexType* result) GKO_NOT_IMPLEMENTED; + int bs, IndexType* result) +{ + const auto num_block_rows = source->get_size()[0] / bs; + const auto num_block_cols = source->get_size()[1] / bs; + if (num_block_rows > 0) { + const auto num_blocks = + ceildiv(num_block_rows, default_block_size / config::warp_size); + kernel:: + count_nonzero_blocks_per_row<<>>( + num_block_rows, num_block_cols, source->get_stride(), bs, + as_cuda_type(source->get_const_values()), result); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); @@ -407,9 +430,9 @@ void transpose(std::shared_ptr exec, auto beta = zero(); cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, orig->get_size()[0], orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, - static_cast(nullptr), trans->get_size()[1], - trans->get_values(), trans->get_stride()); + orig->get_stride(), &beta, trans->get_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; @@ -432,9 +455,9 @@ void conj_transpose(std::shared_ptr exec, auto beta = zero(); cublas::geam(handle, CUBLAS_OP_C, CUBLAS_OP_N, orig->get_size()[0], orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, - static_cast(nullptr), trans->get_size()[1], - trans->get_values(), trans->get_stride()); + orig->get_stride(), &beta, trans->get_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index c906bb7fd1a..58c59af2cc2 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -242,9 +242,7 @@ void spmv(std::shared_ptr exec, */ const int info = (!atomic) * num_thread_per_worker; if (atomic) { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), - zero()); + dense::fill(exec, c, zero()); } select_abstract_spmv( compiled_kernels(), diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index c9c4782d262..bec6ce29807 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -57,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" @@ -70,6 +71,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/merging.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" namespace gko { @@ -77,20 +79,6 @@ namespace kernels { namespace cuda { -namespace csr_reuse { - - -constexpr int warps_in_block = 4; -constexpr int spmv_block_size = warps_in_block * config::warp_size; -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/matrix/csr_kernels.hpp.inc" - - -} // namespace csr_reuse - - /** * @brief The fixed-size block compressed sparse row matrix format namespace. * @@ -102,7 +90,7 @@ namespace fbcsr { constexpr int default_block_size{512}; -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc" @@ -114,6 +102,9 @@ void dense_transpose(std::shared_ptr exec, const size_type orig_stride, const ValueType* const orig, const size_type trans_stride, ValueType* const trans) { + if (nrows == 0) { + return; + } if (cublas::is_supported::value) { auto handle = exec->get_cublas_handle(); { @@ -121,8 +112,7 @@ void dense_transpose(std::shared_ptr exec, auto alpha = one(); auto beta = zero(); cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, nrows, ncols, &alpha, - orig, orig_stride, &beta, - static_cast(nullptr), nrows, trans, + orig, orig_stride, &beta, trans, trans_stride, trans, trans_stride); } } else { @@ -139,6 +129,15 @@ void spmv(std::shared_ptr exec, const matrix::Dense* const b, matrix::Dense* const c) { + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { + // empty input: fill output with zero + dense::fill(exec, c, zero()); + return; + } if (cusparse::is_supported::value) { auto handle = exec->get_cusparse_handle(); cusparse::pointer_mode_guard pm_guard(handle); @@ -153,21 +152,24 @@ void spmv(std::shared_ptr exec, const IndexType nb = a->get_num_block_cols(); const auto nnzb = static_cast(a->get_num_stored_blocks()); const auto nrhs = static_cast(b->get_size()[1]); - assert(nrhs == c->get_size()[1]); - if (nrhs == 1) { + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), &beta, c->get_values()); } else { - auto trans_c = - Array(exec, c->get_size()[0] * c->get_size()[1]); + const auto trans_stride = nrows; + auto trans_c = Array(exec, nrows * nrhs); cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), nrhs, &beta, - trans_c.get_data(), mb * bs); - dense_transpose(exec, nrhs, mb * bs, mb * bs, trans_c.get_data(), - c->get_stride(), c->get_values()); + b->get_const_values(), in_stride, &beta, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); } cusparse::destroy(descr); } else { @@ -186,6 +188,15 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* const beta, matrix::Dense* const c) { + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { + // empty input: scale output + dense::scale(exec, beta, c); + return; + } if (cusparse::is_supported::value) { auto handle = exec->get_cusparse_handle(); const auto alphp = alpha->get_const_values(); @@ -199,23 +210,26 @@ void advanced_spmv(std::shared_ptr exec, const IndexType nb = a->get_num_block_cols(); const auto nnzb = static_cast(a->get_num_stored_blocks()); const auto nrhs = static_cast(b->get_size()[1]); - assert(nrhs == c->get_size()[1]); - if (nrhs == 1) { + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), betap, c->get_values()); } else { - auto trans_c = - Array(exec, c->get_size()[0] * c->get_size()[1]); - dense_transpose(exec, mb * bs, nrhs, c->get_stride(), - c->get_values(), mb * bs, trans_c.get_data()); + const auto trans_stride = nrows; + auto trans_c = Array(exec, nrows * nrhs); + dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), + trans_stride, trans_c.get_data()); cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), nrhs, betap, - trans_c.get_data(), mb * bs); - dense_transpose(exec, nrhs, mb * bs, mb * bs, trans_c.get_data(), - c->get_stride(), c->get_values()); + b->get_const_values(), in_stride, betap, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); } cusparse::destroy(descr); } else { @@ -230,7 +244,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void fill_in_dense(std::shared_ptr exec, const matrix::Fbcsr* source, - matrix::Dense* result) GKO_NOT_IMPLEMENTED; + matrix::Dense* result) +{ + constexpr auto warps_per_block = default_block_size / config::warp_size; + const auto num_blocks = + ceildiv(source->get_num_block_rows(), warps_per_block); + if (num_blocks > 0) { + kernel::fill_in_dense<<>>( + source->get_const_row_ptrs(), source->get_const_col_idxs(), + as_cuda_type(source->get_const_values()), + as_cuda_type(result->get_values()), result->get_stride(), + source->get_num_block_rows(), source->get_block_size()); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); @@ -329,7 +355,7 @@ void conj_transpose(std::shared_ptr exec, ceildiv(trans->get_num_stored_elements(), default_block_size); transpose(exec, orig, trans); if (grid_size > 0) { - csr_reuse::conjugate_kernel<<>>( + kernel::conjugate<<>>( trans->get_num_stored_elements(), as_cuda_type(trans->get_values())); } @@ -348,14 +374,14 @@ void is_sorted_by_column_index( *is_sorted = true; auto gpu_array = Array(exec, 1); // need to initialize the GPU value to true - exec->copy_from(exec->get_master().get(), 1, is_sorted, - gpu_array.get_data()); + exec->copy_from(exec->get_master().get(), 1, is_sorted, + gpu_array.get_data()); auto block_size = default_block_size; const auto num_brows = static_cast(to_check->get_num_block_rows()); const auto num_blocks = ceildiv(num_brows, block_size); if (num_blocks > 0) { - csr_reuse::kernel::check_unsorted<<>>( + kernel::check_unsorted<<>>( to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), num_brows, gpu_array.get_data()); } diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index f108eacec05..fb8066dbb61 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -676,16 +676,27 @@ void abstract_classical_spmv(dim3 grid, dim3 block, const size_type b_stride, ValueType* c, const size_type c_stride) { - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - subgroup_size)]] { - abstract_classical_spmv(num_rows, val, col_idxs, - row_ptrs, b, b_stride, c, - c_stride, item_ct1); - }); - }); + if (subgroup_size > 1) { + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, b_stride, c, + c_stride, item_ct1); + }); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + abstract_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, + b_stride, c, c_stride, item_ct1); + }); + }); + } } @@ -718,14 +729,27 @@ void abstract_classical_spmv(dim3 grid, dim3 block, const size_type b_stride, const ValueType* beta, ValueType* c, const size_type c_stride) { - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - abstract_classical_spmv( - num_rows, alpha, val, col_idxs, row_ptrs, b, - b_stride, beta, c, c_stride, item_ct1); - }); - }); + if (subgroup_size > 1) { + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, b_stride, + beta, c, c_stride, item_ct1); + }); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, + b, b_stride, beta, c, c_stride, item_ct1); + }); + }); + } } @@ -1125,9 +1149,15 @@ void spmv(std::shared_ptr exec, { if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do - } else if (a->get_strategy()->get_name() == "load_balance") { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), zero()); + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_elements() == 0) { + // empty input: zero output + dense::fill(exec, c, zero()); + return; + } + if (a->get_strategy()->get_name() == "load_balance") { + dense::fill(exec, c, zero()); const IndexType nwarps = a->get_num_srow_elements(); if (nwarps > 0) { const dim3 csr_block(config::warp_size, warps_in_block, 1); @@ -1139,8 +1169,6 @@ void spmv(std::shared_ptr exec, a->get_const_col_idxs(), a->get_const_row_ptrs(), a->get_const_srow(), b->get_const_values(), b->get_stride(), c->get_values(), c->get_stride()); - } else { - GKO_NOT_SUPPORTED(nwarps); } } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = @@ -1219,7 +1247,14 @@ void advanced_spmv(std::shared_ptr exec, { if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do - } else if (a->get_strategy()->get_name() == "load_balance") { + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_elements() == 0) { + // empty input: scale output + dense::scale(exec, beta, c); + return; + } + if (a->get_strategy()->get_name() == "load_balance") { dense::scale(exec, beta, c); const IndexType nwarps = a->get_num_srow_elements(); @@ -1235,8 +1270,6 @@ void advanced_spmv(std::shared_ptr exec, a->get_const_col_idxs(), a->get_const_row_ptrs(), a->get_const_srow(), b->get_const_values(), b->get_stride(), c->get_values(), c->get_stride()); - } else { - GKO_NOT_SUPPORTED(nwarps); } } else if (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse") { diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index fbd1063bf13..8be02fb7988 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -429,9 +429,7 @@ void spmv(std::shared_ptr exec, */ const int info = (!atomic) * num_thread_per_worker; if (atomic) { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), - zero()); + dense::fill(exec, c, zero()); } select_abstract_spmv( compiled_kernels(), diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp new file mode 100644 index 00000000000..e92c329fba1 --- /dev/null +++ b/hip/base/hipsparse_block_bindings.hip.hpp @@ -0,0 +1,164 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ +#define GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ + + +#include + + +#include + + +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/types.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The HIPSPARSE namespace. + * + * @ingroup hipsparse + */ +namespace hipsparse { + + +/// Default storage layout within each small dense block +constexpr hipsparseDirection_t blockDir = HIPSPARSE_DIRECTION_COLUMN; + + +#define GKO_BIND_HIPSPARSE32_BSRMV(ValueType, HipsparseName) \ + inline void bsrmv(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + int32 mb, int32 nb, int32 nnzb, const ValueType* alpha, \ + const hipsparseMatDescr_t descrA, const ValueType* valA, \ + const int32* rowPtrA, const int32* colIndA, \ + int block_size, const ValueType* x, \ + const ValueType* beta, ValueType* y) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, blockDir, transA, mb, nb, nnzb, as_hiplibs_type(alpha), \ + descrA, as_hiplibs_type(valA), rowPtrA, colIndA, block_size, \ + as_hiplibs_type(x), as_hiplibs_type(beta), as_hiplibs_type(y))); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE64_BSRMV(ValueType, HipsparseName) \ + inline void bsrmv(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + int64 mb, int64 nb, int64 nnzb, const ValueType* alpha, \ + const hipsparseMatDescr_t descrA, const ValueType* valA, \ + const int64* rowPtrA, const int64* colIndA, \ + int block_size, const ValueType* x, \ + const ValueType* beta, ValueType* y) \ + GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_BSRMV(float, hipsparseSbsrmv); +GKO_BIND_HIPSPARSE32_BSRMV(double, hipsparseDbsrmv); +GKO_BIND_HIPSPARSE32_BSRMV(std::complex, hipsparseCbsrmv); +GKO_BIND_HIPSPARSE32_BSRMV(std::complex, hipsparseZbsrmv); +GKO_BIND_HIPSPARSE64_BSRMV(float, hipsparseSbsrmv); +GKO_BIND_HIPSPARSE64_BSRMV(double, hipsparseDbsrmv); +GKO_BIND_HIPSPARSE64_BSRMV(std::complex, hipsparseCbsrmv); +GKO_BIND_HIPSPARSE64_BSRMV(std::complex, hipsparseZbsrmv); +template +GKO_BIND_HIPSPARSE32_BSRMV(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE64_BSRMV(ValueType, detail::not_implemented); + + +#undef GKO_BIND_HIPSPARSE32_BSRMV +#undef GKO_BIND_HIPSPARSE64_BSRMV + + +#define GKO_BIND_HIPSPARSE32_BSRMM(ValueType, HipsparseName) \ + inline void bsrmm(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + hipsparseOperation_t transB, int32 mb, int32 n, \ + int32 kb, int32 nnzb, const ValueType* alpha, \ + const hipsparseMatDescr_t descrA, const ValueType* valA, \ + const int32* rowPtrA, const int32* colIndA, \ + int block_size, const ValueType* B, int32 ldb, \ + const ValueType* beta, ValueType* C, int32 ldc) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, blockDir, transA, transB, mb, n, kb, nnzb, \ + as_hiplibs_type(alpha), descrA, as_hiplibs_type(valA), rowPtrA, \ + colIndA, block_size, as_hiplibs_type(B), ldb, \ + as_hiplibs_type(beta), as_hiplibs_type(C), ldc)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE64_BSRMM(ValueType, HipsparseName) \ + inline void bsrmm( \ + hipsparseHandle_t handle, hipsparseOperation_t transA, \ + hipsparseOperation_t transB, int64 mb, int64 n, int64 kb, int64 nnzb, \ + const ValueType* alpha, const hipsparseMatDescr_t descrA, \ + const ValueType* valA, const int64* rowPtrA, const int64* colIndA, \ + int block_size, const ValueType* B, int64 ldb, const ValueType* beta, \ + ValueType* C, int64 ldc) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_BSRMM(float, hipsparseSbsrmm); +GKO_BIND_HIPSPARSE32_BSRMM(double, hipsparseDbsrmm); +GKO_BIND_HIPSPARSE32_BSRMM(std::complex, hipsparseCbsrmm); +GKO_BIND_HIPSPARSE32_BSRMM(std::complex, hipsparseZbsrmm); +GKO_BIND_HIPSPARSE64_BSRMM(float, hipsparseSbsrmm); +GKO_BIND_HIPSPARSE64_BSRMM(double, hipsparseDbsrmm); +GKO_BIND_HIPSPARSE64_BSRMM(std::complex, hipsparseCbsrmm); +GKO_BIND_HIPSPARSE64_BSRMM(std::complex, hipsparseZbsrmm); +template +GKO_BIND_HIPSPARSE32_BSRMM(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE64_BSRMM(ValueType, detail::not_implemented); + + +#undef GKO_BIND_HIPSPARSE32_BSRMM +#undef GKO_BIND_HIPSPARSE64_BSRMM + + +} // namespace hipsparse +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp index d484af93d2a..f86bed4fa76 100644 --- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp @@ -165,9 +165,9 @@ void threshold_filter_approx(syn::value_list, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } if (num_blocks > 0) { diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernel.hip.cpp index cf691f58f8b..7b335c08175 100644 --- a/hip/factorization/par_ilut_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp @@ -120,9 +120,9 @@ void threshold_filter(syn::value_list, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } if (num_blocks > 0) { diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernel.hip.cpp index 8130a76cb18..7e38a6c67a0 100644 --- a/hip/factorization/par_ilut_select_kernel.hip.cpp +++ b/hip/factorization/par_ilut_select_kernel.hip.cpp @@ -163,7 +163,7 @@ void threshold_select(std::shared_ptr exec, if (step > 5) { Array cpu_out_array{ exec->get_master(), - Array::view(exec, bucket.size, tmp_out)}; + make_array_view(exec, bucket.size, tmp_out)}; auto begin = cpu_out_array.get_data(); auto end = begin + bucket.size; auto middle = begin + rank; diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index a9fcb49e8e6..5bac002f63c 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -37,6 +37,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include +#include +#include +#include +#include +#include #include @@ -50,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/dense_kernels.hpp" @@ -99,6 +107,7 @@ using spgeam_kernels = syn::value_list; +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/csr_kernels.hpp.inc" @@ -280,8 +289,7 @@ void spmv(std::shared_ptr exec, if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do } else if (a->get_strategy()->get_name() == "load_balance") { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), zero()); + dense::fill(exec, c, zero()); const IndexType nwarps = a->get_num_srow_elements(); if (nwarps > 0) { const dim3 csr_block(config::warp_size, warps_in_block, 1); @@ -296,8 +304,6 @@ void spmv(std::shared_ptr exec, as_hip_type(b->get_const_values()), as_hip_type(b->get_stride()), as_hip_type(c->get_values()), as_hip_type(c->get_stride())); - } else { - GKO_NOT_SUPPORTED(nwarps); } } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = @@ -392,8 +398,6 @@ void advanced_spmv(std::shared_ptr exec, as_hip_type(b->get_const_values()), as_hip_type(b->get_stride()), as_hip_type(c->get_values()), as_hip_type(c->get_stride())); - } else { - GKO_NOT_SUPPORTED(nwarps); } } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = @@ -730,16 +734,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -template -void convert_to_fbcsr(std::shared_ptr exec, - const matrix::Csr* source, int bs, - Array& row_ptrs, Array& col_idxs, - Array& values) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); - - template void transpose(std::shared_ptr exec, const matrix::Csr* orig, @@ -790,7 +784,7 @@ void conj_transpose(std::shared_ptr exec, trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase); if (grid_size > 0) { - hipLaunchKernelGGL(conjugate_kernel, grid_size, block_size, 0, 0, + hipLaunchKernelGGL(kernel::conjugate, grid_size, block_size, 0, 0, trans->get_num_stored_elements(), as_hip_type(trans->get_values())); } @@ -812,7 +806,7 @@ void inv_symm_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(inv_row_ptr_permute_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::inv_row_ptr_permute), count_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs()); @@ -822,7 +816,7 @@ void inv_symm_permute(std::shared_ptr exec, ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(inv_symm_permute_kernel), + HIP_KERNEL_NAME(kernel::inv_symm_permute), copy_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), as_hip_type(orig->get_const_values()), permuted->get_row_ptrs(), @@ -842,7 +836,7 @@ void row_permute(std::shared_ptr exec, const IndexType* perm, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(row_ptr_permute_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::row_ptr_permute), count_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); @@ -852,7 +846,7 @@ void row_permute(std::shared_ptr exec, const IndexType* perm, ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(row_permute_kernel), + HIP_KERNEL_NAME(kernel::row_permute), copy_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), as_hip_type(orig->get_const_values()), row_permuted->get_row_ptrs(), @@ -874,7 +868,7 @@ void inverse_row_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(inv_row_ptr_permute_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::inv_row_ptr_permute), count_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); @@ -884,7 +878,7 @@ void inverse_row_permute(std::shared_ptr exec, ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(inv_row_permute_kernel), + HIP_KERNEL_NAME(kernel::inv_row_permute), copy_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), as_hip_type(orig->get_const_values()), row_permuted->get_row_ptrs(), @@ -1027,7 +1021,7 @@ void is_sorted_by_column_index( const matrix::Csr* to_check, bool* is_sorted) { *is_sorted = true; - auto cpu_array = Array::view(exec->get_master(), 1, is_sorted); + auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); auto gpu_array = Array{exec, cpu_array}; auto block_size = default_block_size; auto num_rows = static_cast(to_check->get_size()[0]); diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 6ace089e2ea..4b63be8f905 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -296,7 +297,18 @@ template void convert_to_fbcsr(std::shared_ptr exec, const matrix::Dense* source, matrix::Fbcsr* result) - GKO_NOT_IMPLEMENTED; +{ + const auto num_block_rows = result->get_num_block_rows(); + if (num_block_rows > 0) { + const auto num_blocks = + ceildiv(num_block_rows, default_block_size / config::warp_size); + kernel::convert_to_fbcsr<<>>( + num_block_rows, result->get_num_block_cols(), source->get_stride(), + result->get_block_size(), as_hip_type(source->get_const_values()), + result->get_const_row_ptrs(), result->get_col_idxs(), + as_hip_type(result->get_values())); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); @@ -305,8 +317,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void count_nonzero_blocks_per_row(std::shared_ptr exec, const matrix::Dense* source, - int bs, - IndexType* result) GKO_NOT_IMPLEMENTED; + int bs, IndexType* result) +{ + const auto num_block_rows = source->get_size()[0] / bs; + const auto num_block_cols = source->get_size()[1] / bs; + if (num_block_rows > 0) { + const auto num_blocks = + ceildiv(num_block_rows, default_block_size / config::warp_size); + kernel:: + count_nonzero_blocks_per_row<<>>( + num_block_rows, num_block_cols, source->get_stride(), bs, + as_hip_type(source->get_const_values()), result); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); @@ -417,7 +440,7 @@ void transpose(std::shared_ptr exec, hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, orig->get_size()[0], orig->get_size()[1], &alpha, orig->get_const_values(), orig->get_stride(), &beta, - orig->get_const_values(), trans->get_size()[1], + trans->get_const_values(), trans->get_stride(), trans->get_values(), trans->get_stride()); } } else { @@ -442,7 +465,7 @@ void conj_transpose(std::shared_ptr exec, hipblas::geam(handle, HIPBLAS_OP_C, HIPBLAS_OP_N, orig->get_size()[0], orig->get_size()[1], &alpha, orig->get_const_values(), orig->get_stride(), &beta, - orig->get_const_values(), trans->get_size()[1], + trans->get_values(), trans->get_stride(), trans->get_values(), trans->get_stride()); } } else { diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 2486a666ab0..a56b9b96d13 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -246,9 +246,7 @@ void spmv(std::shared_ptr exec, */ const int info = (!atomic) * num_thread_per_worker; if (atomic) { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), - zero()); + dense::fill(exec, c, zero()); } select_abstract_spmv( compiled_kernels(), diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 33ed4a08e2e..bf707859b02 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -54,15 +54,31 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" +#include "core/base/block_sizes.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" +#include "hip/base/hipblas_bindings.hip.hpp" +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/hipsparse_block_bindings.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/pointer_mode_guard.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/atomic.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/merging.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" namespace gko { namespace kernels { namespace hip { + + /** * @brief The fixed-size block compressed sparse row matrix format namespace. * @@ -74,25 +90,154 @@ namespace fbcsr { constexpr int default_block_size{512}; +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc" +namespace { + + +template +void dense_transpose(std::shared_ptr exec, + const size_type nrows, const size_type ncols, + const size_type orig_stride, const ValueType* const orig, + const size_type trans_stride, ValueType* const trans) +{ + if (nrows == 0) { + return; + } + if (hipblas::is_supported::value) { + auto handle = exec->get_hipblas_handle(); + { + hipblas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, nrows, ncols, + &alpha, orig, orig_stride, &beta, trans, trans_stride, + trans, trans_stride); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + + +} // namespace + + template void spmv(std::shared_ptr exec, - const matrix::Fbcsr* a, - const matrix::Dense* b, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + matrix::Dense* const c) +{ + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { + // empty input: fill output with zero + dense::fill(exec, c, zero()); + return; + } + if (hipsparse::is_supported::value) { + auto handle = exec->get_hipsparse_handle(); + hipsparse::pointer_mode_guard pm_guard(handle); + const auto alpha = one(); + const auto beta = zero(); + auto descr = hipsparse::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { + hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, &alpha, descr, values, row_ptrs, col_idxs, + bs, b->get_const_values(), &beta, c->get_values()); + } else { + const auto trans_stride = nrows; + auto trans_c = Array(exec, nrows * nrhs); + hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + &alpha, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, &beta, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); + } + hipsparse::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Fbcsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; + const matrix::Dense* const alpha, + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + const matrix::Dense* const beta, + matrix::Dense* const c) +{ + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { + // empty input: scale output + dense::scale(exec, beta, c); + return; + } + if (hipsparse::is_supported::value) { + auto handle = exec->get_hipsparse_handle(); + const auto alphp = alpha->get_const_values(); + const auto betap = beta->get_const_values(); + auto descr = hipsparse::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { + hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), betap, c->get_values()); + } else { + const auto trans_stride = nrows; + auto trans_c = Array(exec, nrows * nrhs); + dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), + trans_stride, trans_c.get_data()); + hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, betap, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); + } + hipsparse::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); @@ -101,7 +246,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void fill_in_dense(std::shared_ptr exec, const matrix::Fbcsr* source, - matrix::Dense* result) GKO_NOT_IMPLEMENTED; + matrix::Dense* result) +{ + constexpr auto warps_per_block = default_block_size / config::warp_size; + const auto num_blocks = + ceildiv(source->get_num_block_rows(), warps_per_block); + if (num_blocks > 0) { + kernel::fill_in_dense<<>>( + source->get_const_row_ptrs(), source->get_const_col_idxs(), + as_hip_type(source->get_const_values()), + as_hip_type(result->get_values()), result->get_stride(), + source->get_num_block_rows(), source->get_block_size()); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); @@ -116,10 +273,8 @@ void convert_to_csr(const std::shared_ptr exec, const auto num_blocks = ceildiv(source->get_num_block_rows(), warps_per_block); if (num_blocks > 0) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::convert_to_csr), num_blocks, - default_block_size, 0, 0, source->get_const_row_ptrs(), - source->get_const_col_idxs(), + kernel::convert_to_csr<<>>( + source->get_const_row_ptrs(), source->get_const_col_idxs(), as_hip_type(source->get_const_values()), result->get_row_ptrs(), result->get_col_idxs(), as_hip_type(result->get_values()), source->get_num_block_rows(), source->get_block_size()); @@ -131,9 +286,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void transpose(std::shared_ptr exec, - const matrix::Fbcsr* orig, - matrix::Fbcsr* trans) GKO_NOT_IMPLEMENTED; +void transpose(const std::shared_ptr exec, + const matrix::Fbcsr* const orig, + matrix::Fbcsr* const trans) + GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); @@ -152,8 +308,25 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void is_sorted_by_column_index( std::shared_ptr exec, - const matrix::Fbcsr* to_check, - bool* is_sorted) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const to_check, + bool* const is_sorted) +{ + *is_sorted = true; + auto gpu_array = Array(exec, 1); + // need to initialize the GPU value to true + exec->copy_from(exec->get_master().get(), 1, is_sorted, + gpu_array.get_data()); + auto block_size = default_block_size; + const auto num_brows = + static_cast(to_check->get_num_block_rows()); + const auto num_blocks = ceildiv(num_brows, block_size); + if (num_blocks > 0) { + kernel::check_unsorted<<>>( + to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), + num_brows, gpu_array.get_data()); + } + *is_sorted = exec->copy_val_to_host(gpu_array.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp index 2f95d5a4da7..35ed7985ca2 100644 --- a/include/ginkgo/core/base/array.hpp +++ b/include/ginkgo/core/base/array.hpp @@ -755,6 +755,25 @@ Array make_array_view(std::shared_ptr exec, } +/** + * Helper function to create a const array view deducing the value type. + * + * @param exec the executor on which the array resides + * @param size the number of elements for the array + * @param data the pointer to the array we create a view on. + * + * @tparam ValueType the type of the array elements + * + * @return `Array::const_view(exec, size, data)` + */ +template +detail::ConstArrayView make_const_array_view( + std::shared_ptr exec, size_type size, const ValueType* data) +{ + return Array::const_view(exec, size, data); +} + + namespace detail { diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index bcaba840fdf..20479295ad7 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -855,46 +855,12 @@ class Dense * real with a reinterpret_cast with twice the number of columns and * double the stride. */ - std::unique_ptr>> create_real_view() - { - const auto num_rows = this->get_size()[0]; - const bool complex = is_complex(); - const auto num_cols = - complex ? 2 * this->get_size()[1] : this->get_size()[1]; - const auto stride = - complex ? 2 * this->get_stride() : this->get_stride(); - - return Dense>::create( - this->get_executor(), dim<2>{num_rows, num_cols}, - Array>::view( - this->get_executor(), num_rows * stride, - reinterpret_cast*>( - this->get_values())), - stride); - } + std::unique_ptr create_real_view(); /** * @copydoc create_real_view() */ - std::unique_ptr>> create_real_view() - const - { - const auto num_rows = this->get_size()[0]; - const bool complex = is_complex(); - const auto num_cols = - complex ? 2 * this->get_size()[1] : this->get_size()[1]; - const auto stride = - complex ? 2 * this->get_stride() : this->get_stride(); - - return Dense>::create( - this->get_executor(), dim<2>{num_rows, num_cols}, - Array>::view( - this->get_executor(), num_rows * stride, - const_cast*>( - reinterpret_cast*>( - this->get_const_values()))), - stride); - } + std::unique_ptr create_real_view() const; /** * Creates a constant (immutable) Dense matrix from a constant array. @@ -1102,24 +1068,8 @@ class Dense * instead of create_submatrix(const span, const span, const * size_type). */ - virtual std::unique_ptr create_submatrix_impl(const span& rows, - const span& columns, - const size_type stride) - { - row_major_range range_this{this->get_values(), this->get_size()[0], - this->get_size()[1], this->get_stride()}; - auto range_result = range_this(rows, columns); - // TODO: can result in HUGE padding - which will be copied with the - // vector - return Dense::create( - this->get_executor(), - dim<2>{range_result.length(0), range_result.length(1)}, - Array::view( - this->get_executor(), - range_result.length(0) * range_this.length(1) - columns.begin, - range_result->data), - stride); - } + virtual std::unique_ptr create_submatrix_impl( + const span& rows, const span& columns, const size_type stride); void apply_impl(const LinOp* b, LinOp* x) const override; diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp index 8f0b597686d..90183ec37df 100644 --- a/omp/factorization/par_ilut_kernels.cpp +++ b/omp/factorization/par_ilut_kernels.cpp @@ -136,9 +136,9 @@ void abstract_filter(std::shared_ptr exec, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index ce647f79935..ea01355cffa 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -86,9 +86,10 @@ void spmv(std::shared_ptr exec, #pragma omp parallel for for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) { - for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs; - ++i) { - c->get_values()[i] = zero(); + for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) { + for (IndexType rhs = 0; rhs < nvecs; rhs++) { + c->at(row, rhs) = zero(); + } } for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1]; ++inz) { @@ -130,10 +131,11 @@ void advanced_spmv(std::shared_ptr exec, #pragma omp parallel for for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) { - for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs; - ++i) - c->get_values()[i] *= vbeta; - + for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) { + for (IndexType rhs = 0; rhs < nvecs; rhs++) { + c->at(row, rhs) *= vbeta; + } + } for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1]; ++inz) { for (int ib = 0; ib < bs; ib++) { @@ -164,7 +166,7 @@ void fill_in_matrix_data(std::shared_ptr exec, components::soa_to_aos(exec, data, block_ordered); const auto in_nnz = data.get_num_elems(); auto block_ordered_ptr = block_ordered.get_data(); - std::stable_sort( + std::sort( block_ordered_ptr, block_ordered_ptr + in_nnz, [block_size](auto a, auto b) { return std::make_tuple(a.row / block_size, a.column / block_size) < @@ -248,7 +250,43 @@ template void convert_to_csr(const std::shared_ptr exec, const matrix::Fbcsr* const source, matrix::Csr* const result) - GKO_NOT_IMPLEMENTED; +{ + const auto nbrows = source->get_num_block_rows(); + const auto bs = source->get_block_size(); + const auto block_row_ptrs = source->get_const_row_ptrs(); + const auto block_col_idxs = source->get_const_col_idxs(); + const auto row_ptrs = result->get_row_ptrs(); + const auto col_idxs = result->get_col_idxs(); + const auto vals = result->get_values(); + auto sizes = + gko::to_std_array(block_row_ptrs[nbrows], bs, bs); + const auto block_vals = + acc::range>( + sizes, source->get_const_values()); +#pragma omp parallel for + for (IndexType block_row = 0; block_row < nbrows; block_row++) { + const auto block_row_begin = block_row_ptrs[block_row]; + const auto block_row_end = block_row_ptrs[block_row + 1]; + const auto block_row_size = block_row_end - block_row_begin; + const auto row_size = block_row_size * bs; + const auto row_base = block_row_begin * bs * bs; + for (int local_row = 0; local_row < bs; local_row++) { + const auto row = block_row * bs + local_row; + row_ptrs[row] = row_base + row_size * local_row; + for (auto block = block_row_begin; block < block_row_end; block++) { + const auto block_base = + row_ptrs[row] + bs * (block - block_row_begin); + for (int local_col = 0; local_col < bs; local_col++) { + const auto col = block_col_idxs[block] * bs + local_col; + const auto out_idx = block_base + local_col; + col_idxs[out_idx] = col; + vals[out_idx] = block_vals(block, local_row, local_col); + } + } + } + } + row_ptrs[result->get_size()[0]] = source->get_num_stored_elements(); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp index 56cebc91366..691de096660 100644 --- a/reference/factorization/par_ilut_kernels.cpp +++ b/reference/factorization/par_ilut_kernels.cpp @@ -137,9 +137,9 @@ void abstract_filter(std::shared_ptr exec, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index 9b093a97b68..deca89b976a 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -85,9 +85,10 @@ void spmv(const std::shared_ptr, to_std_array(nbnz, bs, bs), a->get_const_values()}; for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) { - for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs; - ++i) { - c->get_values()[i] = zero(); + for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) { + for (IndexType rhs = 0; rhs < nvecs; rhs++) { + c->at(row, rhs) = zero(); + } } for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1]; ++inz) { @@ -128,9 +129,11 @@ void advanced_spmv(const std::shared_ptr, to_std_array(nbnz, bs, bs), a->get_const_values()}; for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) { - for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs; - ++i) - c->get_values()[i] *= vbeta; + for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) { + for (IndexType rhs = 0; rhs < nvecs; rhs++) { + c->at(row, rhs) *= vbeta; + } + } for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1]; ++inz) { diff --git a/reference/test/components/precision_conversion_kernels.cpp b/reference/test/components/precision_conversion_kernels.cpp index a82b9a2ae5c..4ff17753098 100644 --- a/reference/test/components/precision_conversion_kernels.cpp +++ b/reference/test/components/precision_conversion_kernels.cpp @@ -128,7 +128,7 @@ TEST_F(PrecisionConversion, ConvertsRealFromView) gko::Array tmp{ref}; gko::Array out{ref}; - tmp = gko::Array::view(ref, vals.get_num_elems(), vals.get_data()); + tmp = gko::make_array_view(ref, vals.get_num_elems(), vals.get_data()); out = tmp; GKO_ASSERT_ARRAY_EQ(vals, out); diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp index bdb0f30059d..40773359229 100644 --- a/reference/test/matrix/permutation.cpp +++ b/reference/test/matrix/permutation.cpp @@ -81,8 +81,7 @@ TYPED_TEST(Permutation, AppliesRowPermutationToDense) i_type rdata[] = {1, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, rdata)); + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata)); perm->apply(x.get(), y.get()); // clang-format off @@ -108,8 +107,7 @@ TYPED_TEST(Permutation, AppliesColPermutationToDense) i_type rdata[] = {1, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, rdata), + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata), gko::matrix::column_permute); perm->apply(x.get(), y.get()); @@ -138,11 +136,9 @@ TYPED_TEST(Permutation, AppliesRowAndColPermutationToDense) i_type rdata[] = {1, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, rdata)); + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata)); auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, cdata), + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, cdata), gko::matrix::column_permute); rperm->apply(x.get(), y1.get()); @@ -170,8 +166,7 @@ TYPED_TEST(Permutation, AppliesRowAndColPermutationToDenseWithOneArray) i_type data[] = {1, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, data), + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, data), gko::matrix::row_permute | gko::matrix::column_permute); perm->apply(x.get(), y1.get()); @@ -200,12 +195,10 @@ TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDense) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), gko::matrix::row_permute | gko::matrix::inverse_permute); auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::inverse_permute | gko::matrix::column_permute); rperm->apply(x.get(), y1.get()); @@ -234,8 +227,7 @@ TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDenseWithOneArray) i_type data[] = {1, 2, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, data), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, data), gko::matrix::column_permute | gko::matrix::row_permute | gko::matrix::inverse_permute); @@ -264,8 +256,7 @@ TYPED_TEST(Permutation, AppliesInverseRowPermutationToDense) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), gko::matrix::row_permute | gko::matrix::inverse_permute); rperm->apply(x.get(), y.get()); @@ -293,8 +284,7 @@ TYPED_TEST(Permutation, AppliesInverseColPermutationToDense) i_type cdata[] = {1, 2, 0}; auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::inverse_permute | gko::matrix::column_permute); cperm->apply(x.get(), y.get()); @@ -323,8 +313,7 @@ TYPED_TEST(Permutation, AppliesRowPermutationToCsr) i_type rdata[] = {1, 2, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata)); + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata)); perm->apply(x.get(), y.get()); // clang-format off @@ -352,8 +341,7 @@ TYPED_TEST(Permutation, AppliesColPermutationToCsr) i_type cdata[] = {1, 2, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::column_permute); perm->apply(x.get(), y.get()); @@ -384,11 +372,9 @@ TYPED_TEST(Permutation, AppliesRowAndColPermutationToCsr) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata)); + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata)); auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::column_permute); rperm->apply(x.get(), y1.get()); @@ -417,8 +403,7 @@ TYPED_TEST(Permutation, AppliesInverseRowPermutationToCsr) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), gko::matrix::row_permute | gko::matrix::inverse_permute); rperm->apply(x.get(), y.get()); @@ -446,8 +431,7 @@ TYPED_TEST(Permutation, AppliesInverseColPermutationToCsr) i_type cdata[] = {1, 2, 0}; auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::inverse_permute | gko::matrix::column_permute); cperm->apply(x.get(), y.get()); @@ -477,12 +461,10 @@ TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToCsr) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), gko::matrix::row_permute | gko::matrix::inverse_permute); auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::inverse_permute | gko::matrix::column_permute); rperm->apply(x.get(), y1.get()); diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index c87874776c3..30563438c65 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -81,6 +81,8 @@ struct SimpleMatrixTest { {} static void check_property(const std::unique_ptr&) {} + + static bool supports_strides() { return true; } }; struct DenseWithDefaultStride @@ -200,36 +202,33 @@ struct CsrWithAutomaticalStrategy struct Ell : SimpleMatrixTest> {}; -struct FbcsrBlocksize1 - : SimpleMatrixTest> { +template +struct Fbcsr : SimpleMatrixTest> { static bool preserves_zeros() { return false; } static std::unique_ptr create( std::shared_ptr exec, gko::dim<2> size) { - return matrix_type::create(exec, size, 0, 1); + size[0] = gko::ceildiv(size[0], block_size) * block_size; + size[1] = gko::ceildiv(size[1], block_size) * block_size; + return matrix_type::create(exec, size, 0, block_size); } static void check_property(const std::unique_ptr& mtx) { - ASSERT_EQ(mtx->get_block_size(), 1); + ASSERT_EQ(mtx->get_block_size(), block_size); } -}; - -struct FbcsrBlocksize2 - : SimpleMatrixTest> { - static bool preserves_zeros() { return false; } - static std::unique_ptr create( - std::shared_ptr exec, gko::dim<2> size) + static void modify_data(gko::matrix_data& data) { - return matrix_type::create(exec, size, 0, 2); + data.size[0] = gko::ceildiv(data.size[0], block_size) * block_size; + data.size[1] = gko::ceildiv(data.size[1], block_size) * block_size; } - static void check_property(const std::unique_ptr& mtx) - { - ASSERT_EQ(mtx->get_block_size(), 2); - } +#ifdef GKO_COMPILING_HIP + // Fbcsr support in rocSPARSE is buggy w.r.t. strides + static bool supports_strides() { return false; } +#endif }; @@ -440,12 +439,11 @@ class Matrix : public ::testing::Test { } template - test_pair gen_in_vec(const test_pair& mtx, int nrhs, - int stride) + test_pair gen_in_vec(const test_pair& mtx, int nrhs) { auto size = gko::dim<2>{mtx.ref->get_size()[1], static_cast(nrhs)}; - auto result = VecType::create(ref, size, stride); + auto result = VecType::create(ref, size); result->read(gen_dense_data(size)); return {std::move(result), exec}; @@ -466,12 +464,11 @@ class Matrix : public ::testing::Test { } template - test_pair gen_out_vec(const test_pair& mtx, int nrhs, - int stride) + test_pair gen_out_vec(const test_pair& mtx, int nrhs) { auto size = gko::dim<2>{mtx.ref->get_size()[0], static_cast(nrhs)}; - auto result = VecType::create(ref, size, stride); + auto result = VecType::create(ref, size); result->read(gen_dense_data(size)); return {std::move(result), exec}; @@ -484,9 +481,10 @@ class Matrix : public ::testing::Test { template void forall_matrix_data_scenarios(TestFunction fn) { - auto guarded_fn = [&](auto mtx) { + auto guarded_fn = [&](auto data) { try { - fn(std::move(mtx)); + Config::modify_data(data); + fn(std::move(data)); } catch (std::exception& e) { FAIL() << e.what(); } @@ -507,6 +505,14 @@ class Matrix : public ::testing::Test { SCOPED_TRACE("Zero matrix (200x100)"); guarded_fn(gen_mtx_data(200, 100, 0, 0)); } + { + SCOPED_TRACE("Small Sparse Matrix with variable row nnz (10x10)"); + guarded_fn(gen_mtx_data(10, 10, 1, 5)); + } + { + SCOPED_TRACE("Small Dense Matrix (10x10)"); + guarded_fn(gen_mtx_data(10, 10, 10, 10)); + } { SCOPED_TRACE("Sparse Matrix with some zeros rows (200x100)"); guarded_fn(gen_mtx_data(200, 100, 0, 50)); @@ -567,6 +573,34 @@ class Matrix : public ::testing::Test { }); } + template + void run_strided(const test_pair& mtx, int rhs, int in_stride, + int out_stride, TestFunction fn) + { + // create slightly bigger vectors + auto in_padded = gen_in_vec(mtx, in_stride); + auto out_padded = gen_out_vec(mtx, out_stride); + const auto in_rows = gko::span(0, mtx.ref->get_size()[1]); + const auto out_rows = gko::span(0, mtx.ref->get_size()[0]); + const auto cols = gko::span(0, rhs); + const auto out_pad_cols = gko::span(rhs, out_stride); + // create views of the padding and in/out vectors + auto out_padding = test_pair{ + out_padded.ref->create_submatrix(out_rows, out_pad_cols), + out_padded.dev->create_submatrix(out_rows, out_pad_cols)}; + auto orig_padding = out_padding.ref->clone(); + auto in = + test_pair{in_padded.ref->create_submatrix(in_rows, cols), + in_padded.dev->create_submatrix(in_rows, cols)}; + auto out = test_pair{ + out_padded.ref->create_submatrix(out_rows, cols), + out_padded.dev->create_submatrix(out_rows, cols)}; + fn(std::move(in), std::move(out)); + // check that padding was unmodified + GKO_ASSERT_MTX_NEAR(out_padding.ref, orig_padding, 0.0); + GKO_ASSERT_MTX_NEAR(out_padding.dev, orig_padding, 0.0); + } + template void forall_vector_scenarios(const test_pair& mtx, TestFunction fn) { @@ -579,53 +613,48 @@ class Matrix : public ::testing::Test { }; { SCOPED_TRACE("Multivector with 0 columns"); - guarded_fn(gen_in_vec(mtx, 0, 0), - gen_out_vec(mtx, 0, 0)); + guarded_fn(gen_in_vec(mtx, 0), + gen_out_vec(mtx, 0)); } { SCOPED_TRACE("Single vector"); - guarded_fn(gen_in_vec(mtx, 1, 1), - gen_out_vec(mtx, 1, 1)); + guarded_fn(gen_in_vec(mtx, 1), + gen_out_vec(mtx, 1)); } - { + if (Config::supports_strides()) { SCOPED_TRACE("Single strided vector"); - guarded_fn(gen_in_vec(mtx, 1, 2), - gen_out_vec(mtx, 1, 3)); + run_strided(mtx, 1, 2, 3, guarded_fn); } if (!gko::is_complex()) { // check application of real matrix to complex vector // viewed as interleaved real/imag vector using complex_vec = gko::to_complex; - { + if (Config::supports_strides()) { SCOPED_TRACE("Single strided complex vector"); - guarded_fn(gen_in_vec(mtx, 1, 2), - gen_out_vec(mtx, 1, 3)); + run_strided(mtx, 1, 2, 3, guarded_fn); } - { + if (Config::supports_strides()) { SCOPED_TRACE("Strided complex multivector with 2 columns"); - guarded_fn(gen_in_vec(mtx, 2, 3), - gen_out_vec(mtx, 2, 4)); + run_strided(mtx, 2, 3, 4, guarded_fn); } } { SCOPED_TRACE("Multivector with 2 columns"); - guarded_fn(gen_in_vec(mtx, 2, 2), - gen_out_vec(mtx, 2, 2)); + guarded_fn(gen_in_vec(mtx, 2), + gen_out_vec(mtx, 2)); } - { + if (Config::supports_strides()) { SCOPED_TRACE("Strided multivector with 2 columns"); - guarded_fn(gen_in_vec(mtx, 2, 3), - gen_out_vec(mtx, 2, 4)); + run_strided(mtx, 2, 3, 4, guarded_fn); } { SCOPED_TRACE("Multivector with 40 columns"); - guarded_fn(gen_in_vec(mtx, 40, 40), - gen_out_vec(mtx, 40, 40)); + guarded_fn(gen_in_vec(mtx, 40), + gen_out_vec(mtx, 40)); } - { + if (Config::supports_strides()) { SCOPED_TRACE("Strided multivector with 40 columns"); - guarded_fn(gen_in_vec(mtx, 40, 43), - gen_out_vec(mtx, 40, 45)); + run_strided(mtx, 40, 43, 45, guarded_fn); } } @@ -637,18 +666,21 @@ class Matrix : public ::testing::Test { using MatrixTypes = ::testing::Types< DenseWithDefaultStride, DenseWithCustomStride, Coo, CsrWithDefaultStrategy, - // The strategies have issues with zero rows - /* - #if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \ - defined(GKO_COMPILING_DPCPP) - CsrWithClassicalStrategy, CsrWithMergePathStrategy, - CsrWithSparselibStrategy, CsrWithLoadBalanceStrategy, - CsrWithAutomaticalStrategy, - #endif - */ +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \ + defined(GKO_COMPILING_DPCPP) + CsrWithClassicalStrategy, CsrWithMergePathStrategy, + CsrWithSparselibStrategy, CsrWithLoadBalanceStrategy, + CsrWithAutomaticalStrategy, +#endif Ell, - // Fbcsr is slightly broken - /*FbcsrBlocksize1, FbcsrBlocksize2,*/ +#ifdef GKO_COMPILING_OMP + // CUDA doesn't allow blocksize 1 + Fbcsr<1>, +#endif +#if defined(GKO_COMPILING_OMP) || defined(GKO_COMPILING_CUDA) || \ + defined(GKO_COMPILING_HIP) + Fbcsr<2>, Fbcsr<3>, +#endif SellpDefaultParameters, Sellp32Factor2, HybridDefaultStrategy, HybridColumnLimitStrategy, HybridImbalanceLimitStrategy, HybridImbalanceBoundedLimitStrategy, HybridMinStorageStrategy, @@ -771,13 +803,30 @@ TYPED_TEST(Matrix, ConvertToDenseIsEquivalentToRef) using Mtx = typename TestFixture::Mtx; using Dense = gko::matrix::Dense; this->forall_matrix_scenarios([&](auto mtx) { - auto ref_result = Dense::create(this->ref); - auto dev_result = Dense::create(this->exec); + const auto size = mtx.ref->get_size(); + const auto stride = size[1] + 5; + const auto padded_size = gko::dim<2>{size[0], stride}; + auto ref_padded = Dense::create(this->ref, padded_size); + auto dev_padded = Dense::create(this->exec, padded_size); + ref_padded->fill(12345); + dev_padded->fill(12345); + const auto rows = gko::span{0, size[0]}; + const auto cols = gko::span{0, size[1]}; + const auto pad_cols = gko::span{size[1], stride}; + auto ref_result = ref_padded->create_submatrix(rows, cols); + auto dev_result = dev_padded->create_submatrix(rows, cols); + auto ref_padding = ref_padded->create_submatrix(rows, pad_cols); + auto dev_padding = dev_padded->create_submatrix(rows, pad_cols); + auto orig_padding = ref_padding->clone(); mtx.ref->convert_to(ref_result.get()); mtx.dev->convert_to(dev_result.get()); GKO_ASSERT_MTX_NEAR(ref_result, dev_result, 0.0); + ASSERT_EQ(ref_result->get_stride(), stride); + ASSERT_EQ(dev_result->get_stride(), stride); + GKO_ASSERT_MTX_NEAR(ref_padding, orig_padding, 0.0); + GKO_ASSERT_MTX_NEAR(dev_padding, orig_padding, 0.0); }); } @@ -788,10 +837,13 @@ TYPED_TEST(Matrix, ConvertFromDenseIsEquivalentToRef) using Mtx = typename TestFixture::Mtx; using Dense = gko::matrix::Dense; this->forall_matrix_data_scenarios([&](auto data) { - auto ref_src = Dense::create(this->ref); - auto dev_src = Dense::create(this->exec); + const auto stride = data.size[0] + 2; + auto ref_src = Dense::create(this->ref, data.size, stride); + auto dev_src = Dense::create(this->exec, data.size, stride); ref_src->read(data); dev_src->read(data); + ASSERT_EQ(ref_src->get_stride(), stride); + ASSERT_EQ(dev_src->get_stride(), stride); auto ref_result = TestConfig::create(this->ref, data.size); auto dev_result = TestConfig::create(this->exec, data.size); @@ -813,7 +865,6 @@ TYPED_TEST(Matrix, ReadWriteRoundtrip) auto new_mtx = TestConfig::create(this->exec, data.size); gko::matrix_data out_data; - TestConfig::modify_data(data); new_mtx->read(data); new_mtx->write(out_data);