From 1e5af84d019a47759797a33a01c4a0b9ebe592c6 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Mon, 28 Feb 2022 15:27:13 +0100
Subject: [PATCH 01/17] fix OpenMP Fbcsr SpMV and move-read

---
 core/matrix/fbcsr.cpp              |  9 +++--
 omp/matrix/fbcsr_kernels.cpp       | 56 +++++++++++++++++++++++++-----
 reference/matrix/fbcsr_kernels.cpp | 15 ++++----
 test/matrix/matrix.cpp             | 11 +++---
 4 files changed, 66 insertions(+), 25 deletions(-)

diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp
index 3325638fbde..a651fa41cec 100644
--- a/core/matrix/fbcsr.cpp
+++ b/core/matrix/fbcsr.cpp
@@ -232,9 +232,12 @@ void Fbcsr<ValueType, IndexType>::read(device_mat_data&& data)
     this->set_size(data.get_size());
     row_ptrs_.resize_and_reset(row_blocks + 1);
     auto exec = this->get_executor();
-    auto local_data = make_temporary_clone(exec, &data);
-    exec->run(fbcsr::make_fill_in_matrix_data(*local_data, bs_, row_ptrs_,
-                                              col_idxs_, values_));
+    {
+        auto local_data = make_temporary_clone(exec, &data);
+        exec->run(fbcsr::make_fill_in_matrix_data(*local_data, bs_, row_ptrs_,
+                                                  col_idxs_, values_));
+    }
+    // this needs to happen after the temporary clone copy-back
     data.empty_out();
 }
 
diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp
index ce647f79935..ea01355cffa 100644
--- a/omp/matrix/fbcsr_kernels.cpp
+++ b/omp/matrix/fbcsr_kernels.cpp
@@ -86,9 +86,10 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
 
 #pragma omp parallel for
     for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) {
-        for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs;
-             ++i) {
-            c->get_values()[i] = zero<ValueType>();
+        for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) {
+            for (IndexType rhs = 0; rhs < nvecs; rhs++) {
+                c->at(row, rhs) = zero<ValueType>();
+            }
         }
         for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1];
              ++inz) {
@@ -130,10 +131,11 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
 
 #pragma omp parallel for
     for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) {
-        for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs;
-             ++i)
-            c->get_values()[i] *= vbeta;
-
+        for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) {
+            for (IndexType rhs = 0; rhs < nvecs; rhs++) {
+                c->at(row, rhs) *= vbeta;
+            }
+        }
         for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1];
              ++inz) {
             for (int ib = 0; ib < bs; ib++) {
@@ -164,7 +166,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     components::soa_to_aos(exec, data, block_ordered);
     const auto in_nnz = data.get_num_elems();
     auto block_ordered_ptr = block_ordered.get_data();
-    std::stable_sort(
+    std::sort(
         block_ordered_ptr, block_ordered_ptr + in_nnz,
         [block_size](auto a, auto b) {
            return std::make_tuple(a.row / block_size, a.column / block_size) <
@@ -248,7 +250,43 @@ template <typename ValueType, typename IndexType>
 void convert_to_csr(const std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Fbcsr<ValueType, IndexType>* const source,
                     matrix::Csr<ValueType, IndexType>* const result)
-    GKO_NOT_IMPLEMENTED;
+{
+    const auto nbrows = source->get_num_block_rows();
+    const auto bs = source->get_block_size();
+    const auto block_row_ptrs = source->get_const_row_ptrs();
+    const auto block_col_idxs = source->get_const_col_idxs();
+    const auto row_ptrs = result->get_row_ptrs();
+    const auto col_idxs = result->get_col_idxs();
+    const auto vals = result->get_values();
+    auto sizes =
+        gko::to_std_array<acc::size_type>(block_row_ptrs[nbrows], bs, bs);
+    const auto block_vals =
+        acc::range<acc::block_col_major<const ValueType, 3>>(
+            sizes, source->get_const_values());
+#pragma omp parallel for
+    for (IndexType block_row = 0; block_row < nbrows; block_row++) {
+        const auto block_row_begin = block_row_ptrs[block_row];
+        const auto block_row_end = block_row_ptrs[block_row + 1];
+        const auto block_row_size = block_row_end - block_row_begin;
+        const auto row_size = block_row_size * bs;
+        const auto row_base = block_row_begin * bs * bs;
+        for (int local_row = 0; local_row < bs; local_row++) {
+            const auto row = block_row * bs + local_row;
+            row_ptrs[row] = row_base + row_size * local_row;
+            for (auto block = block_row_begin; block < block_row_end; block++) {
+                const auto block_base =
+                    row_ptrs[row] + bs * (block - block_row_begin);
+                for (int local_col = 0; local_col < bs; local_col++) {
+                    const auto col = block_col_idxs[block] * bs + local_col;
+                    const auto out_idx = block_base + local_col;
+                    col_idxs[out_idx] = col;
+                    vals[out_idx] = block_vals(block, local_row, local_col);
+                }
+            }
+        }
+    }
+    row_ptrs[result->get_size()[0]] = source->get_num_stored_elements();
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp
index 9b093a97b68..deca89b976a 100644
--- a/reference/matrix/fbcsr_kernels.cpp
+++ b/reference/matrix/fbcsr_kernels.cpp
@@ -85,9 +85,10 @@ void spmv(const std::shared_ptr<const ReferenceExecutor>,
         to_std_array<acc::size_type>(nbnz, bs, bs), a->get_const_values()};
 
     for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) {
-        for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs;
-             ++i) {
-            c->get_values()[i] = zero<ValueType>();
+        for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) {
+            for (IndexType rhs = 0; rhs < nvecs; rhs++) {
+                c->at(row, rhs) = zero<ValueType>();
+            }
         }
         for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1];
              ++inz) {
@@ -128,9 +129,11 @@ void advanced_spmv(const std::shared_ptr<const ReferenceExecutor>,
         to_std_array<acc::size_type>(nbnz, bs, bs), a->get_const_values()};
 
     for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) {
-        for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs;
-             ++i)
-            c->get_values()[i] *= vbeta;
+        for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) {
+            for (IndexType rhs = 0; rhs < nvecs; rhs++) {
+                c->at(row, rhs) *= vbeta;
+            }
+        }
 
         for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1];
              ++inz) {
diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp
index c87874776c3..ed0bbc32c99 100644
--- a/test/matrix/matrix.cpp
+++ b/test/matrix/matrix.cpp
@@ -646,13 +646,10 @@ using MatrixTypes = ::testing::Types<
     CsrWithAutomaticalStrategy,
 #endif
     */
-    Ell,
-    // Fbcsr is slightly broken
-    /*FbcsrBlocksize1, FbcsrBlocksize2,*/
-    SellpDefaultParameters, Sellp32Factor2, HybridDefaultStrategy,
-    HybridColumnLimitStrategy, HybridImbalanceLimitStrategy,
-    HybridImbalanceBoundedLimitStrategy, HybridMinStorageStrategy,
-    HybridAutomaticStrategy, SparsityCsr>;
+    Ell, FbcsrBlocksize1, FbcsrBlocksize2, SellpDefaultParameters,
+    Sellp32Factor2, HybridDefaultStrategy, HybridColumnLimitStrategy,
+    HybridImbalanceLimitStrategy, HybridImbalanceBoundedLimitStrategy,
+    HybridMinStorageStrategy, HybridAutomaticStrategy, SparsityCsr>;
 
 
 TYPED_TEST_SUITE(Matrix, MatrixTypes, TypenameNameGenerator);

From 3b0030f201703db2a651c4ff8d44c178b43e3482 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Tue, 1 Mar 2022 15:17:38 +0100
Subject: [PATCH 02/17] add CUDA Fbcsr -> Dense conversion

---
 common/cuda_hip/matrix/fbcsr_kernels.hpp.inc | 32 ++++++++++++++++++++
 cuda/matrix/fbcsr_kernels.cu                 | 14 ++++++++-
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
index 51feaf82fe6..8af2d8eb27d 100644
---
a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -116,6 +116,38 @@ __global__ __launch_bounds__(default_block_size) void convert_to_csr( } +template +__global__ __launch_bounds__(default_block_size) void fill_in_dense( + const IndexType* block_row_ptrs, const IndexType* block_col_idxs, + const ValueType* blocks, ValueType* values, size_type stride, + size_type num_block_rows, int block_size) +{ + const auto block_row = thread::get_subwarp_id_flat(); + if (block_row >= num_block_rows) { + return; + } + const auto block_begin = block_row_ptrs[block_row]; + const auto block_end = block_row_ptrs[block_row + 1]; + const auto num_blocks = block_end - block_begin; + const auto block_row_begin = block_begin * block_size * block_size; + const auto num_entries = num_blocks * block_size * block_size; + const auto bs_sq = block_size * block_size; + const auto warp = + group::tiled_partition(group::this_thread_block()); + const auto lane = warp.thread_rank(); + for (IndexType nz = lane; nz < num_entries; nz += config::warp_size) { + const auto local_id = nz % bs_sq; + const auto block = nz / bs_sq; + const auto local_row = local_id % block_size; + const auto local_col = local_id / block_size; + const auto col = + block_col_idxs[block + block_begin] * block_size + local_col; + const auto row = block_row * block_size + local_row; + values[row * stride + col] = blocks[nz + block_row_begin]; + } +} + + } // namespace kernel diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index c9c4782d262..b4fe2fc1325 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -230,7 +230,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void fill_in_dense(std::shared_ptr exec, const matrix::Fbcsr* source, - matrix::Dense* result) GKO_NOT_IMPLEMENTED; + matrix::Dense* result) +{ + constexpr auto warps_per_block = default_block_size / config::warp_size; + const auto num_blocks = + ceildiv(source->get_num_block_rows(), warps_per_block); + if (num_blocks > 0) { + kernel::fill_in_dense<<>>( + source->get_const_row_ptrs(), source->get_const_col_idxs(), + as_cuda_type(source->get_const_values()), + as_cuda_type(result->get_values()), result->get_stride(), + source->get_num_block_rows(), source->get_block_size()); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); From 1f1f27688a4367d0ab8bff1d6bda071bfad316f3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 1 Mar 2022 19:08:59 +0100 Subject: [PATCH 03/17] fix (Fb)Csr strategies and add Fbcsr conversions --- common/cuda_hip/matrix/csr_common.hpp.inc | 111 ++++++++++ common/cuda_hip/matrix/csr_kernels.hpp.inc | 200 +++++++++--------- common/cuda_hip/matrix/dense_kernels.hpp.inc | 157 ++++++++++++++ common/cuda_hip/matrix/fbcsr_kernels.hpp.inc | 45 ++-- cuda/matrix/csr_kernels.cu | 37 ++-- cuda/matrix/dense_kernels.cu | 41 +++- cuda/matrix/fbcsr_kernels.cu | 88 ++++---- dpcpp/matrix/csr_kernels.dp.cpp | 4 - hip/base/hipsparse_block_bindings.hip.hpp | 164 +++++++++++++++ hip/matrix/csr_kernels.hip.cpp | 37 ++-- hip/matrix/dense_kernels.hip.cpp | 33 ++- hip/matrix/fbcsr_kernels.hip.cpp | 207 +++++++++++++++++-- test/matrix/matrix.cpp | 72 ++++--- 13 files changed, 934 insertions(+), 262 deletions(-) create mode 100644 common/cuda_hip/matrix/csr_common.hpp.inc create mode 100644 hip/base/hipsparse_block_bindings.hip.hpp diff --git a/common/cuda_hip/matrix/csr_common.hpp.inc b/common/cuda_hip/matrix/csr_common.hpp.inc new file 
mode 100644 index 00000000000..d22981c1edf --- /dev/null +++ b/common/cuda_hip/matrix/csr_common.hpp.inc @@ -0,0 +1,111 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void check_unsorted( + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, IndexType num_rows, bool* flag) +{ + __shared__ bool sh_flag; + auto block = group::this_thread_block(); + if (block.thread_rank() == 0) { + sh_flag = *flag; + } + block.sync(); + + auto row = thread::get_thread_id_flat(); + if (row >= num_rows) { + return; + } + + // fail early + if (sh_flag) { + for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1] - 1; ++nz) { + if (col_idxs[nz] > col_idxs[nz + 1]) { + *flag = false; + sh_flag = false; + return; + } + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void conjugate( + size_type num_nonzeros, ValueType* __restrict__ val) +{ + const auto tidx = thread::get_thread_id_flat(); + + if (tidx < num_nonzeros) { + val[tidx] = conj(val[tidx]); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void check_diagonal_entries( + const IndexType num_min_rows_cols, + const IndexType* const __restrict__ row_ptrs, + const IndexType* const __restrict__ col_idxs, + bool* const __restrict__ has_all_diags) +{ + constexpr int warp_size = config::warp_size; + auto tile_grp = + group::tiled_partition(group::this_thread_block()); + const auto row = thread::get_subwarp_id_flat(); + if (row < num_min_rows_cols) { + const auto tid_in_warp = tile_grp.thread_rank(); + const IndexType row_start = row_ptrs[row]; + const IndexType num_nz = row_ptrs[row + 1] - row_start; + bool row_has_diag_local{false}; + for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) { + if (col_idxs[iz + row_start] == row) { + row_has_diag_local = true; + break; + } + } + auto row_has_diag = 
static_cast(tile_grp.any(row_has_diag_local)); + if (!row_has_diag) { + if (tile_grp.thread_rank() == 0) { + *has_all_diags = false; + } + return; + } + } +} + + +} // namespace kernel diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 075e1607ace..4ed54310797 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -601,36 +601,6 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense( } -template -__global__ __launch_bounds__(default_block_size) void check_unsorted( - const IndexType* __restrict__ row_ptrs, - const IndexType* __restrict__ col_idxs, IndexType num_rows, bool* flag) -{ - __shared__ bool sh_flag; - auto block = group::this_thread_block(); - if (block.thread_rank() == 0) { - sh_flag = *flag; - } - block.sync(); - - auto row = thread::get_thread_id_flat(); - if (row >= num_rows) { - return; - } - - // fail early - if (sh_flag) { - for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1] - 1; ++nz) { - if (col_idxs[nz] > col_idxs[nz + 1]) { - *flag = false; - sh_flag = false; - return; - } - } - } -} - - template __global__ __launch_bounds__(default_block_size) void extract_diagonal( size_type diag_size, size_type nnz, @@ -657,29 +627,8 @@ __global__ __launch_bounds__(default_block_size) void extract_diagonal( } -} // namespace kernel - - -namespace { - - -template -__global__ __launch_bounds__(default_block_size) void conjugate_kernel( - size_type num_nonzeros, ValueType* __restrict__ val) -{ - const auto tidx = thread::get_thread_id_flat(); - - if (tidx < num_nonzeros) { - val[tidx] = conj(val[tidx]); - } -} - - -} // namespace - - template -__global__ __launch_bounds__(default_block_size) void row_ptr_permute_kernel( +__global__ __launch_bounds__(default_block_size) void row_ptr_permute( size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, IndexType* __restrict__ out_nnz) { @@ -694,11 +643,9 @@ __global__ __launch_bounds__(default_block_size) void row_ptr_permute_kernel( template -__global__ - __launch_bounds__(default_block_size) void inv_row_ptr_permute_kernel( - size_type num_rows, const IndexType* __restrict__ permutation, - const IndexType* __restrict__ in_row_ptrs, - IndexType* __restrict__ out_nnz) +__global__ __launch_bounds__(default_block_size) void inv_row_ptr_permute( + size_type num_rows, const IndexType* __restrict__ permutation, + const IndexType* __restrict__ in_row_ptrs, IndexType* __restrict__ out_nnz) { auto tid = thread::get_thread_id_flat(); if (tid >= num_rows) { @@ -711,7 +658,7 @@ __global__ template -__global__ __launch_bounds__(default_block_size) void row_permute_kernel( +__global__ __launch_bounds__(default_block_size) void row_permute( size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, const IndexType* __restrict__ in_cols, @@ -737,7 +684,7 @@ __global__ __launch_bounds__(default_block_size) void row_permute_kernel( template -__global__ __launch_bounds__(default_block_size) void inv_row_permute_kernel( +__global__ __launch_bounds__(default_block_size) void inv_row_permute( size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, const IndexType* __restrict__ in_cols, @@ -763,7 +710,7 @@ __global__ __launch_bounds__(default_block_size) void inv_row_permute_kernel( template -__global__ __launch_bounds__(default_block_size) void inv_symm_permute_kernel( +__global__ 
__launch_bounds__(default_block_size) void inv_symm_permute( size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, const IndexType* __restrict__ in_cols, @@ -788,9 +735,6 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute_kernel( } -namespace kernel { - - template __global__ __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals( @@ -869,37 +813,105 @@ __global__ __launch_bounds__(default_block_size) void add_scaled_identity( } -template -__global__ __launch_bounds__(default_block_size) void check_diagonal_entries( - const IndexType num_min_rows_cols, - const IndexType* const __restrict__ row_ptrs, - const IndexType* const __restrict__ col_idxs, - bool* const __restrict__ has_all_diags) -{ - constexpr int warp_size = config::warp_size; - auto tile_grp = - group::tiled_partition(group::this_thread_block()); - const auto row = thread::get_subwarp_id_flat(); - if (row < num_min_rows_cols) { - const auto tid_in_warp = tile_grp.thread_rank(); - const IndexType row_start = row_ptrs[row]; - const IndexType num_nz = row_ptrs[row + 1] - row_start; - bool row_has_diag_local{false}; - for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) { - if (col_idxs[iz + row_start] == row) { - row_has_diag_local = true; - break; - } - } - auto row_has_diag = static_cast(tile_grp.any(row_has_diag_local)); - if (!row_has_diag) { - if (tile_grp.thread_rank() == 0) { - *has_all_diags = false; - } - return; - } +} // namespace kernel + + +template +void convert_to_fbcsr(std::shared_ptr exec, + const matrix::Csr* source, int bs, + Array& block_row_ptr_array, + Array& block_col_idx_array, + Array& block_value_array) +{ + using tuple_type = thrust::tuple; + const auto nnz = source->get_num_stored_elements(); + Array in_row_idxs{exec, nnz}; + Array in_col_idxs{exec, nnz}; + Array in_values{exec, nnz}; + exec->copy(nnz, source->get_const_col_idxs(), in_col_idxs.get_data()); + exec->copy(nnz, source->get_const_values(), in_values.get_data()); + components::convert_ptrs_to_idxs(exec, source->get_const_row_ptrs(), + source->get_size()[0], + in_row_idxs.get_data()); + auto block_row_ptrs = block_row_ptr_array.get_data(); + auto num_block_rows = block_row_ptr_array.get_num_elems() - 1; + if (nnz == 0) { + components::fill_array(exec, block_row_ptrs, num_block_rows + 1, + IndexType{}); + block_col_idx_array.resize_and_reset(0); + block_value_array.resize_and_reset(0); + return; } + auto in_rows = in_row_idxs.get_data(); + auto in_cols = in_col_idxs.get_data(); + // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken + // due to overly generic assignment operator and constructor leading to + // ambiguities. So we need to use our own fake_complex type + auto in_vals = + reinterpret_cast*>(in_values.get_data()); + auto in_loc_it = thrust::make_zip_iterator( + thrust::make_tuple(thrust::device_pointer_cast(in_rows), + thrust::device_pointer_cast(in_cols))); + thrust::sort_by_key(thrust::device, in_loc_it, in_loc_it + nnz, + thrust::device_pointer_cast(in_vals), + [bs] __device__(tuple_type a, tuple_type b) { + return thrust::make_pair(thrust::get<0>(a) / bs, + thrust::get<1>(a) / bs) < + thrust::make_pair(thrust::get<0>(b) / bs, + thrust::get<1>(b) / bs); + }); + // build block pattern + auto adj_predicate = [bs, in_rows, in_cols, nnz] __device__(size_type i) { + const auto a_block_row = i > 0 ? in_rows[i - 1] / bs : -1; + const auto a_block_col = i > 0 ? 
in_cols[i - 1] / bs : -1; + const auto b_block_row = in_rows[i] / bs; + const auto b_block_col = in_cols[i] / bs; + return (a_block_row != b_block_row) || (a_block_col != b_block_col); + }; + auto iota = thrust::make_counting_iterator(size_type{}); + // count how many blocks we have by counting how often the block changes + auto num_blocks = static_cast( + thrust::count_if(thrust::device, iota, iota + nnz, adj_predicate)); + // allocate storage + Array block_row_idx_array{exec, num_blocks}; + Array block_ptr_array{exec, num_blocks}; + block_col_idx_array.resize_and_reset(num_blocks); + block_value_array.resize_and_reset(num_blocks * bs * bs); + auto row_idxs = block_row_idx_array.get_data(); + auto col_idxs = block_col_idx_array.get_data(); + auto values = as_device_type(block_value_array.get_data()); + auto block_ptrs = block_ptr_array.get_data(); + auto block_ptr_it = thrust::device_pointer_cast(block_ptrs); + // write (block_row, block_col, block_start_idx) tuples for each block + thrust::copy_if(thrust::device, iota, iota + nnz, block_ptr_it, + adj_predicate); + auto block_output_it = thrust::make_zip_iterator( + thrust::make_tuple(thrust::device_pointer_cast(row_idxs), + thrust::device_pointer_cast(col_idxs))); + thrust::transform( + thrust::device, block_ptr_it, block_ptr_it + num_blocks, + block_output_it, [bs, in_rows, in_cols] __device__(size_type i) { + return thrust::make_tuple(in_rows[i] / bs, in_cols[i] / bs); + }); + // build row pointers from row indices + components::convert_idxs_to_ptrs(exec, block_row_idx_array.get_const_data(), + block_row_idx_array.get_num_elems(), + num_block_rows, block_row_ptrs); + // fill in values + components::fill_array(exec, block_value_array.get_data(), + num_blocks * bs * bs, zero()); + thrust::for_each_n( + thrust::device, iota, num_blocks, + [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals, + values] __device__(size_type i) { + const auto block_begin = block_ptrs[i]; + const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz; + for (auto nz = block_begin; nz < block_end; nz++) { + values[i * bs * bs + (in_cols[nz] % bs) * bs + + (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]); + } + }); } - -} // namespace kernel +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index c4cdc214e76..9f0a5c0e4b7 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -33,6 +33,163 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace kernel { +template +__global__ + __launch_bounds__(default_block_size) void count_nonzero_blocks_per_row( + size_type num_block_rows, size_type num_block_cols, size_type stride, + int block_size, const ValueType* __restrict__ source, + IndexType* __restrict__ block_row_nnz) +{ + const auto brow = thread::get_subwarp_id_flat(); + + if (brow >= num_block_rows) { + return; + } + + const auto bs_sq = block_size * block_size; + const auto num_cols = num_block_cols * block_size; + auto warp = + group::tiled_partition(group::this_thread_block()); + const auto lane = static_cast(warp.thread_rank()); + const auto block_mask = + block_size >= config::warp_size + ? 
~config::lane_mask_type{} + : (config::lane_mask_type{1} << block_size) - 1u; + bool previous_block_nonzero = false; + IndexType block_count{}; + for (IndexType base_col = 0; base_col < num_cols; + base_col += config::warp_size) { + const auto col = base_col + lane; + const auto block_local_col = col % block_size; + // which is the first column in the current block? + const auto block_base_col = col - block_local_col; + // which lowest thread in the warp does this column belong to? + const auto block_base_thread = block_base_col - base_col; + // collect nonzero bitmask + config::lane_mask_type nonzero_mask{previous_block_nonzero ? 1u : 0u}; + for (int local_row = 0; local_row < block_size; local_row++) { + const auto row = local_row + brow * block_size; + nonzero_mask |= warp.ballot(col < num_cols && + is_nonzero(source[row * stride + col])); + } + const auto local_mask = + (block_base_thread < 0 ? nonzero_mask << -block_base_thread + : nonzero_mask >> block_base_thread) & + block_mask; + if (block_local_col == block_size - 1) { + block_count += local_mask ? 1 : 0; + } + // if we need to store something for the next iteration + if ((base_col + config::warp_size) % block_size != 0) { + // what's the first column of the last block in the warp? + const auto last_block_base_col = + (base_col + static_cast(config::warp_size) - 1) / + block_size * block_size; + // which thread does it belong to? + const auto last_block_base_thread = last_block_base_col - base_col; + previous_block_nonzero = + bool((last_block_base_thread < 0 + ? nonzero_mask << -last_block_base_thread + : nonzero_mask >> last_block_base_thread) & + block_mask); + } else { + previous_block_nonzero = false; + } + } + block_count = reduce(warp, block_count, + [](IndexType a, IndexType b) { return a + b; }); + if (lane == 0) { + block_row_nnz[brow] = block_count; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( + size_type num_block_rows, size_type num_block_cols, size_type stride, + int block_size, const ValueType* __restrict__ source, + const IndexType* __restrict__ block_row_ptrs, + IndexType* __restrict__ block_cols, ValueType* __restrict__ blocks) +{ + const auto brow = thread::get_subwarp_id_flat(); + + if (brow >= num_block_rows) { + return; + } + + const auto bs_sq = block_size * block_size; + const auto num_cols = num_block_cols * block_size; + auto warp = + group::tiled_partition(group::this_thread_block()); + const auto lane = static_cast(warp.thread_rank()); + auto lane_prefix_mask = + (config::lane_mask_type(1) << warp.thread_rank()) - 1; + const auto block_mask = + block_size >= config::warp_size + ? ~config::lane_mask_type{} + : (config::lane_mask_type{1} << block_size) - 1u; + bool previous_block_nonzero = false; + auto block_base_nz = block_row_ptrs[brow]; + for (IndexType base_col = 0; base_col < num_cols; + base_col += config::warp_size) { + const auto col = base_col + lane; + const auto block_local_col = col % block_size; + // which is the first column in the current block? + const auto block_base_col = col - block_local_col; + // which lowest thread in the warp does this column belong to? + const auto block_base_thread = block_base_col - base_col; + // collect nonzero bitmask + config::lane_mask_type nonzero_mask{previous_block_nonzero ? 
1u : 0u}; + for (int local_row = 0; local_row < block_size; local_row++) { + const auto row = local_row + brow * block_size; + nonzero_mask |= warp.ballot(col < num_cols && + is_nonzero(source[row * stride + col])); + } + const auto local_mask = + (block_base_thread < 0 ? nonzero_mask << -block_base_thread + : nonzero_mask >> block_base_thread) & + block_mask; + const auto block_nonzero_mask = + warp.ballot(local_mask && (block_local_col == block_size - 1)); + + // count how many blocks come before this block in the current block + const auto block_nz = + block_base_nz + popcnt(block_nonzero_mask & lane_prefix_mask); + // now in a second sweep, store the actual elements + if (local_mask) { + if (block_local_col == block_size - 1) { + block_cols[block_nz] = col / block_size; + } + if (col < num_cols) { + for (int local_row = 0; local_row < block_size; local_row++) { + const auto row = local_row + brow * block_size; + blocks[local_row + block_local_col * block_size + + block_nz * bs_sq] = source[row * stride + col]; + } + } + } + // if we need to store something for the next iteration + if ((base_col + config::warp_size) % block_size != 0) { + // what's the first column of the last block in the warp? + const auto last_block_base_col = + (base_col + static_cast(config::warp_size) - 1) / + block_size * block_size; + // which thread does it belong to? + const auto last_block_base_thread = last_block_base_col - base_col; + previous_block_nonzero = + bool((last_block_base_thread < 0 + ? nonzero_mask << -last_block_base_thread + : nonzero_mask >> last_block_base_thread) & + block_mask); + } else { + previous_block_nonzero = false; + } + // advance by the completed blocks + block_base_nz += popcnt(block_nonzero_mask); + } +} + + template __global__ __launch_bounds__(default_block_size) void fill_in_coo( size_type num_rows, size_type num_cols, size_type stride, diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index 8af2d8eb27d..66067d3ffa6 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -93,9 +93,9 @@ __global__ __launch_bounds__(default_block_size) void convert_to_csr( if (block_row == 0 && lane == 0) { row_ptrs[0] = 0; } - if (lane < block_size) { - row_ptrs[first_row + lane + 1] = - block_row_begin + num_blocks * block_size * (lane + 1); + for (auto i = lane; i < block_size; i += config::warp_size) { + row_ptrs[first_row + i + 1] = + block_row_begin + num_blocks * block_size * (i + 1); } for (IndexType i = lane; i < num_blocks * block_size * block_size; i += config::warp_size) { @@ -154,19 +154,20 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense( template void fill_in_matrix_data(std::shared_ptr exec, device_matrix_data& data, - int block_size, Array& row_ptr_array, - Array& col_idx_array, - Array& value_array) + int block_size, Array& block_row_ptr_array, + Array& block_col_idx_array, + Array& block_value_array) { using tuple_type = thrust::tuple; const auto nnz = data.get_num_elems(); const auto bs = block_size; - auto row_ptrs = row_ptr_array.get_data(); - auto num_rows = row_ptr_array.get_num_elems() - 1; + auto block_row_ptrs = block_row_ptr_array.get_data(); + auto num_block_rows = block_row_ptr_array.get_num_elems() - 1; if (nnz == 0) { - components::fill_array(exec, row_ptrs, num_rows + 1, IndexType{}); - col_idx_array.resize_and_reset(0); - value_array.resize_and_reset(0); + components::fill_array(exec, block_row_ptrs, num_block_rows + 1, + 
IndexType{}); + block_col_idx_array.resize_and_reset(0); + block_value_array.resize_and_reset(0); return; } auto in_rows = data.get_row_idxs(); @@ -200,13 +201,13 @@ void fill_in_matrix_data(std::shared_ptr exec, auto num_blocks = static_cast( thrust::count_if(thrust::device, iota, iota + nnz, adj_predicate)); // allocate storage - Array row_idx_array{exec, num_blocks}; + Array block_row_idx_array{exec, num_blocks}; Array block_ptr_array{exec, num_blocks}; - col_idx_array.resize_and_reset(num_blocks); - value_array.resize_and_reset(num_blocks * bs * bs); - auto row_idxs = row_idx_array.get_data(); - auto col_idxs = col_idx_array.get_data(); - auto values = as_device_type(value_array.get_data()); + block_col_idx_array.resize_and_reset(num_blocks); + block_value_array.resize_and_reset(num_blocks * bs * bs); + auto row_idxs = block_row_idx_array.get_data(); + auto col_idxs = block_col_idx_array.get_data(); + auto values = as_device_type(block_value_array.get_data()); auto block_ptrs = block_ptr_array.get_data(); auto block_ptr_it = thrust::device_pointer_cast(block_ptrs); // write (block_row, block_col, block_start_idx) tuples for each block @@ -221,12 +222,12 @@ void fill_in_matrix_data(std::shared_ptr exec, return thrust::make_tuple(in_rows[i] / bs, in_cols[i] / bs); }); // build row pointers from row indices - components::convert_idxs_to_ptrs(exec, row_idx_array.get_const_data(), - row_idx_array.get_num_elems(), num_rows, - row_ptrs); + components::convert_idxs_to_ptrs(exec, block_row_idx_array.get_const_data(), + block_row_idx_array.get_num_elems(), + num_block_rows, block_row_ptrs); // fill in values - components::fill_array(exec, value_array.get_data(), num_blocks * bs * bs, - zero()); + components::fill_array(exec, block_value_array.get_data(), + num_blocks * bs * bs, zero()); thrust::for_each_n( thrust::device, iota, num_blocks, [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals, diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index d60323c40c0..8afefc44972 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -36,6 +36,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include +#include +#include +#include +#include +#include + + #include #include #include @@ -47,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/dense_kernels.hpp" @@ -97,6 +107,7 @@ using spgeam_kernels = syn::value_list; +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/csr_kernels.hpp.inc" @@ -304,8 +315,6 @@ void load_balance_spmv(std::shared_ptr exec, as_cuda_type(c->get_stride())); } } - } else { - GKO_NOT_SUPPORTED(nwarps); } } @@ -897,16 +906,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -template -void convert_to_fbcsr(std::shared_ptr exec, - const matrix::Csr* source, int bs, - Array& row_ptrs, Array& col_idxs, - Array& values) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); - - template void transpose(std::shared_ptr exec, const matrix::Csr* orig, @@ -1006,7 +1005,7 @@ void conj_transpose(std::shared_ptr exec, idxBase, alg, buffer); #endif if (grid_size > 0) { - conjugate_kernel<<>>( + kernel::conjugate<<>>( trans->get_num_stored_elements(), as_cuda_type(trans->get_values())); } @@ -1028,7 +1027,7 @@ void inv_symm_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - inv_row_ptr_permute_kernel<<>>( + kernel::inv_row_ptr_permute<<>>( num_rows, perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs()); } @@ -1036,7 +1035,7 @@ void inv_symm_permute(std::shared_ptr exec, auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { - inv_symm_permute_kernel + kernel::inv_symm_permute <<>>( num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), @@ -1059,7 +1058,7 @@ void row_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - row_ptr_permute_kernel<<>>( + kernel::row_ptr_permute<<>>( num_rows, perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); } @@ -1067,7 +1066,7 @@ void row_permute(std::shared_ptr exec, auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { - row_permute_kernel + kernel::row_permute <<>>( num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), @@ -1090,7 +1089,7 @@ void inverse_row_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - inv_row_ptr_permute_kernel<<>>( + kernel::inv_row_ptr_permute<<>>( num_rows, perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); } @@ -1098,7 +1097,7 @@ void inverse_row_permute(std::shared_ptr exec, auto copy_num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { - inv_row_permute_kernel + kernel::inv_row_permute <<>>( num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 9537d6595f0..bc55a60ac93 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include @@ -291,7 +292,18 @@ template void convert_to_fbcsr(std::shared_ptr exec, const matrix::Dense* source, matrix::Fbcsr* result) - GKO_NOT_IMPLEMENTED; +{ + const auto num_block_rows = result->get_num_block_rows(); + if (num_block_rows > 0) { + const auto num_blocks = + ceildiv(num_block_rows, default_block_size / config::warp_size); + kernel::convert_to_fbcsr<<>>( + num_block_rows, result->get_num_block_cols(), source->get_stride(), + result->get_block_size(), as_cuda_type(source->get_const_values()), + result->get_const_row_ptrs(), result->get_col_idxs(), + as_cuda_type(result->get_values())); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); @@ -300,8 +312,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void count_nonzero_blocks_per_row(std::shared_ptr exec, const matrix::Dense* source, - int bs, - IndexType* result) GKO_NOT_IMPLEMENTED; + int bs, IndexType* result) +{ + const auto num_block_rows = source->get_size()[0] / bs; + const auto num_block_cols = source->get_size()[1] / bs; + if (num_block_rows > 0) { + const auto num_blocks = + ceildiv(num_block_rows, default_block_size / config::warp_size); + kernel:: + count_nonzero_blocks_per_row<<>>( + num_block_rows, num_block_cols, source->get_stride(), bs, + as_cuda_type(source->get_const_values()), result); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); @@ -407,9 +430,9 @@ void transpose(std::shared_ptr exec, auto beta = zero(); cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, orig->get_size()[0], orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, - static_cast(nullptr), trans->get_size()[1], - trans->get_values(), trans->get_stride()); + orig->get_stride(), &beta, trans->get_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; @@ -432,9 +455,9 @@ void conj_transpose(std::shared_ptr exec, auto beta = zero(); cublas::geam(handle, CUBLAS_OP_C, CUBLAS_OP_N, orig->get_size()[0], orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, - static_cast(nullptr), trans->get_size()[1], - trans->get_values(), trans->get_stride()); + orig->get_stride(), &beta, trans->get_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); } } else { GKO_NOT_IMPLEMENTED; diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index b4fe2fc1325..d0a73d67c2a 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -57,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" @@ -70,6 +71,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/components/merging.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" namespace gko { @@ -77,20 +79,6 @@ namespace kernels { namespace cuda { -namespace csr_reuse { - - -constexpr int warps_in_block = 4; -constexpr int spmv_block_size = warps_in_block * config::warp_size; -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/matrix/csr_kernels.hpp.inc" - - -} // namespace csr_reuse - - /** * @brief The fixed-size block compressed sparse row matrix format namespace. * @@ -102,7 +90,7 @@ namespace fbcsr { constexpr int default_block_size{512}; -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc" @@ -114,6 +102,9 @@ void dense_transpose(std::shared_ptr exec, const size_type orig_stride, const ValueType* const orig, const size_type trans_stride, ValueType* const trans) { + if (nrows == 0) { + return; + } if (cublas::is_supported::value) { auto handle = exec->get_cublas_handle(); { @@ -121,8 +112,7 @@ void dense_transpose(std::shared_ptr exec, auto alpha = one(); auto beta = zero(); cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, nrows, ncols, &alpha, - orig, orig_stride, &beta, - static_cast(nullptr), nrows, trans, + orig, orig_stride, &beta, trans, trans_stride, trans, trans_stride); } } else { @@ -139,6 +129,15 @@ void spmv(std::shared_ptr exec, const matrix::Dense* const b, matrix::Dense* const c) { + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { + // empty input: fill output with zero + dense::fill(exec, c, zero()); + return; + } if (cusparse::is_supported::value) { auto handle = exec->get_cusparse_handle(); cusparse::pointer_mode_guard pm_guard(handle); @@ -153,21 +152,24 @@ void spmv(std::shared_ptr exec, const IndexType nb = a->get_num_block_cols(); const auto nnzb = static_cast(a->get_num_stored_blocks()); const auto nrhs = static_cast(b->get_size()[1]); - assert(nrhs == c->get_size()[1]); - if (nrhs == 1) { + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), &beta, c->get_values()); } else { - auto trans_c = - Array(exec, c->get_size()[0] * c->get_size()[1]); + const auto trans_stride = nrows; + auto trans_c = Array(exec, nrows * nrhs); cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), nrhs, &beta, - trans_c.get_data(), mb * bs); - dense_transpose(exec, nrhs, mb * bs, mb * bs, trans_c.get_data(), - c->get_stride(), c->get_values()); + b->get_const_values(), in_stride, &beta, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); } cusparse::destroy(descr); } else { @@ -186,6 +188,15 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense* const beta, matrix::Dense* const c) { + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0 
|| a->get_num_stored_blocks() == 0) { + // empty input: scale output + dense::scale(exec, beta, c); + return; + } if (cusparse::is_supported::value) { auto handle = exec->get_cusparse_handle(); const auto alphp = alpha->get_const_values(); @@ -199,23 +210,26 @@ void advanced_spmv(std::shared_ptr exec, const IndexType nb = a->get_num_block_cols(); const auto nnzb = static_cast(a->get_num_stored_blocks()); const auto nrhs = static_cast(b->get_size()[1]); - assert(nrhs == c->get_size()[1]); - if (nrhs == 1) { + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, b->get_const_values(), betap, c->get_values()); } else { - auto trans_c = - Array(exec, c->get_size()[0] * c->get_size()[1]); - dense_transpose(exec, mb * bs, nrhs, c->get_stride(), - c->get_values(), mb * bs, trans_c.get_data()); + const auto trans_stride = nrows; + auto trans_c = Array(exec, nrows * nrhs); + dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), + trans_stride, trans_c.get_data()); cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), nrhs, betap, - trans_c.get_data(), mb * bs); - dense_transpose(exec, nrhs, mb * bs, mb * bs, trans_c.get_data(), - c->get_stride(), c->get_values()); + b->get_const_values(), in_stride, betap, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); } cusparse::destroy(descr); } else { @@ -341,7 +355,7 @@ void conj_transpose(std::shared_ptr exec, ceildiv(trans->get_num_stored_elements(), default_block_size); transpose(exec, orig, trans); if (grid_size > 0) { - csr_reuse::conjugate_kernel<<>>( + kernel::conjugate<<>>( trans->get_num_stored_elements(), as_cuda_type(trans->get_values())); } @@ -367,7 +381,7 @@ void is_sorted_by_column_index( static_cast(to_check->get_num_block_rows()); const auto num_blocks = ceildiv(num_brows, block_size); if (num_blocks > 0) { - csr_reuse::kernel::check_unsorted<<>>( + kernel::check_unsorted<<>>( to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), num_brows, gpu_array.get_data()); } diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index f108eacec05..c9989afe471 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1139,8 +1139,6 @@ void spmv(std::shared_ptr exec, a->get_const_col_idxs(), a->get_const_row_ptrs(), a->get_const_srow(), b->get_const_values(), b->get_stride(), c->get_values(), c->get_stride()); - } else { - GKO_NOT_SUPPORTED(nwarps); } } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = @@ -1235,8 +1233,6 @@ void advanced_spmv(std::shared_ptr exec, a->get_const_col_idxs(), a->get_const_row_ptrs(), a->get_const_srow(), b->get_const_values(), b->get_stride(), c->get_values(), c->get_stride()); - } else { - GKO_NOT_SUPPORTED(nwarps); } } else if (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse") { diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp new file mode 100644 index 00000000000..e92c329fba1 --- /dev/null +++ b/hip/base/hipsparse_block_bindings.hip.hpp 
@@ -0,0 +1,164 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ +#define GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ + + +#include + + +#include + + +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/types.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The HIPSPARSE namespace. 
+ * + * @ingroup hipsparse + */ +namespace hipsparse { + + +/// Default storage layout within each small dense block +constexpr hipsparseDirection_t blockDir = HIPSPARSE_DIRECTION_COLUMN; + + +#define GKO_BIND_HIPSPARSE32_BSRMV(ValueType, HipsparseName) \ + inline void bsrmv(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + int32 mb, int32 nb, int32 nnzb, const ValueType* alpha, \ + const hipsparseMatDescr_t descrA, const ValueType* valA, \ + const int32* rowPtrA, const int32* colIndA, \ + int block_size, const ValueType* x, \ + const ValueType* beta, ValueType* y) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, blockDir, transA, mb, nb, nnzb, as_hiplibs_type(alpha), \ + descrA, as_hiplibs_type(valA), rowPtrA, colIndA, block_size, \ + as_hiplibs_type(x), as_hiplibs_type(beta), as_hiplibs_type(y))); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE64_BSRMV(ValueType, HipsparseName) \ + inline void bsrmv(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + int64 mb, int64 nb, int64 nnzb, const ValueType* alpha, \ + const hipsparseMatDescr_t descrA, const ValueType* valA, \ + const int64* rowPtrA, const int64* colIndA, \ + int block_size, const ValueType* x, \ + const ValueType* beta, ValueType* y) \ + GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_BSRMV(float, hipsparseSbsrmv); +GKO_BIND_HIPSPARSE32_BSRMV(double, hipsparseDbsrmv); +GKO_BIND_HIPSPARSE32_BSRMV(std::complex, hipsparseCbsrmv); +GKO_BIND_HIPSPARSE32_BSRMV(std::complex, hipsparseZbsrmv); +GKO_BIND_HIPSPARSE64_BSRMV(float, hipsparseSbsrmv); +GKO_BIND_HIPSPARSE64_BSRMV(double, hipsparseDbsrmv); +GKO_BIND_HIPSPARSE64_BSRMV(std::complex, hipsparseCbsrmv); +GKO_BIND_HIPSPARSE64_BSRMV(std::complex, hipsparseZbsrmv); +template +GKO_BIND_HIPSPARSE32_BSRMV(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE64_BSRMV(ValueType, detail::not_implemented); + + +#undef GKO_BIND_HIPSPARSE32_BSRMV +#undef GKO_BIND_HIPSPARSE64_BSRMV + + +#define GKO_BIND_HIPSPARSE32_BSRMM(ValueType, HipsparseName) \ + inline void bsrmm(hipsparseHandle_t handle, hipsparseOperation_t transA, \ + hipsparseOperation_t transB, int32 mb, int32 n, \ + int32 kb, int32 nnzb, const ValueType* alpha, \ + const hipsparseMatDescr_t descrA, const ValueType* valA, \ + const int32* rowPtrA, const int32* colIndA, \ + int block_size, const ValueType* B, int32 ldb, \ + const ValueType* beta, ValueType* C, int32 ldc) \ + { \ + GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName( \ + handle, blockDir, transA, transB, mb, n, kb, nnzb, \ + as_hiplibs_type(alpha), descrA, as_hiplibs_type(valA), rowPtrA, \ + colIndA, block_size, as_hiplibs_type(B), ldb, \ + as_hiplibs_type(beta), as_hiplibs_type(C), ldc)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_HIPSPARSE64_BSRMM(ValueType, HipsparseName) \ + inline void bsrmm( \ + hipsparseHandle_t handle, hipsparseOperation_t transA, \ + hipsparseOperation_t transB, int64 mb, int64 n, int64 kb, int64 nnzb, \ + const ValueType* alpha, const hipsparseMatDescr_t descrA, \ + const ValueType* valA, const int64* rowPtrA, const int64* colIndA, \ + int block_size, const ValueType* B, int64 ldb, const ValueType* beta, \ + ValueType* C, int64 ldc) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used 
to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_HIPSPARSE32_BSRMM(float, hipsparseSbsrmm); +GKO_BIND_HIPSPARSE32_BSRMM(double, hipsparseDbsrmm); +GKO_BIND_HIPSPARSE32_BSRMM(std::complex, hipsparseCbsrmm); +GKO_BIND_HIPSPARSE32_BSRMM(std::complex, hipsparseZbsrmm); +GKO_BIND_HIPSPARSE64_BSRMM(float, hipsparseSbsrmm); +GKO_BIND_HIPSPARSE64_BSRMM(double, hipsparseDbsrmm); +GKO_BIND_HIPSPARSE64_BSRMM(std::complex, hipsparseCbsrmm); +GKO_BIND_HIPSPARSE64_BSRMM(std::complex, hipsparseZbsrmm); +template +GKO_BIND_HIPSPARSE32_BSRMM(ValueType, detail::not_implemented); +template +GKO_BIND_HIPSPARSE64_BSRMM(ValueType, detail::not_implemented); + + +#undef GKO_BIND_HIPSPARSE32_BSRMM +#undef GKO_BIND_HIPSPARSE64_BSRMM + + +} // namespace hipsparse +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_ diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index a9fcb49e8e6..847eee34c3d 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -37,6 +37,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include +#include +#include +#include +#include +#include #include @@ -50,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/dense_kernels.hpp" @@ -99,6 +107,7 @@ using spgeam_kernels = syn::value_list; +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/csr_kernels.hpp.inc" @@ -296,8 +305,6 @@ void spmv(std::shared_ptr exec, as_hip_type(b->get_const_values()), as_hip_type(b->get_stride()), as_hip_type(c->get_values()), as_hip_type(c->get_stride())); - } else { - GKO_NOT_SUPPORTED(nwarps); } } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = @@ -392,8 +399,6 @@ void advanced_spmv(std::shared_ptr exec, as_hip_type(b->get_const_values()), as_hip_type(b->get_stride()), as_hip_type(c->get_values()), as_hip_type(c->get_stride())); - } else { - GKO_NOT_SUPPORTED(nwarps); } } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = @@ -730,16 +735,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL); -template -void convert_to_fbcsr(std::shared_ptr exec, - const matrix::Csr* source, int bs, - Array& row_ptrs, Array& col_idxs, - Array& values) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); - - template void transpose(std::shared_ptr exec, const matrix::Csr* orig, @@ -790,7 +785,7 @@ void conj_transpose(std::shared_ptr exec, trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase); if (grid_size > 0) { - hipLaunchKernelGGL(conjugate_kernel, grid_size, block_size, 0, 0, + hipLaunchKernelGGL(kernel::conjugate, grid_size, block_size, 0, 0, trans->get_num_stored_elements(), as_hip_type(trans->get_values())); } @@ -812,7 +807,7 @@ void inv_symm_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(inv_row_ptr_permute_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::inv_row_ptr_permute), count_num_blocks, default_block_size, 0, 0, num_rows, 
perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs()); @@ -822,7 +817,7 @@ void inv_symm_permute(std::shared_ptr exec, ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(inv_symm_permute_kernel), + HIP_KERNEL_NAME(kernel::inv_symm_permute), copy_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), as_hip_type(orig->get_const_values()), permuted->get_row_ptrs(), @@ -842,7 +837,7 @@ void row_permute(std::shared_ptr exec, const IndexType* perm, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(row_ptr_permute_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::row_ptr_permute), count_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); @@ -852,7 +847,7 @@ void row_permute(std::shared_ptr exec, const IndexType* perm, ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(row_permute_kernel), + HIP_KERNEL_NAME(kernel::row_permute), copy_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), as_hip_type(orig->get_const_values()), row_permuted->get_row_ptrs(), @@ -874,7 +869,7 @@ void inverse_row_permute(std::shared_ptr exec, auto num_rows = orig->get_size()[0]; auto count_num_blocks = ceildiv(num_rows, default_block_size); if (count_num_blocks > 0) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(inv_row_ptr_permute_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::inv_row_ptr_permute), count_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs()); @@ -884,7 +879,7 @@ void inverse_row_permute(std::shared_ptr exec, ceildiv(num_rows, default_block_size / config::warp_size); if (copy_num_blocks > 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(inv_row_permute_kernel), + HIP_KERNEL_NAME(kernel::inv_row_permute), copy_num_blocks, default_block_size, 0, 0, num_rows, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(), as_hip_type(orig->get_const_values()), row_permuted->get_row_ptrs(), diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 6ace089e2ea..4b63be8f905 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include @@ -296,7 +297,18 @@ template void convert_to_fbcsr(std::shared_ptr exec, const matrix::Dense* source, matrix::Fbcsr* result) - GKO_NOT_IMPLEMENTED; +{ + const auto num_block_rows = result->get_num_block_rows(); + if (num_block_rows > 0) { + const auto num_blocks = + ceildiv(num_block_rows, default_block_size / config::warp_size); + kernel::convert_to_fbcsr<<>>( + num_block_rows, result->get_num_block_cols(), source->get_stride(), + result->get_block_size(), as_hip_type(source->get_const_values()), + result->get_const_row_ptrs(), result->get_col_idxs(), + as_hip_type(result->get_values())); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL); @@ -305,8 +317,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void count_nonzero_blocks_per_row(std::shared_ptr exec, const matrix::Dense* source, - int bs, - IndexType* result) GKO_NOT_IMPLEMENTED; + int bs, IndexType* result) +{ + const auto num_block_rows = source->get_size()[0] / bs; + const auto num_block_cols = source->get_size()[1] / bs; + if (num_block_rows > 0) { + const auto num_blocks = + ceildiv(num_block_rows, default_block_size / config::warp_size); + kernel:: + count_nonzero_blocks_per_row<<>>( + num_block_rows, num_block_cols, source->get_stride(), bs, + as_hip_type(source->get_const_values()), result); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL); @@ -417,7 +440,7 @@ void transpose(std::shared_ptr exec, hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, orig->get_size()[0], orig->get_size()[1], &alpha, orig->get_const_values(), orig->get_stride(), &beta, - orig->get_const_values(), trans->get_size()[1], + trans->get_const_values(), trans->get_stride(), trans->get_values(), trans->get_stride()); } } else { @@ -442,7 +465,7 @@ void conj_transpose(std::shared_ptr exec, hipblas::geam(handle, HIPBLAS_OP_C, HIPBLAS_OP_N, orig->get_size()[0], orig->get_size()[1], &alpha, orig->get_const_values(), orig->get_stride(), &beta, - orig->get_const_values(), trans->get_size()[1], + trans->get_values(), trans->get_stride(), trans->get_values(), trans->get_stride()); } } else { diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 33ed4a08e2e..096554cb9df 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -54,15 +54,31 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common/unified/base/kernel_launch.hpp" +#include "core/base/block_sizes.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" +#include "hip/base/hipblas_bindings.hip.hpp" +#include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/hipsparse_block_bindings.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/pointer_mode_guard.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/atomic.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/merging.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" namespace gko { namespace kernels { namespace hip { + + /** * @brief The fixed-size block compressed sparse row matrix format namespace. * @@ -74,25 +90,152 @@ namespace fbcsr { constexpr int default_block_size{512}; +#include "common/cuda_hip/matrix/csr_common.hpp.inc" #include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc" +namespace { + +template +void dense_transpose(std::shared_ptr exec, + const size_type nrows, const size_type ncols, + const size_type orig_stride, const ValueType* const orig, + const size_type trans_stride, ValueType* const trans) +{ + if (nrows == 0) { + return; + } + if (hipblas::is_supported::value) { + auto handle = exec->get_hipblas_handle(); + { + hipblas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, nrows, ncols, + &alpha, orig, orig_stride, &beta, trans, trans_stride, + trans, trans_stride); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +} // namespace + + template void spmv(std::shared_ptr exec, - const matrix::Fbcsr* a, - const matrix::Dense* b, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + matrix::Dense* const c) +{ + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0) { + // empty input: fill output with zero + dense::fill(exec, c, zero()); + return; + } + if (hipsparse::is_supported::value) { + auto handle = exec->get_hipsparse_handle(); + hipsparse::pointer_mode_guard pm_guard(handle); + const auto alpha = one(); + const auto beta = zero(); + auto descr = hipsparse::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { + hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, &alpha, descr, values, row_ptrs, col_idxs, + bs, b->get_const_values(), &beta, c->get_values()); + } else { + const auto trans_stride = nrows; + auto trans_c = Array(exec, nrows * nrhs); + hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + 
HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + &alpha, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, &beta, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); + } + hipsparse::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Fbcsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; + const matrix::Dense* const alpha, + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + const matrix::Dense* const beta, + matrix::Dense* const c) +{ + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0) { + // empty input: scale output + dense::scale(exec, beta, c); + return; + } + if (hipsparse::is_supported::value) { + auto handle = exec->get_hipsparse_handle(); + const auto alphp = alpha->get_const_values(); + const auto betap = beta->get_const_values(); + auto descr = hipsparse::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { + hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), betap, c->get_values()); + } else { + const auto trans_stride = nrows; + auto trans_c = Array(exec, nrows * nrhs); + dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), + trans_stride, trans_c.get_data()); + hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, betap, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); + } + hipsparse::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); @@ -101,7 +244,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void fill_in_dense(std::shared_ptr exec, const matrix::Fbcsr* source, - matrix::Dense* result) GKO_NOT_IMPLEMENTED; + matrix::Dense* result) +{ + constexpr auto warps_per_block = default_block_size / config::warp_size; + const auto num_blocks = + ceildiv(source->get_num_block_rows(), warps_per_block); + if (num_blocks > 0) { + kernel::fill_in_dense<<>>( + source->get_const_row_ptrs(), source->get_const_col_idxs(), + as_hip_type(source->get_const_values()), + as_hip_type(result->get_values()), result->get_stride(), + source->get_num_block_rows(), source->get_block_size()); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL); @@ -116,10 +271,8 @@ void convert_to_csr(const std::shared_ptr exec, const auto 
num_blocks = ceildiv(source->get_num_block_rows(), warps_per_block); if (num_blocks > 0) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::convert_to_csr), num_blocks, - default_block_size, 0, 0, source->get_const_row_ptrs(), - source->get_const_col_idxs(), + kernel::convert_to_csr<<>>( + source->get_const_row_ptrs(), source->get_const_col_idxs(), as_hip_type(source->get_const_values()), result->get_row_ptrs(), result->get_col_idxs(), as_hip_type(result->get_values()), source->get_num_block_rows(), source->get_block_size()); @@ -131,9 +284,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void transpose(std::shared_ptr exec, - const matrix::Fbcsr* orig, - matrix::Fbcsr* trans) GKO_NOT_IMPLEMENTED; +void transpose(const std::shared_ptr exec, + const matrix::Fbcsr* const orig, + matrix::Fbcsr* const trans) + GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); @@ -152,8 +306,25 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void is_sorted_by_column_index( std::shared_ptr exec, - const matrix::Fbcsr* to_check, - bool* is_sorted) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const to_check, + bool* const is_sorted) +{ + *is_sorted = true; + auto gpu_array = Array(exec, 1); + // need to initialize the GPU value to true + exec->copy_from(exec->get_master().get(), 1, is_sorted, + gpu_array.get_data()); + auto block_size = default_block_size; + const auto num_brows = + static_cast(to_check->get_num_block_rows()); + const auto num_blocks = ceildiv(num_brows, block_size); + if (num_blocks > 0) { + kernel::check_unsorted<<>>( + to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), + num_brows, gpu_array.get_data()); + } + *is_sorted = exec->copy_val_to_host(gpu_array.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index ed0bbc32c99..29e6479a282 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -200,35 +200,27 @@ struct CsrWithAutomaticalStrategy struct Ell : SimpleMatrixTest> {}; -struct FbcsrBlocksize1 - : SimpleMatrixTest> { +template +struct Fbcsr : SimpleMatrixTest> { static bool preserves_zeros() { return false; } static std::unique_ptr create( std::shared_ptr exec, gko::dim<2> size) { - return matrix_type::create(exec, size, 0, 1); + size[0] = gko::ceildiv(size[0], block_size) * block_size; + size[1] = gko::ceildiv(size[1], block_size) * block_size; + return matrix_type::create(exec, size, 0, block_size); } static void check_property(const std::unique_ptr& mtx) { - ASSERT_EQ(mtx->get_block_size(), 1); + ASSERT_EQ(mtx->get_block_size(), block_size); } -}; - -struct FbcsrBlocksize2 - : SimpleMatrixTest> { - static bool preserves_zeros() { return false; } - static std::unique_ptr create( - std::shared_ptr exec, gko::dim<2> size) - { - return matrix_type::create(exec, size, 0, 2); - } - - static void check_property(const std::unique_ptr& mtx) + static void modify_data(gko::matrix_data& data) { - ASSERT_EQ(mtx->get_block_size(), 2); + data.size[0] = gko::ceildiv(data.size[0], block_size) * block_size; + data.size[1] = gko::ceildiv(data.size[1], block_size) * block_size; } }; @@ -484,9 +476,10 @@ class Matrix : public ::testing::Test { template void forall_matrix_data_scenarios(TestFunction fn) { - auto guarded_fn = [&](auto mtx) { + auto guarded_fn = [&](auto data) { try { - fn(std::move(mtx)); + Config::modify_data(data); + fn(std::move(data)); } catch (std::exception& e) { 
FAIL() << e.what(); } @@ -507,6 +500,14 @@ class Matrix : public ::testing::Test { SCOPED_TRACE("Zero matrix (200x100)"); guarded_fn(gen_mtx_data(200, 100, 0, 0)); } + { + SCOPED_TRACE("Small Sparse Matrix with variable row nnz (10x10)"); + guarded_fn(gen_mtx_data(10, 10, 1, 5)); + } + { + SCOPED_TRACE("Small Dense Matrix (10x10)"); + guarded_fn(gen_mtx_data(10, 10, 10, 10)); + } { SCOPED_TRACE("Sparse Matrix with some zeros rows (200x100)"); guarded_fn(gen_mtx_data(200, 100, 0, 50)); @@ -637,19 +638,25 @@ class Matrix : public ::testing::Test { using MatrixTypes = ::testing::Types< DenseWithDefaultStride, DenseWithCustomStride, Coo, CsrWithDefaultStrategy, - // The strategies have issues with zero rows - /* - #if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \ - defined(GKO_COMPILING_DPCPP) - CsrWithClassicalStrategy, CsrWithMergePathStrategy, - CsrWithSparselibStrategy, CsrWithLoadBalanceStrategy, - CsrWithAutomaticalStrategy, - #endif - */ - Ell, FbcsrBlocksize1, FbcsrBlocksize2, SellpDefaultParameters, - Sellp32Factor2, HybridDefaultStrategy, HybridColumnLimitStrategy, - HybridImbalanceLimitStrategy, HybridImbalanceBoundedLimitStrategy, - HybridMinStorageStrategy, HybridAutomaticStrategy, SparsityCsr>; +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \ + defined(GKO_COMPILING_DPCPP) + CsrWithClassicalStrategy, CsrWithMergePathStrategy, + CsrWithSparselibStrategy, CsrWithLoadBalanceStrategy, + CsrWithAutomaticalStrategy, +#endif + Ell, +#ifdef GKO_COMPILING_OMP + // CUDA doesn't allow blocksize 1 + Fbcsr<1>, +#endif +#if defined(GKO_COMPILING_OMP) || defined(GKO_COMPILING_CUDA) + // Fbcsr is broken on ROCm + Fbcsr<2>, Fbcsr<3>, +#endif + SellpDefaultParameters, Sellp32Factor2, HybridDefaultStrategy, + HybridColumnLimitStrategy, HybridImbalanceLimitStrategy, + HybridImbalanceBoundedLimitStrategy, HybridMinStorageStrategy, + HybridAutomaticStrategy, SparsityCsr>; TYPED_TEST_SUITE(Matrix, MatrixTypes, TypenameNameGenerator); @@ -810,7 +817,6 @@ TYPED_TEST(Matrix, ReadWriteRoundtrip) auto new_mtx = TestConfig::create(this->exec, data.size); gko::matrix_data out_data; - TestConfig::modify_data(data); new_mtx->read(data); new_mtx->write(out_data); From 20562f4aaa5478b73c4e96fee91bb8797df4b523 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Mar 2022 16:37:46 +0100 Subject: [PATCH 04/17] improve and fix Dense -> Fbcsr conversion --- common/cuda_hip/matrix/dense_kernels.hpp.inc | 116 ++++++++++--------- core/matrix/dense.cpp | 1 + 2 files changed, 60 insertions(+), 57 deletions(-) diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index 9f0a5c0e4b7..f60747ae190 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -40,7 +40,8 @@ __global__ int block_size, const ValueType* __restrict__ source, IndexType* __restrict__ block_row_nnz) { - const auto brow = thread::get_subwarp_id_flat(); + const auto brow = + thread::get_subwarp_id_flat(); if (brow >= num_block_rows) { return; @@ -51,11 +52,9 @@ __global__ auto warp = group::tiled_partition(group::this_thread_block()); const auto lane = static_cast(warp.thread_rank()); - const auto block_mask = - block_size >= config::warp_size - ? 
~config::lane_mask_type{} - : (config::lane_mask_type{1} << block_size) - 1u; - bool previous_block_nonzero = false; + constexpr auto full_mask = ~config::lane_mask_type{}; + constexpr auto one_mask = config::lane_mask_type{1}; + bool first_block_nonzero = false; IndexType block_count{}; for (IndexType base_col = 0; base_col < num_cols; base_col += config::warp_size) { @@ -63,37 +62,39 @@ __global__ const auto block_local_col = col % block_size; // which is the first column in the current block? const auto block_base_col = col - block_local_col; - // which lowest thread in the warp does this column belong to? - const auto block_base_thread = block_base_col - base_col; // collect nonzero bitmask - config::lane_mask_type nonzero_mask{previous_block_nonzero ? 1u : 0u}; + bool local_nonzero = false; for (int local_row = 0; local_row < block_size; local_row++) { const auto row = local_row + brow * block_size; - nonzero_mask |= warp.ballot(col < num_cols && - is_nonzero(source[row * stride + col])); + local_nonzero |= + col < num_cols && is_nonzero(source[row * stride + col]); } - const auto local_mask = - (block_base_thread < 0 ? nonzero_mask << -block_base_thread - : nonzero_mask >> block_base_thread) & - block_mask; + auto nonzero_mask = + warp.ballot(local_nonzero) | (first_block_nonzero ? 1u : 0u); + // only consider threads in the current block + const auto first_thread = block_base_col - base_col; + const auto last_thread = first_thread + block_size; + assert(first_thread < int(config::warp_size)); + assert(last_thread >= 0); + // mask off everything below first_thread + const auto lower_mask = + first_thread < 0 ? full_mask : ~((one_mask << first_thread) - 1u); + // mask off everything from last_thread + const auto upper_mask = last_thread >= config::warp_size + ? full_mask + : ((one_mask << last_thread) - 1u); + const auto block_mask = upper_mask & lower_mask; + const auto local_mask = nonzero_mask & block_mask; if (block_local_col == block_size - 1) { block_count += local_mask ? 1 : 0; } // if we need to store something for the next iteration if ((base_col + config::warp_size) % block_size != 0) { - // what's the first column of the last block in the warp? - const auto last_block_base_col = - (base_col + static_cast(config::warp_size) - 1) / - block_size * block_size; - // which thread does it belong to? - const auto last_block_base_thread = last_block_base_col - base_col; - previous_block_nonzero = - bool((last_block_base_thread < 0 - ? 
nonzero_mask << -last_block_base_thread - : nonzero_mask >> last_block_base_thread) & - block_mask); + // check whether the last block (incomplete) in this warp is nonzero + first_block_nonzero = + bool(warp.ballot(bool(local_mask)) >> (config::warp_size - 1)); } else { - previous_block_nonzero = false; + first_block_nonzero = false; } } block_count = reduce(warp, block_count, @@ -111,7 +112,8 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( const IndexType* __restrict__ block_row_ptrs, IndexType* __restrict__ block_cols, ValueType* __restrict__ blocks) { - const auto brow = thread::get_subwarp_id_flat(); + const auto brow = + thread::get_subwarp_id_flat(); if (brow >= num_block_rows) { return; @@ -122,13 +124,10 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( auto warp = group::tiled_partition(group::this_thread_block()); const auto lane = static_cast(warp.thread_rank()); - auto lane_prefix_mask = - (config::lane_mask_type(1) << warp.thread_rank()) - 1; - const auto block_mask = - block_size >= config::warp_size - ? ~config::lane_mask_type{} - : (config::lane_mask_type{1} << block_size) - 1u; - bool previous_block_nonzero = false; + constexpr auto full_mask = ~config::lane_mask_type{}; + constexpr auto one_mask = config::lane_mask_type{1}; + const auto lane_prefix_mask = (one_mask << warp.thread_rank()) - 1u; + bool first_block_nonzero = false; auto block_base_nz = block_row_ptrs[brow]; for (IndexType base_col = 0; base_col < num_cols; base_col += config::warp_size) { @@ -136,19 +135,29 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( const auto block_local_col = col % block_size; // which is the first column in the current block? const auto block_base_col = col - block_local_col; - // which lowest thread in the warp does this column belong to? - const auto block_base_thread = block_base_col - base_col; // collect nonzero bitmask - config::lane_mask_type nonzero_mask{previous_block_nonzero ? 1u : 0u}; + bool local_nonzero = false; for (int local_row = 0; local_row < block_size; local_row++) { const auto row = local_row + brow * block_size; - nonzero_mask |= warp.ballot(col < num_cols && - is_nonzero(source[row * stride + col])); + local_nonzero |= + col < num_cols && is_nonzero(source[row * stride + col]); } - const auto local_mask = - (block_base_thread < 0 ? nonzero_mask << -block_base_thread - : nonzero_mask >> block_base_thread) & - block_mask; + auto nonzero_mask = + warp.ballot(local_nonzero) | (first_block_nonzero ? 1u : 0u); + // only consider threads in the current block + const auto first_thread = block_base_col - base_col; + const auto last_thread = first_thread + block_size; + assert(first_thread < int(config::warp_size)); + assert(last_thread >= 0); + // mask off everything below first_thread + const auto lower_mask = + first_thread < 0 ? full_mask : ~((one_mask << first_thread) - 1u); + // mask off everything from last_thread + const auto upper_mask = last_thread >= config::warp_size + ? 
full_mask + : ((one_mask << last_thread) - 1u); + const auto block_mask = upper_mask & lower_mask; + const auto local_mask = nonzero_mask & block_mask; const auto block_nonzero_mask = warp.ballot(local_mask && (block_local_col == block_size - 1)); @@ -160,7 +169,8 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( if (block_local_col == block_size - 1) { block_cols[block_nz] = col / block_size; } - if (col < num_cols) { + // only if we encountered elements in this column + if (local_nonzero) { for (int local_row = 0; local_row < block_size; local_row++) { const auto row = local_row + brow * block_size; blocks[local_row + block_local_col * block_size + @@ -170,19 +180,11 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( } // if we need to store something for the next iteration if ((base_col + config::warp_size) % block_size != 0) { - // what's the first column of the last block in the warp? - const auto last_block_base_col = - (base_col + static_cast(config::warp_size) - 1) / - block_size * block_size; - // which thread does it belong to? - const auto last_block_base_thread = last_block_base_col - base_col; - previous_block_nonzero = - bool((last_block_base_thread < 0 - ? nonzero_mask << -last_block_base_thread - : nonzero_mask >> last_block_base_thread) & - block_mask); + // check whether the last block (incomplete) in this warp is nonzero + first_block_nonzero = + bool(warp.ballot(bool(local_mask)) >> (config::warp_size - 1)); } else { - previous_block_nonzero = false; + first_block_nonzero = false; } // advance by the completed blocks block_base_nz += popcnt(block_nonzero_mask); diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index b2a92cbbe95..817233997a3 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -628,6 +628,7 @@ void Dense::convert_impl(Fbcsr* result) const exec->copy_val_to_host(tmp->get_const_row_ptrs() + row_blocks); tmp->col_idxs_.resize_and_reset(nnz_blocks); tmp->values_.resize_and_reset(nnz_blocks * bs * bs); + tmp->values_.fill(zero()); tmp->set_size(this->get_size()); exec->run(dense::make_convert_to_fbcsr(this, tmp.get())); } From 8ae8a9472441e59f4cc79da8bfcd1e639f3269f1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 3 Mar 2022 10:54:25 +0100 Subject: [PATCH 05/17] fix HIP Fbcsr SpMV It's still broken for strided accesses though --- hip/matrix/fbcsr_kernels.hip.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 096554cb9df..37b136e18d6 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -133,7 +133,7 @@ void spmv(std::shared_ptr exec, // empty output: nothing to do return; } - if (b->get_size()[0] == 0) { + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { // empty input: fill output with zero dense::fill(exec, c, zero()); return; @@ -192,7 +192,7 @@ void advanced_spmv(std::shared_ptr exec, // empty output: nothing to do return; } - if (b->get_size()[0] == 0) { + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { // empty input: scale output dense::scale(exec, beta, c); return; From 5dbb787a4520d9dc3910c6566e31efc978a7139a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 3 Mar 2022 11:09:58 +0100 Subject: [PATCH 06/17] review updates Co-authored-by: Yuhsiang Tsai --- common/cuda_hip/matrix/dense_kernels.hpp.inc | 6 +++--- common/cuda_hip/matrix/fbcsr_kernels.hpp.inc | 17 +++++++++-------- 
hip/matrix/fbcsr_kernels.hip.cpp | 2 ++ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index f60747ae190..b40d650a3a5 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -85,9 +85,9 @@ __global__ : ((one_mask << last_thread) - 1u); const auto block_mask = upper_mask & lower_mask; const auto local_mask = nonzero_mask & block_mask; - if (block_local_col == block_size - 1) { - block_count += local_mask ? 1 : 0; - } + // last column in the block increments the counter + block_count += + (block_local_col == block_size - 1 && local_mask) ? 1 : 0; // if we need to store something for the next iteration if ((base_col + config::warp_size) % block_size != 0) { // check whether the last block (incomplete) in this warp is nonzero diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc index 66067d3ffa6..b703fc783ce 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc @@ -205,17 +205,17 @@ void fill_in_matrix_data(std::shared_ptr exec, Array block_ptr_array{exec, num_blocks}; block_col_idx_array.resize_and_reset(num_blocks); block_value_array.resize_and_reset(num_blocks * bs * bs); - auto row_idxs = block_row_idx_array.get_data(); - auto col_idxs = block_col_idx_array.get_data(); - auto values = as_device_type(block_value_array.get_data()); + auto block_row_idxs = block_row_idx_array.get_data(); + auto block_col_idxs = block_col_idx_array.get_data(); + auto block_values = as_device_type(block_value_array.get_data()); auto block_ptrs = block_ptr_array.get_data(); auto block_ptr_it = thrust::device_pointer_cast(block_ptrs); // write (block_row, block_col, block_start_idx) tuples for each block thrust::copy_if(thrust::device, iota, iota + nnz, block_ptr_it, adj_predicate); auto block_output_it = thrust::make_zip_iterator( - thrust::make_tuple(thrust::device_pointer_cast(row_idxs), - thrust::device_pointer_cast(col_idxs))); + thrust::make_tuple(thrust::device_pointer_cast(block_row_idxs), + thrust::device_pointer_cast(block_col_idxs))); thrust::transform( thrust::device, block_ptr_it, block_ptr_it + num_blocks, block_output_it, [bs, in_rows, in_cols] __device__(size_type i) { @@ -231,12 +231,13 @@ void fill_in_matrix_data(std::shared_ptr exec, thrust::for_each_n( thrust::device, iota, num_blocks, [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals, - values] __device__(size_type i) { + block_values] __device__(size_type i) { const auto block_begin = block_ptrs[i]; const auto block_end = i < num_blocks - 1 ? 
block_ptrs[i + 1] : nnz; for (auto nz = block_begin; nz < block_end; nz++) { - values[i * bs * bs + (in_cols[nz] % bs) * bs + - (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]); + block_values[i * bs * bs + (in_cols[nz] % bs) * bs + + (in_rows[nz] % bs)] = + fake_complex_unpack(in_vals[nz]); } }); } diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 37b136e18d6..0e021bce36b 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -96,6 +96,7 @@ constexpr int default_block_size{512}; namespace { + template void dense_transpose(std::shared_ptr exec, const size_type nrows, const size_type ncols, @@ -120,6 +121,7 @@ void dense_transpose(std::shared_ptr exec, } } + } // namespace From b57106a0615c104ea0e543cbd29c3817f7c6f08b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 3 Mar 2022 16:46:46 +0100 Subject: [PATCH 07/17] replace Array<...>::view by make_array_view --- core/base/composition.cpp | 7 ++- core/factorization/par_ic.cpp | 6 +-- core/factorization/par_ict.cpp | 4 +- core/factorization/par_ilut.cpp | 8 +-- core/matrix/dense.cpp | 4 +- core/preconditioner/jacobi.cpp | 6 +-- core/test/base/array.cpp | 22 ++++---- core/test/matrix/coo.cpp | 6 +-- core/test/matrix/csr.cpp | 6 +-- core/test/matrix/dense.cpp | 2 +- core/test/matrix/diagonal.cpp | 2 +- core/test/matrix/ell.cpp | 4 +- core/test/matrix/fbcsr.cpp | 6 +-- core/test/matrix/permutation.cpp | 8 +-- core/test/matrix/row_gatherer.cpp | 4 +- core/test/matrix/sparsity_csr.cpp | 4 +- core/test/utils/assertions_test.cpp | 2 +- .../par_ilut_approx_filter_kernel.cu | 4 +- cuda/factorization/par_ilut_filter_kernel.cu | 4 +- cuda/factorization/par_ilut_select_kernel.cu | 2 +- cuda/matrix/csr_kernels.cu | 2 +- .../par_ilut_approx_filter_kernel.hip.cpp | 4 +- .../par_ilut_filter_kernel.hip.cpp | 4 +- .../par_ilut_select_kernel.hip.cpp | 2 +- hip/matrix/csr_kernels.hip.cpp | 2 +- include/ginkgo/core/matrix/dense.hpp | 16 +++--- omp/factorization/par_ilut_kernels.cpp | 4 +- reference/factorization/par_ilut_kernels.cpp | 4 +- .../precision_conversion_kernels.cpp | 2 +- reference/test/matrix/permutation.cpp | 54 +++++++------------ 30 files changed, 91 insertions(+), 114 deletions(-) diff --git a/core/base/composition.cpp b/core/base/composition.cpp index 21140fe37e9..8f927fc8baf 100644 --- a/core/base/composition.cpp +++ b/core/base/composition.cpp @@ -82,8 +82,8 @@ std::unique_ptr apply_inner_operators( auto op_size = operators.back()->get_size(); auto out_dim = gko::dim<2>{op_size[0], num_rhs}; auto out_size = out_dim[0] * num_rhs; - auto out = Dense::create( - exec, out_dim, Array::view(exec, out_size, data), num_rhs); + auto out = Dense::create(exec, out_dim, + make_array_view(exec, out_size, data), num_rhs); // for operators with initial guess: set initial guess if (operators.back()->apply_uses_initial_guess()) { if (op_size[0] == op_size[1]) { @@ -111,8 +111,7 @@ std::unique_ptr apply_inner_operators( data + (reversed_storage ? 
storage_size - out_size : size_type{}); reversed_storage = !reversed_storage; out = Dense::create(exec, out_dim, - Array::view(exec, out_size, out_data), - num_rhs); + make_array_view(exec, out_size, out_data), num_rhs); // for operators with initial guess: set initial guess if (operators[i]->apply_uses_initial_guess()) { if (op_size[0] == op_size[1]) { diff --git a/core/factorization/par_ic.cpp b/core/factorization/par_ic.cpp index e54538ad9e3..a2d10271aff 100644 --- a/core/factorization/par_ic.cpp +++ b/core/factorization/par_ic.cpp @@ -122,12 +122,10 @@ std::unique_ptr> ParIc::generate( // build COO representation of lower factor Array l_row_idxs{exec, l_nnz}; // copy values from l_factor, which are the lower triangular values of A - auto l_vals_view = - Array::view(exec, l_nnz, l_factor->get_values()); + auto l_vals_view = make_array_view(exec, l_nnz, l_factor->get_values()); auto a_vals = Array{exec, l_vals_view}; auto a_row_idxs = Array{exec, l_nnz}; - auto a_col_idxs = - Array::view(exec, l_nnz, l_factor->get_col_idxs()); + auto a_col_idxs = make_array_view(exec, l_nnz, l_factor->get_col_idxs()); auto a_lower_coo = CooMatrix::create(exec, matrix_size, std::move(a_vals), std::move(a_col_idxs), std::move(a_row_idxs)); diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp index cfbdfeee88a..742c8b9811f 100644 --- a/core/factorization/par_ict.cpp +++ b/core/factorization/par_ict.cpp @@ -243,9 +243,9 @@ void ParIctState::iterate() l_builder.get_row_idx_array().resize_and_reset(l_nnz); // update arrays that will be aliased l_builder.get_col_idx_array() = - Array::view(exec, l_nnz, l_new->get_col_idxs()); + make_array_view(exec, l_nnz, l_new->get_col_idxs()); l_builder.get_value_array() = - Array::view(exec, l_nnz, l_new->get_values()); + make_array_view(exec, l_nnz, l_new->get_values()); } // convert L into COO format diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp index e54eac93af5..8f2c0edbfa1 100644 --- a/core/factorization/par_ilut.cpp +++ b/core/factorization/par_ilut.cpp @@ -277,13 +277,13 @@ void ParIlutState::iterate() u_csc_builder.get_value_array().resize_and_reset(u_nnz); // update arrays that will be aliased l_builder.get_col_idx_array() = - Array::view(exec, l_nnz, l_new->get_col_idxs()); + make_array_view(exec, l_nnz, l_new->get_col_idxs()); u_builder.get_col_idx_array() = - Array::view(exec, u_nnz, u_new->get_col_idxs()); + make_array_view(exec, u_nnz, u_new->get_col_idxs()); l_builder.get_value_array() = - Array::view(exec, l_nnz, l_new->get_values()); + make_array_view(exec, l_nnz, l_new->get_values()); u_builder.get_value_array() = - Array::view(exec, u_nnz, u_new->get_values()); + make_array_view(exec, u_nnz, u_new->get_values()); } // convert U' into CSC format diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 817233997a3..06fef2b96c4 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -471,8 +471,8 @@ void Dense::convert_to(Dense* result) const // matrix on the array to avoid special-casing cross-executor copies auto tmp_result = Dense{exec, result->get_size(), - Array::view(exec, result_array->get_num_elems(), - result_array->get_data()), + make_array_view(exec, result_array->get_num_elems(), + result_array->get_data()), result->get_stride()}; exec->run(dense::make_copy(this, &tmp_result)); } else { diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index e9f9352bf7b..22c7b61012e 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp 
@@ -273,9 +273,9 @@ void Jacobi::generate(const LinOp* system_matrix, if (!diag_vt) { GKO_NOT_SUPPORTED(system_matrix); } - auto temp = Array::view(diag_vt->get_executor(), - diag_vt->get_size()[0], - diag_vt->get_values()); + auto temp = + make_array_view(diag_vt->get_executor(), diag_vt->get_size()[0], + diag_vt->get_values()); this->blocks_ = Array(exec, temp.get_num_elems()); exec->run(jacobi::make_invert_diagonal(temp, this->blocks_)); this->num_blocks_ = diag_vt->get_size()[0]; diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp index d0acb5c68cf..d48bd0bea7e 100644 --- a/core/test/base/array.cpp +++ b/core/test/base/array.cpp @@ -356,7 +356,7 @@ TYPED_TEST(Array, CanBeResized) TYPED_TEST(Array, ViewCannotBeResized) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); EXPECT_THROW(view.resize_and_reset(1), gko::NotSupported); EXPECT_EQ(view.get_num_elems(), 3); @@ -416,7 +416,7 @@ TYPED_TEST(Array, ChangesExecutors) TYPED_TEST(Array, ViewModifiesOriginalData) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); TypeParam new_data[] = {5, 4, 2}; std::copy(new_data, new_data + 3, view.get_data()); @@ -448,11 +448,11 @@ TYPED_TEST(Array, CopyArrayToArray) TYPED_TEST(Array, CopyViewToView) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); TypeParam data2[] = {5, 4, 2}; - auto view2 = gko::Array::view(this->exec, 3, data2); + auto view2 = gko::make_array_view(this->exec, 3, data2); TypeParam data_size4[] = {5, 4, 2, 1}; - auto view_size4 = gko::Array::view(this->exec, 4, data_size4); + auto view_size4 = gko::make_array_view(this->exec, 4, data_size4); view = view2; view2.get_data()[0] = 2; @@ -470,7 +470,7 @@ TYPED_TEST(Array, CopyViewToView) TYPED_TEST(Array, CopyViewToArray) { TypeParam data[] = {1, 2, 3, 4}; - auto view = gko::Array::view(this->exec, 4, data); + auto view = gko::make_array_view(this->exec, 4, data); gko::Array array(this->exec, {5, 4, 2}); array = view; @@ -488,7 +488,7 @@ TYPED_TEST(Array, CopyViewToArray) TYPED_TEST(Array, CopyArrayToView) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); gko::Array array_size2(this->exec, {5, 4}); gko::Array array_size4(this->exec, {5, 4, 2, 1}); @@ -525,9 +525,9 @@ TYPED_TEST(Array, MoveArrayToArray) TYPED_TEST(Array, MoveViewToView) { TypeParam data[] = {1, 2, 3, 4}; - auto view = gko::Array::view(this->exec, 4, data); + auto view = gko::make_array_view(this->exec, 4, data); TypeParam data2[] = {5, 4, 2}; - auto view2 = gko::Array::view(this->exec, 3, data2); + auto view2 = gko::make_array_view(this->exec, 3, data2); view = std::move(view2); @@ -550,7 +550,7 @@ TYPED_TEST(Array, MoveViewToArray) { TypeParam data[] = {1, 2, 3, 4}; gko::Array array(this->exec, {5, 4, 2}); - auto view = gko::Array::view(this->exec, 4, data); + auto view = gko::make_array_view(this->exec, 4, data); array = std::move(view); @@ -572,7 +572,7 @@ TYPED_TEST(Array, MoveViewToArray) TYPED_TEST(Array, MoveArrayToView) { TypeParam data[] = {1, 2, 3}; - auto view = gko::Array::view(this->exec, 3, data); + auto view = gko::make_array_view(this->exec, 3, data); gko::Array array_size2(this->exec, {5, 4}); gko::Array array_size4(this->exec, {5, 4, 2, 1}); auto size2_ptr = array_size2.get_data(); 
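For context on the API change in this patch: gko::make_array_view is a free function that deduces the value type from the data pointer, while gko::Array<T>::view requires spelling out T explicitly; both produce a non-owning view of existing memory. A minimal sketch of the two spellings — the executor, buffer, and size below are illustrative only and not taken from the patch:

    double data[] = {1.0, 2.0, 3.0};
    // old spelling: the value type is named explicitly
    auto view_old = gko::Array<double>::view(exec, 3, data);
    // new spelling used throughout this patch: the value type is deduced
    auto view_new = gko::make_array_view(exec, 3, data);
    // both views alias `data`, so writes through either are visible in the other
    view_new.get_data()[0] = 5.0;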
diff --git a/core/test/matrix/coo.cpp b/core/test/matrix/coo.cpp index 18438a6dd68..4cc17b145b9 100644 --- a/core/test/matrix/coo.cpp +++ b/core/test/matrix/coo.cpp @@ -142,9 +142,9 @@ TYPED_TEST(Coo, CanBeCreatedFromExistingData) auto mtx = gko::matrix::Coo::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 4, values), - gko::Array::view(this->exec, 4, col_idxs), - gko::Array::view(this->exec, 4, row_idxs)); + gko::make_array_view(this->exec, 4, values), + gko::make_array_view(this->exec, 4, col_idxs), + gko::make_array_view(this->exec, 4, row_idxs)); ASSERT_EQ(mtx->get_const_values(), values); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); diff --git a/core/test/matrix/csr.cpp b/core/test/matrix/csr.cpp index 166b1adb86b..85163b95e01 100644 --- a/core/test/matrix/csr.cpp +++ b/core/test/matrix/csr.cpp @@ -147,9 +147,9 @@ TYPED_TEST(Csr, CanBeCreatedFromExistingData) auto mtx = gko::matrix::Csr::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 4, values), - gko::Array::view(this->exec, 4, col_idxs), - gko::Array::view(this->exec, 4, row_ptrs), + gko::make_array_view(this->exec, 4, values), + gko::make_array_view(this->exec, 4, col_idxs), + gko::make_array_view(this->exec, 4, row_ptrs), std::make_shared(2)); ASSERT_EQ(mtx->get_num_srow_elements(), 1); diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp index 2c670fe8862..44b235f804d 100644 --- a/core/test/matrix/dense.cpp +++ b/core/test/matrix/dense.cpp @@ -131,7 +131,7 @@ TYPED_TEST(Dense, CanBeConstructedFromExistingData) auto m = gko::matrix::Dense::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 9, data), 3); + gko::make_array_view(this->exec, 9, data), 3); ASSERT_EQ(m->get_const_values(), data); ASSERT_EQ(m->at(2, 1), value_type{6.0}); diff --git a/core/test/matrix/diagonal.cpp b/core/test/matrix/diagonal.cpp index a9fca07aa37..bf8bcc221bf 100644 --- a/core/test/matrix/diagonal.cpp +++ b/core/test/matrix/diagonal.cpp @@ -108,7 +108,7 @@ TYPED_TEST(Diagonal, CanBeCreatedFromExistingData) value_type values[] = {1.0, 2.0, 3.0}; auto diag = gko::matrix::Diagonal::create( - this->exec, 3, gko::Array::view(this->exec, 3, values)); + this->exec, 3, gko::make_array_view(this->exec, 3, values)); ASSERT_EQ(diag->get_const_values(), values); } diff --git a/core/test/matrix/ell.cpp b/core/test/matrix/ell.cpp index e7c9fd622e0..d792e00d302 100644 --- a/core/test/matrix/ell.cpp +++ b/core/test/matrix/ell.cpp @@ -146,8 +146,8 @@ TYPED_TEST(Ell, CanBeCreatedFromExistingData) auto mtx = gko::matrix::Ell::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 8, values), - gko::Array::view(this->exec, 8, col_idxs), 2, 4); + gko::make_array_view(this->exec, 8, values), + gko::make_array_view(this->exec, 8, col_idxs), 2, 4); ASSERT_EQ(mtx->get_const_values(), values); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); diff --git a/core/test/matrix/fbcsr.cpp b/core/test/matrix/fbcsr.cpp index eddf6d4b753..4dc15755f48 100644 --- a/core/test/matrix/fbcsr.cpp +++ b/core/test/matrix/fbcsr.cpp @@ -398,9 +398,9 @@ TYPED_TEST(Fbcsr, CanBeCreatedFromExistingData) auto mtx = gko::matrix::Fbcsr::create( this->exec, gko::dim<2>{nbrows * bs, nbcols * bs}, bs, - gko::Array::view(this->exec, bnnz * bs * bs, values), - gko::Array::view(this->exec, bnnz, col_idxs), - gko::Array::view(this->exec, nbrows + 1, row_ptrs)); + gko::make_array_view(this->exec, bnnz * bs * bs, values), + gko::make_array_view(this->exec, bnnz, col_idxs), + gko::make_array_view(this->exec, nbrows + 1, 
row_ptrs)); ASSERT_EQ(mtx->get_const_values(), values); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp index a5d21ba1782..5214f439ac9 100644 --- a/core/test/matrix/permutation.cpp +++ b/core/test/matrix/permutation.cpp @@ -136,7 +136,7 @@ TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingData) auto m = gko::matrix::Permutation::create( this->exec, gko::dim<2>{3, 5}, - gko::Array::view(this->exec, 3, data)); + gko::make_array_view(this->exec, 3, data)); ASSERT_EQ(m->get_const_permutation(), data); } @@ -191,7 +191,7 @@ TYPED_TEST(Permutation, PermutationThrowsforWrongRowPermDimensions) ASSERT_THROW(gko::matrix::Permutation::create( this->exec, gko::dim<2>{4, 2}, - gko::Array::view(this->exec, 3, data)), + gko::make_array_view(this->exec, 3, data)), gko::ValueMismatch); } @@ -203,7 +203,7 @@ TYPED_TEST(Permutation, SettingMaskDoesNotModifyData) auto m = gko::matrix::Permutation::create( this->exec, gko::dim<2>{3, 5}, - gko::Array::view(this->exec, 3, data)); + gko::make_array_view(this->exec, 3, data)); auto mask = m->get_permute_mask(); ASSERT_EQ(m->get_const_permutation(), data); @@ -225,7 +225,7 @@ TYPED_TEST(Permutation, PermutationThrowsforWrongColPermDimensions) ASSERT_THROW(gko::matrix::Permutation::create( this->exec, gko::dim<2>{3, 4}, - gko::Array::view(this->exec, 3, data), + gko::make_array_view(this->exec, 3, data), gko::matrix::column_permute), gko::ValueMismatch); } diff --git a/core/test/matrix/row_gatherer.cpp b/core/test/matrix/row_gatherer.cpp index 51096a94893..68c3142ca0c 100644 --- a/core/test/matrix/row_gatherer.cpp +++ b/core/test/matrix/row_gatherer.cpp @@ -127,7 +127,7 @@ TYPED_TEST(RowGatherer, RowGathererCanBeConstructedFromExistingData) auto m = gko::matrix::RowGatherer::create( this->exec, gko::dim<2>{3, 5}, - gko::Array::view(this->exec, 3, data)); + gko::make_array_view(this->exec, 3, data)); ASSERT_EQ(m->get_const_row_idxs(), data); } @@ -140,7 +140,7 @@ TYPED_TEST(RowGatherer, RowGathererThrowsforWrongRowPermDimensions) ASSERT_THROW(gko::matrix::RowGatherer::create( this->exec, gko::dim<2>{4, 2}, - gko::Array::view(this->exec, 3, data)), + gko::make_array_view(this->exec, 3, data)), gko::ValueMismatch); } diff --git a/core/test/matrix/sparsity_csr.cpp b/core/test/matrix/sparsity_csr.cpp index ec59661dfe0..d14a8897d19 100644 --- a/core/test/matrix/sparsity_csr.cpp +++ b/core/test/matrix/sparsity_csr.cpp @@ -154,8 +154,8 @@ TYPED_TEST(SparsityCsr, CanBeCreatedFromExistingData) auto mtx = gko::matrix::SparsityCsr::create( this->exec, gko::dim<2>{3, 2}, - gko::Array::view(this->exec, 4, col_idxs), - gko::Array::view(this->exec, 4, row_ptrs), 2.0); + gko::make_array_view(this->exec, 4, col_idxs), + gko::make_array_view(this->exec, 4, row_ptrs), 2.0); ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs); ASSERT_EQ(mtx->get_const_row_ptrs(), row_ptrs); diff --git a/core/test/utils/assertions_test.cpp b/core/test/utils/assertions_test.cpp index 7c8609573f0..5f8d454fac7 100644 --- a/core/test/utils/assertions_test.cpp +++ b/core/test/utils/assertions_test.cpp @@ -54,7 +54,7 @@ class MatricesNear : public ::testing::Test { template gko::Array make_view(std::array& array) { - return gko::Array::view(exec, size, array.data()); + return gko::make_array_view(exec, size, array.data()); } MatricesNear() diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernel.cu index be86588a909..e79042b8eec 100644 --- 
a/cuda/factorization/par_ilut_approx_filter_kernel.cu +++ b/cuda/factorization/par_ilut_approx_filter_kernel.cu @@ -161,9 +161,9 @@ void threshold_filter_approx(syn::value_list, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } if (num_blocks > 0) { diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernel.cu index 6dd83c41835..0451582187f 100644 --- a/cuda/factorization/par_ilut_filter_kernel.cu +++ b/cuda/factorization/par_ilut_filter_kernel.cu @@ -117,9 +117,9 @@ void threshold_filter(syn::value_list, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } if (num_blocks > 0) { diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernel.cu index 6655db78296..92bd9e28ab8 100644 --- a/cuda/factorization/par_ilut_select_kernel.cu +++ b/cuda/factorization/par_ilut_select_kernel.cu @@ -159,7 +159,7 @@ void threshold_select(std::shared_ptr exec, if (step > 5) { Array cpu_out_array{ exec->get_master(), - Array::view(exec, bucket.size, tmp_out)}; + make_array_view(exec, bucket.size, tmp_out)}; auto begin = cpu_out_array.get_data(); auto end = begin + bucket.size; auto middle = begin + rank; diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 8afefc44972..2b83c4fdc22 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -1246,7 +1246,7 @@ void is_sorted_by_column_index( const matrix::Csr* to_check, bool* is_sorted) { *is_sorted = true; - auto cpu_array = Array::view(exec->get_master(), 1, is_sorted); + auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); auto gpu_array = Array{exec, cpu_array}; auto block_size = default_block_size; auto num_rows = static_cast(to_check->get_size()[0]); diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp index d484af93d2a..f86bed4fa76 100644 --- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp @@ -165,9 +165,9 @@ void threshold_filter_approx(syn::value_list, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } if (num_blocks > 0) { diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernel.hip.cpp index cf691f58f8b..7b335c08175 100644 --- a/hip/factorization/par_ilut_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp @@ -120,9 +120,9 @@ void threshold_filter(syn::value_list, matrix::CooBuilder coo_builder{m_out_coo}; 
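        // Annotation (not part of the original diff): the builder swaps the
        // COO output's column-index and value arrays for non-owning views of
        // the CSR output's arrays, so both formats share storage; only the
        // row-index array below gets its own allocation, which the filter
        // kernel then fills.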
coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } if (num_blocks > 0) { diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernel.hip.cpp index 8130a76cb18..7e38a6c67a0 100644 --- a/hip/factorization/par_ilut_select_kernel.hip.cpp +++ b/hip/factorization/par_ilut_select_kernel.hip.cpp @@ -163,7 +163,7 @@ void threshold_select(std::shared_ptr exec, if (step > 5) { Array cpu_out_array{ exec->get_master(), - Array::view(exec, bucket.size, tmp_out)}; + make_array_view(exec, bucket.size, tmp_out)}; auto begin = cpu_out_array.get_data(); auto end = begin + bucket.size; auto middle = begin + rank; diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index 847eee34c3d..1b2c8ae6a51 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -1022,7 +1022,7 @@ void is_sorted_by_column_index( const matrix::Csr* to_check, bool* is_sorted) { *is_sorted = true; - auto cpu_array = Array::view(exec->get_master(), 1, is_sorted); + auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); auto gpu_array = Array{exec, cpu_array}; auto block_size = default_block_size; auto num_rows = static_cast(to_check->get_size()[0]); diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index bcaba840fdf..d5b4811032d 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -866,10 +866,9 @@ class Dense return Dense>::create( this->get_executor(), dim<2>{num_rows, num_cols}, - Array>::view( - this->get_executor(), num_rows * stride, - reinterpret_cast*>( - this->get_values())), + make_array_view(this->get_executor(), num_rows * stride, + reinterpret_cast*>( + this->get_values())), stride); } @@ -888,7 +887,7 @@ class Dense return Dense>::create( this->get_executor(), dim<2>{num_rows, num_cols}, - Array>::view( + make_array_view( this->get_executor(), num_rows * stride, const_cast*>( reinterpret_cast*>( @@ -1114,10 +1113,9 @@ class Dense return Dense::create( this->get_executor(), dim<2>{range_result.length(0), range_result.length(1)}, - Array::view( - this->get_executor(), - range_result.length(0) * range_this.length(1) - columns.begin, - range_result->data), + make_array_view(this->get_executor(), + range_result.length(0) * this->get_stride(), + range_result->data), stride); } diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp index 8f0b597686d..90183ec37df 100644 --- a/omp/factorization/par_ilut_kernels.cpp +++ b/omp/factorization/par_ilut_kernels.cpp @@ -136,9 +136,9 @@ void abstract_filter(std::shared_ptr exec, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp index 56cebc91366..691de096660 100644 --- a/reference/factorization/par_ilut_kernels.cpp +++ 
b/reference/factorization/par_ilut_kernels.cpp @@ -137,9 +137,9 @@ void abstract_filter(std::shared_ptr exec, matrix::CooBuilder coo_builder{m_out_coo}; coo_builder.get_row_idx_array().resize_and_reset(new_nnz); coo_builder.get_col_idx_array() = - Array::view(exec, new_nnz, new_col_idxs); + make_array_view(exec, new_nnz, new_col_idxs); coo_builder.get_value_array() = - Array::view(exec, new_nnz, new_vals); + make_array_view(exec, new_nnz, new_vals); new_row_idxs = m_out_coo->get_row_idxs(); } diff --git a/reference/test/components/precision_conversion_kernels.cpp b/reference/test/components/precision_conversion_kernels.cpp index a82b9a2ae5c..4ff17753098 100644 --- a/reference/test/components/precision_conversion_kernels.cpp +++ b/reference/test/components/precision_conversion_kernels.cpp @@ -128,7 +128,7 @@ TEST_F(PrecisionConversion, ConvertsRealFromView) gko::Array tmp{ref}; gko::Array out{ref}; - tmp = gko::Array::view(ref, vals.get_num_elems(), vals.get_data()); + tmp = gko::make_array_view(ref, vals.get_num_elems(), vals.get_data()); out = tmp; GKO_ASSERT_ARRAY_EQ(vals, out); diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp index bdb0f30059d..40773359229 100644 --- a/reference/test/matrix/permutation.cpp +++ b/reference/test/matrix/permutation.cpp @@ -81,8 +81,7 @@ TYPED_TEST(Permutation, AppliesRowPermutationToDense) i_type rdata[] = {1, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, rdata)); + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata)); perm->apply(x.get(), y.get()); // clang-format off @@ -108,8 +107,7 @@ TYPED_TEST(Permutation, AppliesColPermutationToDense) i_type rdata[] = {1, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, rdata), + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata), gko::matrix::column_permute); perm->apply(x.get(), y.get()); @@ -138,11 +136,9 @@ TYPED_TEST(Permutation, AppliesRowAndColPermutationToDense) i_type rdata[] = {1, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, rdata)); + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata)); auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, cdata), + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, cdata), gko::matrix::column_permute); rperm->apply(x.get(), y1.get()); @@ -170,8 +166,7 @@ TYPED_TEST(Permutation, AppliesRowAndColPermutationToDenseWithOneArray) i_type data[] = {1, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{2}, - gko::Array::view(this->exec, 2, data), + this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, data), gko::matrix::row_permute | gko::matrix::column_permute); perm->apply(x.get(), y1.get()); @@ -200,12 +195,10 @@ TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDense) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), gko::matrix::row_permute | gko::matrix::inverse_permute); auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::inverse_permute | 
gko::matrix::column_permute); rperm->apply(x.get(), y1.get()); @@ -234,8 +227,7 @@ TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDenseWithOneArray) i_type data[] = {1, 2, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, data), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, data), gko::matrix::column_permute | gko::matrix::row_permute | gko::matrix::inverse_permute); @@ -264,8 +256,7 @@ TYPED_TEST(Permutation, AppliesInverseRowPermutationToDense) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), gko::matrix::row_permute | gko::matrix::inverse_permute); rperm->apply(x.get(), y.get()); @@ -293,8 +284,7 @@ TYPED_TEST(Permutation, AppliesInverseColPermutationToDense) i_type cdata[] = {1, 2, 0}; auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::inverse_permute | gko::matrix::column_permute); cperm->apply(x.get(), y.get()); @@ -323,8 +313,7 @@ TYPED_TEST(Permutation, AppliesRowPermutationToCsr) i_type rdata[] = {1, 2, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata)); + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata)); perm->apply(x.get(), y.get()); // clang-format off @@ -352,8 +341,7 @@ TYPED_TEST(Permutation, AppliesColPermutationToCsr) i_type cdata[] = {1, 2, 0}; auto perm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::column_permute); perm->apply(x.get(), y.get()); @@ -384,11 +372,9 @@ TYPED_TEST(Permutation, AppliesRowAndColPermutationToCsr) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata)); + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata)); auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::column_permute); rperm->apply(x.get(), y1.get()); @@ -417,8 +403,7 @@ TYPED_TEST(Permutation, AppliesInverseRowPermutationToCsr) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), gko::matrix::row_permute | gko::matrix::inverse_permute); rperm->apply(x.get(), y.get()); @@ -446,8 +431,7 @@ TYPED_TEST(Permutation, AppliesInverseColPermutationToCsr) i_type cdata[] = {1, 2, 0}; auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::inverse_permute | gko::matrix::column_permute); cperm->apply(x.get(), y.get()); @@ -477,12 +461,10 @@ TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToCsr) i_type rdata[] = {1, 2, 0}; auto rperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, rdata), + this->exec, 
gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata), gko::matrix::row_permute | gko::matrix::inverse_permute); auto cperm = gko::matrix::Permutation::create( - this->exec, gko::dim<2>{3}, - gko::Array::view(this->exec, 3, cdata), + this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata), gko::matrix::inverse_permute | gko::matrix::column_permute); rperm->apply(x.get(), y1.get()); From f7e327dd63ff68da55349387c6edffea681f0f9f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 3 Mar 2022 16:48:45 +0100 Subject: [PATCH 08/17] add stride to convert_to(Dense) tests This also changes the Dense convert_to(Dense) behavior so that it preserves strides only if no reallocation is necessary, or on moves --- core/matrix/dense.cpp | 93 ++++++++++++++++++++++------ core/test/matrix/dense.cpp | 11 ++-- include/ginkgo/core/matrix/dense.hpp | 56 ++--------------- test/matrix/matrix.cpp | 49 +++++++++++---- 4 files changed, 124 insertions(+), 85 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 06fef2b96c4..d7e4a0f428d 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -462,31 +462,32 @@ void Dense::compute_norm1_impl(LinOp* result) const template void Dense::convert_to(Dense* result) const { - if (this->get_size() && result->get_size() == this->get_size()) { - // we need to create a executor-local clone of the target data, that - // will be copied back later. - auto exec = this->get_executor(); - auto result_array = make_temporary_output_clone(exec, &result->values_); - // create a (value, not pointer to avoid allocation overhead) view - // matrix on the array to avoid special-casing cross-executor copies - auto tmp_result = - Dense{exec, result->get_size(), - make_array_view(exec, result_array->get_num_elems(), - result_array->get_data()), - result->get_stride()}; - exec->run(dense::make_copy(this, &tmp_result)); - } else { - result->values_ = this->values_; - result->stride_ = this->stride_; - result->set_size(this->get_size()); + if (result->get_size() != this->get_size()) { + result->resize(this->get_size()); } + // we need to create a clone of the local data, so the target executor can + // copy it. + auto result_exec = result->get_executor(); + auto result_input = make_temporary_clone(result_exec, &values_); + // create a (value, not pointer to avoid allocation overhead) view + // matrix on the array to avoid special-casing cross-executor copies + auto tmp_this = Dense{ + result_exec, this->get_size(), + gko::detail::array_const_cast(gko::detail::ConstArrayView( + result_exec, result_input->get_num_elems(), + result_input->get_const_data())), + this->get_stride()}; + result_exec->run(dense::make_copy(&tmp_this, result)); } template void Dense::move_to(Dense* result) { - this->convert_to(result); + result->values_ = std::move(this->values_); + result->stride_ = std::exchange(this->stride_, 0); + result->set_size(this->get_size()); + this->resize(gko::dim<2>{}); } @@ -1585,6 +1586,62 @@ void Dense::add_scaled_identity_impl(const LinOp* const a, } +template +std::unique_ptr::real_type> +Dense::create_real_view() +{ + const auto num_rows = this->get_size()[0]; + const bool complex = is_complex(); + const auto num_cols = + complex ? 2 * this->get_size()[1] : this->get_size()[1]; + const auto stride = complex ? 
2 * this->get_stride() : this->get_stride(); + + return Dense>::create( + this->get_executor(), dim<2>{num_rows, num_cols}, + make_array_view( + this->get_executor(), num_rows * stride, + reinterpret_cast*>(this->get_values())), + stride); +} + + +template +std::unique_ptr::real_type> +Dense::create_real_view() const +{ + const auto num_rows = this->get_size()[0]; + const bool complex = is_complex(); + const auto num_cols = + complex ? 2 * this->get_size()[1] : this->get_size()[1]; + const auto stride = complex ? 2 * this->get_stride() : this->get_stride(); + + return Dense>::create( + this->get_executor(), dim<2>{num_rows, num_cols}, + make_array_view(this->get_executor(), num_rows * stride, + const_cast*>( + reinterpret_cast*>( + this->get_const_values()))), + stride); +} + + +template +std::unique_ptr> Dense::create_submatrix_impl( + const span& rows, const span& columns, const size_type stride) +{ + row_major_range range_this{this->get_values(), this->get_size()[0], + this->get_size()[1], this->get_stride()}; + auto range_result = range_this(rows, columns); + return Dense::create( + this->get_executor(), + dim<2>{range_result.length(0), range_result.length(1)}, + make_array_view(this->get_executor(), + range_result.length(0) * this->get_stride(), + range_result->data), + stride); +} + + #define GKO_DECLARE_DENSE_MATRIX(_type) class Dense<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_MATRIX); diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp index 44b235f804d..cbb94a5c605 100644 --- a/core/test/matrix/dense.cpp +++ b/core/test/matrix/dense.cpp @@ -60,8 +60,7 @@ class Dense : public ::testing::Test { static void assert_equal_to_original_mtx(gko::matrix::Dense* m) { ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3)); - ASSERT_EQ(m->get_stride(), 4); - ASSERT_EQ(m->get_num_stored_elements(), 2 * 4); + ASSERT_EQ(m->get_num_stored_elements(), 2 * m->get_stride()); EXPECT_EQ(m->at(0, 0), value_type{1.0}); EXPECT_EQ(m->at(0, 1), value_type{2.0}); EXPECT_EQ(m->at(0, 2), value_type{3.0}); @@ -172,6 +171,7 @@ TYPED_TEST(Dense, CreateWithSameConfigKeepsStride) TYPED_TEST(Dense, KnowsItsSizeAndValues) { this->assert_equal_to_original_mtx(this->mtx.get()); + ASSERT_EQ(this->mtx->get_stride(), 4); } @@ -241,6 +241,8 @@ TYPED_TEST(Dense, CanBeCopied) this->assert_equal_to_original_mtx(this->mtx.get()); this->mtx->at(0) = 7; this->assert_equal_to_original_mtx(mtx_copy.get()); + ASSERT_EQ(this->mtx->get_stride(), 4); + ASSERT_EQ(mtx_copy->get_stride(), 3); } @@ -249,14 +251,15 @@ TYPED_TEST(Dense, CanBeMoved) auto mtx_copy = gko::matrix::Dense::create(this->exec); mtx_copy->copy_from(std::move(this->mtx)); this->assert_equal_to_original_mtx(mtx_copy.get()); + ASSERT_EQ(mtx_copy->get_stride(), 4); } TYPED_TEST(Dense, CanBeCloned) { auto mtx_clone = this->mtx->clone(); - this->assert_equal_to_original_mtx( - dynamic_castmtx.get())>(mtx_clone.get())); + this->assert_equal_to_original_mtx(mtx_clone.get()); + ASSERT_EQ(mtx_clone->get_stride(), 3); } diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index d5b4811032d..20479295ad7 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -855,45 +855,12 @@ class Dense * real with a reinterpret_cast with twice the number of columns and * double the stride. */ - std::unique_ptr>> create_real_view() - { - const auto num_rows = this->get_size()[0]; - const bool complex = is_complex(); - const auto num_cols = - complex ? 
2 * this->get_size()[1] : this->get_size()[1]; - const auto stride = - complex ? 2 * this->get_stride() : this->get_stride(); - - return Dense>::create( - this->get_executor(), dim<2>{num_rows, num_cols}, - make_array_view(this->get_executor(), num_rows * stride, - reinterpret_cast*>( - this->get_values())), - stride); - } + std::unique_ptr create_real_view(); /** * @copydoc create_real_view() */ - std::unique_ptr>> create_real_view() - const - { - const auto num_rows = this->get_size()[0]; - const bool complex = is_complex(); - const auto num_cols = - complex ? 2 * this->get_size()[1] : this->get_size()[1]; - const auto stride = - complex ? 2 * this->get_stride() : this->get_stride(); - - return Dense>::create( - this->get_executor(), dim<2>{num_rows, num_cols}, - make_array_view( - this->get_executor(), num_rows * stride, - const_cast*>( - reinterpret_cast*>( - this->get_const_values()))), - stride); - } + std::unique_ptr create_real_view() const; /** * Creates a constant (immutable) Dense matrix from a constant array. @@ -1101,23 +1068,8 @@ class Dense * instead of create_submatrix(const span, const span, const * size_type). */ - virtual std::unique_ptr create_submatrix_impl(const span& rows, - const span& columns, - const size_type stride) - { - row_major_range range_this{this->get_values(), this->get_size()[0], - this->get_size()[1], this->get_stride()}; - auto range_result = range_this(rows, columns); - // TODO: can result in HUGE padding - which will be copied with the - // vector - return Dense::create( - this->get_executor(), - dim<2>{range_result.length(0), range_result.length(1)}, - make_array_view(this->get_executor(), - range_result.length(0) * this->get_stride(), - range_result->data), - stride); - } + virtual std::unique_ptr create_submatrix_impl( + const span& rows, const span& columns, const size_type stride); void apply_impl(const LinOp* b, LinOp* x) const override; diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 29e6479a282..475f8e84fb8 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -81,6 +81,8 @@ struct SimpleMatrixTest { {} static void check_property(const std::unique_ptr&) {} + + static bool supports_strides() { return true; } }; struct DenseWithDefaultStride @@ -222,6 +224,11 @@ struct Fbcsr : SimpleMatrixTest> { data.size[0] = gko::ceildiv(data.size[0], block_size) * block_size; data.size[1] = gko::ceildiv(data.size[1], block_size) * block_size; } + +#ifdef GKO_COMPILING_HIP + // Fbcsr support in rocSPARSE is buggy w.r.t. 
strides + static bool supports_strides() { return false; } +#endif }; @@ -588,7 +595,7 @@ class Matrix : public ::testing::Test { guarded_fn(gen_in_vec(mtx, 1, 1), gen_out_vec(mtx, 1, 1)); } - { + if (Config::supports_strides()) { SCOPED_TRACE("Single strided vector"); guarded_fn(gen_in_vec(mtx, 1, 2), gen_out_vec(mtx, 1, 3)); @@ -597,12 +604,12 @@ class Matrix : public ::testing::Test { // check application of real matrix to complex vector // viewed as interleaved real/imag vector using complex_vec = gko::to_complex; - { + if (Config::supports_strides()) { SCOPED_TRACE("Single strided complex vector"); guarded_fn(gen_in_vec(mtx, 1, 2), gen_out_vec(mtx, 1, 3)); } - { + if (Config::supports_strides()) { SCOPED_TRACE("Strided complex multivector with 2 columns"); guarded_fn(gen_in_vec(mtx, 2, 3), gen_out_vec(mtx, 2, 4)); @@ -613,7 +620,7 @@ class Matrix : public ::testing::Test { guarded_fn(gen_in_vec(mtx, 2, 2), gen_out_vec(mtx, 2, 2)); } - { + if (Config::supports_strides()) { SCOPED_TRACE("Strided multivector with 2 columns"); guarded_fn(gen_in_vec(mtx, 2, 3), gen_out_vec(mtx, 2, 4)); @@ -623,7 +630,7 @@ class Matrix : public ::testing::Test { guarded_fn(gen_in_vec(mtx, 40, 40), gen_out_vec(mtx, 40, 40)); } - { + if (Config::supports_strides()) { SCOPED_TRACE("Strided multivector with 40 columns"); guarded_fn(gen_in_vec(mtx, 40, 43), gen_out_vec(mtx, 40, 45)); @@ -649,8 +656,8 @@ using MatrixTypes = ::testing::Types< // CUDA doesn't allow blocksize 1 Fbcsr<1>, #endif -#if defined(GKO_COMPILING_OMP) || defined(GKO_COMPILING_CUDA) - // Fbcsr is broken on ROCm +#if defined(GKO_COMPILING_OMP) || defined(GKO_COMPILING_CUDA) || \ + defined(GKO_COMPILING_HIP) Fbcsr<2>, Fbcsr<3>, #endif SellpDefaultParameters, Sellp32Factor2, HybridDefaultStrategy, @@ -775,13 +782,30 @@ TYPED_TEST(Matrix, ConvertToDenseIsEquivalentToRef) using Mtx = typename TestFixture::Mtx; using Dense = gko::matrix::Dense; this->forall_matrix_scenarios([&](auto mtx) { - auto ref_result = Dense::create(this->ref); - auto dev_result = Dense::create(this->exec); + const auto size = mtx.ref->get_size(); + const auto stride = size[1] + 5; + const auto padded_size = gko::dim<2>{size[0], stride}; + auto ref_padded = Dense::create(this->ref, padded_size); + auto dev_padded = Dense::create(this->exec, padded_size); + ref_padded->fill(12345); + dev_padded->fill(12345); + const auto rows = gko::span{0, size[0]}; + const auto cols = gko::span{0, size[1]}; + const auto pad_cols = gko::span{size[1], stride}; + auto ref_result = ref_padded->create_submatrix(rows, cols); + auto dev_result = dev_padded->create_submatrix(rows, cols); + auto ref_padding = ref_padded->create_submatrix(rows, pad_cols); + auto dev_padding = dev_padded->create_submatrix(rows, pad_cols); + auto orig_padding = ref_padding->clone(); mtx.ref->convert_to(ref_result.get()); mtx.dev->convert_to(dev_result.get()); GKO_ASSERT_MTX_NEAR(ref_result, dev_result, 0.0); + ASSERT_EQ(ref_result->get_stride(), stride); + ASSERT_EQ(dev_result->get_stride(), stride); + GKO_ASSERT_MTX_NEAR(ref_padding, orig_padding, 0.0); + GKO_ASSERT_MTX_NEAR(dev_padding, orig_padding, 0.0); }); } @@ -792,10 +816,13 @@ TYPED_TEST(Matrix, ConvertFromDenseIsEquivalentToRef) using Mtx = typename TestFixture::Mtx; using Dense = gko::matrix::Dense; this->forall_matrix_data_scenarios([&](auto data) { - auto ref_src = Dense::create(this->ref); - auto dev_src = Dense::create(this->exec); + const auto stride = data.size[0] + 2; + auto ref_src = Dense::create(this->ref, data.size, stride); + auto dev_src = 
Dense::create(this->exec, data.size, stride); ref_src->read(data); dev_src->read(data); + ASSERT_EQ(ref_src->get_stride(), stride); + ASSERT_EQ(dev_src->get_stride(), stride); auto ref_result = TestConfig::create(this->ref, data.size); auto dev_result = TestConfig::create(this->exec, data.size); From bcd48e5f9e96ff1f0665aa20e71572fa9f6fa936 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 3 Mar 2022 17:15:56 +0100 Subject: [PATCH 09/17] fix submatrix generation for empty and full rows --- core/matrix/dense.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index d7e4a0f428d..89833e5b514 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -1632,12 +1632,14 @@ std::unique_ptr> Dense::create_submatrix_impl( row_major_range range_this{this->get_values(), this->get_size()[0], this->get_size()[1], this->get_stride()}; auto range_result = range_this(rows, columns); + size_type storage_size = + rows.length() > 0 + ? range_result.length(0) * this->get_stride() - columns.begin + : 0; return Dense::create( this->get_executor(), dim<2>{range_result.length(0), range_result.length(1)}, - make_array_view(this->get_executor(), - range_result.length(0) * this->get_stride(), - range_result->data), + make_array_view(this->get_executor(), storage_size, range_result->data), stride); } From 5d798ea125f166f000adab66fbb71ca94e5e344d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 3 Mar 2022 18:45:11 +0100 Subject: [PATCH 10/17] test and fix SpMV interaction with stride/padding --- cuda/matrix/ell_kernels.cu | 4 +- dpcpp/matrix/csr_kernels.dp.cpp | 3 +- dpcpp/matrix/ell_kernels.dp.cpp | 4 +- hip/matrix/csr_kernels.hip.cpp | 3 +- hip/matrix/ell_kernels.hip.cpp | 4 +- test/matrix/matrix.cpp | 69 +++++++++++++++++++++------------ 6 files changed, 50 insertions(+), 37 deletions(-) diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index c906bb7fd1a..58c59af2cc2 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -242,9 +242,7 @@ void spmv(std::shared_ptr exec, */ const int info = (!atomic) * num_thread_per_worker; if (atomic) { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), - zero()); + dense::fill(exec, c, zero()); } select_abstract_spmv( compiled_kernels(), diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index c9989afe471..86e6784203f 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1126,8 +1126,7 @@ void spmv(std::shared_ptr exec, if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do } else if (a->get_strategy()->get_name() == "load_balance") { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), zero()); + dense::fill(exec, c, zero()); const IndexType nwarps = a->get_num_srow_elements(); if (nwarps > 0) { const dim3 csr_block(config::warp_size, warps_in_block, 1); diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index fbd1063bf13..8be02fb7988 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -429,9 +429,7 @@ void spmv(std::shared_ptr exec, */ const int info = (!atomic) * num_thread_per_worker; if (atomic) { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), - zero()); + dense::fill(exec, c, zero()); } select_abstract_spmv( compiled_kernels(), diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp 
index 1b2c8ae6a51..5bac002f63c 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -289,8 +289,7 @@ void spmv(std::shared_ptr exec, if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do } else if (a->get_strategy()->get_name() == "load_balance") { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), zero()); + dense::fill(exec, c, zero()); const IndexType nwarps = a->get_num_srow_elements(); if (nwarps > 0) { const dim3 csr_block(config::warp_size, warps_in_block, 1); diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 2486a666ab0..a56b9b96d13 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -246,9 +246,7 @@ void spmv(std::shared_ptr exec, */ const int info = (!atomic) * num_thread_per_worker; if (atomic) { - components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), - zero()); + dense::fill(exec, c, zero()); } select_abstract_spmv( compiled_kernels(), diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 475f8e84fb8..30563438c65 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -439,12 +439,11 @@ class Matrix : public ::testing::Test { } template - test_pair gen_in_vec(const test_pair& mtx, int nrhs, - int stride) + test_pair gen_in_vec(const test_pair& mtx, int nrhs) { auto size = gko::dim<2>{mtx.ref->get_size()[1], static_cast(nrhs)}; - auto result = VecType::create(ref, size, stride); + auto result = VecType::create(ref, size); result->read(gen_dense_data(size)); return {std::move(result), exec}; @@ -465,12 +464,11 @@ class Matrix : public ::testing::Test { } template - test_pair gen_out_vec(const test_pair& mtx, int nrhs, - int stride) + test_pair gen_out_vec(const test_pair& mtx, int nrhs) { auto size = gko::dim<2>{mtx.ref->get_size()[0], static_cast(nrhs)}; - auto result = VecType::create(ref, size, stride); + auto result = VecType::create(ref, size); result->read(gen_dense_data(size)); return {std::move(result), exec}; @@ -575,6 +573,34 @@ class Matrix : public ::testing::Test { }); } + template + void run_strided(const test_pair& mtx, int rhs, int in_stride, + int out_stride, TestFunction fn) + { + // create slightly bigger vectors + auto in_padded = gen_in_vec(mtx, in_stride); + auto out_padded = gen_out_vec(mtx, out_stride); + const auto in_rows = gko::span(0, mtx.ref->get_size()[1]); + const auto out_rows = gko::span(0, mtx.ref->get_size()[0]); + const auto cols = gko::span(0, rhs); + const auto out_pad_cols = gko::span(rhs, out_stride); + // create views of the padding and in/out vectors + auto out_padding = test_pair{ + out_padded.ref->create_submatrix(out_rows, out_pad_cols), + out_padded.dev->create_submatrix(out_rows, out_pad_cols)}; + auto orig_padding = out_padding.ref->clone(); + auto in = + test_pair{in_padded.ref->create_submatrix(in_rows, cols), + in_padded.dev->create_submatrix(in_rows, cols)}; + auto out = test_pair{ + out_padded.ref->create_submatrix(out_rows, cols), + out_padded.dev->create_submatrix(out_rows, cols)}; + fn(std::move(in), std::move(out)); + // check that padding was unmodified + GKO_ASSERT_MTX_NEAR(out_padding.ref, orig_padding, 0.0); + GKO_ASSERT_MTX_NEAR(out_padding.dev, orig_padding, 0.0); + } + template void forall_vector_scenarios(const test_pair& mtx, TestFunction fn) { @@ -587,18 +613,17 @@ class Matrix : public ::testing::Test { }; { SCOPED_TRACE("Multivector with 0 columns"); - guarded_fn(gen_in_vec(mtx, 0, 0), - gen_out_vec(mtx, 0, 
0)); + guarded_fn(gen_in_vec(mtx, 0), + gen_out_vec(mtx, 0)); } { SCOPED_TRACE("Single vector"); - guarded_fn(gen_in_vec(mtx, 1, 1), - gen_out_vec(mtx, 1, 1)); + guarded_fn(gen_in_vec(mtx, 1), + gen_out_vec(mtx, 1)); } if (Config::supports_strides()) { SCOPED_TRACE("Single strided vector"); - guarded_fn(gen_in_vec(mtx, 1, 2), - gen_out_vec(mtx, 1, 3)); + run_strided(mtx, 1, 2, 3, guarded_fn); } if (!gko::is_complex()) { // check application of real matrix to complex vector @@ -606,34 +631,30 @@ class Matrix : public ::testing::Test { using complex_vec = gko::to_complex; if (Config::supports_strides()) { SCOPED_TRACE("Single strided complex vector"); - guarded_fn(gen_in_vec(mtx, 1, 2), - gen_out_vec(mtx, 1, 3)); + run_strided(mtx, 1, 2, 3, guarded_fn); } if (Config::supports_strides()) { SCOPED_TRACE("Strided complex multivector with 2 columns"); - guarded_fn(gen_in_vec(mtx, 2, 3), - gen_out_vec(mtx, 2, 4)); + run_strided(mtx, 2, 3, 4, guarded_fn); } } { SCOPED_TRACE("Multivector with 2 columns"); - guarded_fn(gen_in_vec(mtx, 2, 2), - gen_out_vec(mtx, 2, 2)); + guarded_fn(gen_in_vec(mtx, 2), + gen_out_vec(mtx, 2)); } if (Config::supports_strides()) { SCOPED_TRACE("Strided multivector with 2 columns"); - guarded_fn(gen_in_vec(mtx, 2, 3), - gen_out_vec(mtx, 2, 4)); + run_strided(mtx, 2, 3, 4, guarded_fn); } { SCOPED_TRACE("Multivector with 40 columns"); - guarded_fn(gen_in_vec(mtx, 40, 40), - gen_out_vec(mtx, 40, 40)); + guarded_fn(gen_in_vec(mtx, 40), + gen_out_vec(mtx, 40)); } if (Config::supports_strides()) { SCOPED_TRACE("Strided multivector with 40 columns"); - guarded_fn(gen_in_vec(mtx, 40, 43), - gen_out_vec(mtx, 40, 45)); + run_strided(mtx, 40, 43, 45, guarded_fn); } } From 592d93fe27db166bcc06a8c1b2407208097e7e92 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 4 Mar 2022 00:19:10 +0100 Subject: [PATCH 11/17] fix DPC++ Csr merge_path SpMV segfault --- dpcpp/matrix/csr_kernels.dp.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 86e6784203f..ef686807f97 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1125,7 +1125,14 @@ void spmv(std::shared_ptr exec, { if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do - } else if (a->get_strategy()->get_name() == "load_balance") { + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_elements() == 0) { + // empty input: zero output + dense::fill(exec, c, zero()); + return; + } + if (a->get_strategy()->get_name() == "load_balance") { dense::fill(exec, c, zero()); const IndexType nwarps = a->get_num_srow_elements(); if (nwarps > 0) { @@ -1216,7 +1223,14 @@ void advanced_spmv(std::shared_ptr exec, { if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { // empty output: nothing to do - } else if (a->get_strategy()->get_name() == "load_balance") { + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_elements() == 0) { + // empty input: scale output + dense::scale(exec, beta, c); + return; + } + if (a->get_strategy()->get_name() == "load_balance") { dense::scale(exec, beta, c); const IndexType nwarps = a->get_num_srow_elements(); From 0b74a6513a4a7cf43ceb23b0436deb22c065a26a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 4 Mar 2022 14:47:43 +0100 Subject: [PATCH 12/17] initiate Dense cross-executor copy from source This means that padding in the target will be copied and written back after the copy has finished. 
This fixes issues in builds without OpenMP executor and reference/test/matrix/dense_kernels --- core/matrix/dense.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 89833e5b514..616cd9f482f 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -465,19 +465,17 @@ void Dense::convert_to(Dense* result) const if (result->get_size() != this->get_size()) { result->resize(this->get_size()); } - // we need to create a clone of the local data, so the target executor can - // copy it. - auto result_exec = result->get_executor(); - auto result_input = make_temporary_clone(result_exec, &values_); + // we need to create a temporary clone of the target data to write to + auto exec = this->get_executor(); + auto result_output = make_temporary_clone(exec, &result->values_); // create a (value, not pointer to avoid allocation overhead) view // matrix on the array to avoid special-casing cross-executor copies - auto tmp_this = Dense{ - result_exec, this->get_size(), - gko::detail::array_const_cast(gko::detail::ConstArrayView( - result_exec, result_input->get_num_elems(), - result_input->get_const_data())), - this->get_stride()}; - result_exec->run(dense::make_copy(&tmp_this, result)); + auto tmp_result = + Dense{exec, this->get_size(), + make_array_view(exec, result_output->get_num_elems(), + result_output->get_data()), + result->get_stride()}; + exec->run(dense::make_copy(this, &tmp_result)); } From 3dd7aa5c85d6630372ce1ede6c562c5a36b3d3d0 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 4 Mar 2022 17:23:23 +0100 Subject: [PATCH 13/17] fix dpcpp Csr classical SpMV --- dpcpp/matrix/csr_kernels.dp.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index ef686807f97..7c503cfd972 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -719,12 +719,14 @@ void abstract_classical_spmv(dim3 grid, dim3 block, ValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - abstract_classical_spmv( - num_rows, alpha, val, col_idxs, row_ptrs, b, - b_stride, beta, c, c_stride, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, b_stride, beta, + c, c_stride, item_ct1); + }); }); } From 11737d76a23c1c25c2ea413446afed3093ebb694 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 4 Mar 2022 18:27:50 +0100 Subject: [PATCH 14/17] remove assertion causing hip-nvcc to emit traps This is only relevant for really old HIP/CUDA, but still causes test failures. 
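A possible longer-term alternative (sketched here only, not part of this patch) is to route device-side checks through an explicit opt-in macro instead of plain assert(), so they can be compiled out entirely regardless of how the toolchain treats NDEBUG. GKO_DEVICE_ASSERTIONS and GKO_DEVICE_ASSERT below are hypothetical names, not existing Ginkgo options:

    #include <cassert>

    // compiled out unless the (hypothetical) GKO_DEVICE_ASSERTIONS flag is
    // defined, avoiding the unconditional trap emitted by older hip-nvcc in
    // Release builds
    #ifdef GKO_DEVICE_ASSERTIONS
    #define GKO_DEVICE_ASSERT(cond) assert(cond)
    #else
    #define GKO_DEVICE_ASSERT(cond) ((void)0)
    #endif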
--- common/cuda_hip/matrix/dense_kernels.hpp.inc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index b40d650a3a5..4002aefbbbb 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -74,8 +74,9 @@ __global__ // only consider threads in the current block const auto first_thread = block_base_col - base_col; const auto last_thread = first_thread + block_size; - assert(first_thread < int(config::warp_size)); - assert(last_thread >= 0); + // HIP compiles these assertions in Release, traps unconditionally + // assert(first_thread < int(config::warp_size)); + // assert(last_thread >= 0); // mask off everything below first_thread const auto lower_mask = first_thread < 0 ? full_mask : ~((one_mask << first_thread) - 1u); @@ -147,8 +148,9 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( // only consider threads in the current block const auto first_thread = block_base_col - base_col; const auto last_thread = first_thread + block_size; - assert(first_thread < int(config::warp_size)); - assert(last_thread >= 0); + // HIP compiles these assertions in Release, traps unconditionally + // assert(first_thread < int(config::warp_size)); + // assert(last_thread >= 0); // mask off everything below first_thread const auto lower_mask = first_thread < 0 ? full_mask : ~((one_mask << first_thread) - 1u); From 480bbb51fc9616e8728056f0b37b6a3e835c2162 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 7 Mar 2022 14:12:31 +0100 Subject: [PATCH 15/17] fix dpcpp Csr classical SpMV with subgroup_size 1 --- dpcpp/matrix/csr_kernels.dp.cpp | 62 ++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 7c503cfd972..fb8066dbb61 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -676,16 +676,27 @@ void abstract_classical_spmv(dim3 grid, dim3 block, const size_type b_stride, ValueType* c, const size_type c_stride) { - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - subgroup_size)]] { - abstract_classical_spmv(num_rows, val, col_idxs, - row_ptrs, b, b_stride, c, - c_stride, item_ct1); - }); - }); + if (subgroup_size > 1) { + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, b_stride, c, + c_stride, item_ct1); + }); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + abstract_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, + b_stride, c, c_stride, item_ct1); + }); + }); + } } @@ -718,16 +729,27 @@ void abstract_classical_spmv(dim3 grid, dim3 block, const size_type b_stride, const ValueType* beta, ValueType* c, const size_type c_stride) { - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - subgroup_size)]] { - abstract_classical_spmv( - num_rows, alpha, val, col_idxs, row_ptrs, b, b_stride, beta, - c, c_stride, item_ct1); - }); - }); + if (subgroup_size > 1) { + 
queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, b_stride, + beta, c, c_stride, item_ct1); + }); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, + b, b_stride, beta, c, c_stride, item_ct1); + }); + }); + } } From 1d7ac95a17b465e4977ec60bcc6f050adb9f986d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 7 Apr 2022 18:49:40 +0200 Subject: [PATCH 16/17] review updates * use ConstArrayView for Dense::create_real_view * improve readability of Dense -> Fbcsr conversion * update documentation * remove unnecessary template parameters * more constexpr * fix generic matrix test formatting Co-authored-by: Aditya Kashi Co-authored-by: Yuhsiang M. Tsai --- common/cuda_hip/matrix/dense_kernels.hpp.inc | 15 ++++++++++----- core/matrix/dense.cpp | 14 +++++++------- cuda/matrix/fbcsr_kernels.cu | 4 ++-- hip/matrix/fbcsr_kernels.hip.cpp | 4 ++-- include/ginkgo/core/base/array.hpp | 19 +++++++++++++++++++ 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc index 4002aefbbbb..acb148db555 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc @@ -92,8 +92,10 @@ __global__ // if we need to store something for the next iteration if ((base_col + config::warp_size) % block_size != 0) { // check whether the last block (incomplete) in this warp is nonzero - first_block_nonzero = - bool(warp.ballot(bool(local_mask)) >> (config::warp_size - 1)); + auto local_block_nonzero_mask = warp.ballot(local_mask != 0u); + bool last_block_nonzero = + (local_block_nonzero_mask >> (config::warp_size - 1)) != 0u; + first_block_nonzero = last_block_nonzero; } else { first_block_nonzero = false; } @@ -163,7 +165,8 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( const auto block_nonzero_mask = warp.ballot(local_mask && (block_local_col == block_size - 1)); - // count how many blocks come before this block in the current block + // count how many Fbcsr blocks come before the Fbcsr block handled by + // the local group of threads const auto block_nz = block_base_nz + popcnt(block_nonzero_mask & lane_prefix_mask); // now in a second sweep, store the actual elements @@ -183,8 +186,10 @@ __global__ __launch_bounds__(default_block_size) void convert_to_fbcsr( // if we need to store something for the next iteration if ((base_col + config::warp_size) % block_size != 0) { // check whether the last block (incomplete) in this warp is nonzero - first_block_nonzero = - bool(warp.ballot(bool(local_mask)) >> (config::warp_size - 1)); + auto local_block_nonzero_mask = warp.ballot(local_mask != 0u); + bool last_block_nonzero = + (local_block_nonzero_mask >> (config::warp_size - 1)) != 0u; + first_block_nonzero = last_block_nonzero; } else { first_block_nonzero = false; } diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 616cd9f482f..94cb35c42c9 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -1589,7 +1589,7 @@ std::unique_ptr::real_type> Dense::create_real_view() { const auto num_rows = this->get_size()[0]; - const bool complex = is_complex(); + constexpr bool complex = 
is_complex(); const auto num_cols = complex ? 2 * this->get_size()[1] : this->get_size()[1]; const auto stride = complex ? 2 * this->get_stride() : this->get_stride(); @@ -1608,17 +1608,17 @@ std::unique_ptr::real_type> Dense::create_real_view() const { const auto num_rows = this->get_size()[0]; - const bool complex = is_complex(); + constexpr bool complex = is_complex(); const auto num_cols = complex ? 2 * this->get_size()[1] : this->get_size()[1]; const auto stride = complex ? 2 * this->get_stride() : this->get_stride(); - return Dense>::create( + return Dense>::create_const( this->get_executor(), dim<2>{num_rows, num_cols}, - make_array_view(this->get_executor(), num_rows * stride, - const_cast*>( - reinterpret_cast*>( - this->get_const_values()))), + make_const_array_view( + this->get_executor(), num_rows * stride, + reinterpret_cast*>( + this->get_const_values())), stride); } diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index d0a73d67c2a..bec6ce29807 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -374,8 +374,8 @@ void is_sorted_by_column_index( *is_sorted = true; auto gpu_array = Array(exec, 1); // need to initialize the GPU value to true - exec->copy_from(exec->get_master().get(), 1, is_sorted, - gpu_array.get_data()); + exec->copy_from(exec->get_master().get(), 1, is_sorted, + gpu_array.get_data()); auto block_size = default_block_size; const auto num_brows = static_cast(to_check->get_num_block_rows()); diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 0e021bce36b..bf707859b02 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -314,8 +314,8 @@ void is_sorted_by_column_index( *is_sorted = true; auto gpu_array = Array(exec, 1); // need to initialize the GPU value to true - exec->copy_from(exec->get_master().get(), 1, is_sorted, - gpu_array.get_data()); + exec->copy_from(exec->get_master().get(), 1, is_sorted, + gpu_array.get_data()); auto block_size = default_block_size; const auto num_brows = static_cast(to_check->get_num_block_rows()); diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp index 2f95d5a4da7..35ed7985ca2 100644 --- a/include/ginkgo/core/base/array.hpp +++ b/include/ginkgo/core/base/array.hpp @@ -755,6 +755,25 @@ Array make_array_view(std::shared_ptr exec, } +/** + * Helper function to create a const array view deducing the value type. + * + * @param exec the executor on which the array resides + * @param size the number of elements for the array + * @param data the pointer to the array we create a view on. 
+ * + * @tparam ValueType the type of the array elements + * + * @return `Array::const_view(exec, size, data)` + */ +template +detail::ConstArrayView make_const_array_view( + std::shared_ptr exec, size_type size, const ValueType* data) +{ + return Array::const_view(exec, size, data); +} + + namespace detail { From 9ba1ee94d7c92099ae9571a177710489d65c97cb Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 22 Apr 2022 16:11:02 +0200 Subject: [PATCH 17/17] use ROCm 4.0 instead of 3.5 --- .gitlab-ci.yml | 10 +++++----- .gitlab/image.yml | 4 ++-- CHANGELOG.md | 2 +- README.md | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 282eac6f039..b3ee2f02a35 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -530,12 +530,12 @@ build/cuda114/nompi/gcc/cuda/debug/shared: EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177" CUDA_ARCH: 61 -# ROCm 3.5 and friends -build/amd/nompi/gcc/rocm35/debug/shared: +# ROCm 4.0 and friends +build/amd/nompi/gcc/rocm40/debug/shared: <<: *default_build_with_test extends: - .quick_test_condition - - .use_gko-rocm35-openmpi-gnu5-llvm50 + - .use_gko-rocm40-openmpi-gnu5-llvm50 variables: <<: *default_variables BUILD_OMP: "ON" @@ -544,11 +544,11 @@ build/amd/nompi/gcc/rocm35/debug/shared: BUILD_TYPE: "Debug" FAST_TESTS: "ON" -build/amd/openmpi/clang/rocm35/release/static: +build/amd/openmpi/clang/rocm40/release/static: <<: *default_build_with_test extends: - .full_test_condition - - .use_gko-rocm35-openmpi-gnu5-llvm50 + - .use_gko-rocm40-openmpi-gnu5-llvm50 variables: <<: *default_variables C_COMPILER: "clang" diff --git a/.gitlab/image.yml b/.gitlab/image.yml index e64b2cfeb60..a8a60d062ad 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -72,8 +72,8 @@ - private_ci - nvidia-gpu -.use_gko-rocm35-openmpi-gnu5-llvm50: - image: ginkgohub/rocm:35-openmpi-gnu5-llvm50 +.use_gko-rocm40-openmpi-gnu5-llvm50: + image: ginkgohub/rocm:40-openmpi-gnu5-llvm50 tags: - private_ci - amdci diff --git a/CHANGELOG.md b/CHANGELOG.md index 238f8921921..e1e411bea44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,7 +37,7 @@ Supported systems and requirements: + Intel compiler: 2018+ + Apple LLVM: 8.0+ + CUDA module: CUDA 9.0+ - + HIP module: ROCm 3.5+ + + HIP module: ROCm 4.0+ + DPC++ module: Intel OneAPI 2021.3. Set the CXX compiler to `dpcpp`. + Windows + MinGW and Cygwin: gcc 5.3+, 6.3+, 7.3+, all versions after 8.1+ diff --git a/README.md b/README.md index 53c096a3a7e..f234291e16d 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ The Ginkgo CUDA module has the following __additional__ requirements: The Ginkgo HIP module has the following __additional__ requirements: -* _ROCm 3.5+_ +* _ROCm 4.0+_ * the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either: * _AMD_ backend (using the `clang` compiler) * _9.2 <= CUDA < 11_ backend
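A minimal usage sketch for the make_const_array_view helper added above, assuming <ginkgo/ginkgo.hpp> is included and mirroring the create_const pattern used in Dense::create_real_view; the executor choice and concrete values are only illustrative:

    auto exec = gko::ReferenceExecutor::create();
    const double host_values[] = {1.0, 2.0, 3.0};
    // wrap existing read-only data without copying, then hand the view to a
    // constant 3x1 Dense vector with unit stride
    auto vec = gko::matrix::Dense<double>::create_const(
        exec, gko::dim<2>{3, 1},
        gko::make_const_array_view(exec, 3, host_values), 1);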