diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 282eac6f039..b3ee2f02a35 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -530,12 +530,12 @@ build/cuda114/nompi/gcc/cuda/debug/shared:
     EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
     CUDA_ARCH: 61
 
-# ROCm 3.5 and friends
-build/amd/nompi/gcc/rocm35/debug/shared:
+# ROCm 4.0 and friends
+build/amd/nompi/gcc/rocm40/debug/shared:
   <<: *default_build_with_test
   extends:
     - .quick_test_condition
-    - .use_gko-rocm35-openmpi-gnu5-llvm50
+    - .use_gko-rocm40-openmpi-gnu5-llvm50
   variables:
     <<: *default_variables
     BUILD_OMP: "ON"
@@ -544,11 +544,11 @@ build/amd/nompi/gcc/rocm35/debug/shared:
     BUILD_TYPE: "Debug"
     FAST_TESTS: "ON"
 
-build/amd/openmpi/clang/rocm35/release/static:
+build/amd/openmpi/clang/rocm40/release/static:
   <<: *default_build_with_test
   extends:
     - .full_test_condition
-    - .use_gko-rocm35-openmpi-gnu5-llvm50
+    - .use_gko-rocm40-openmpi-gnu5-llvm50
   variables:
     <<: *default_variables
     C_COMPILER: "clang"
diff --git a/.gitlab/image.yml b/.gitlab/image.yml
index e64b2cfeb60..a8a60d062ad 100644
--- a/.gitlab/image.yml
+++ b/.gitlab/image.yml
@@ -72,8 +72,8 @@
     - private_ci
     - nvidia-gpu
 
-.use_gko-rocm35-openmpi-gnu5-llvm50:
-  image: ginkgohub/rocm:35-openmpi-gnu5-llvm50
+.use_gko-rocm40-openmpi-gnu5-llvm50:
+  image: ginkgohub/rocm:40-openmpi-gnu5-llvm50
   tags:
     - private_ci
     - amdci
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 238f8921921..e1e411bea44 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -37,7 +37,7 @@ Supported systems and requirements:
   + Intel compiler: 2018+
   + Apple LLVM: 8.0+
   + CUDA module: CUDA 9.0+
-  + HIP module: ROCm 3.5+
+  + HIP module: ROCm 4.0+
   + DPC++ module: Intel OneAPI 2021.3. Set the CXX compiler to `dpcpp`.
 + Windows
   + MinGW and Cygwin: gcc 5.3+, 6.3+, 7.3+, all versions after 8.1+
diff --git a/README.md b/README.md
index 53c096a3a7e..f234291e16d 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ The Ginkgo CUDA module has the following __additional__ requirements:
 
 The Ginkgo HIP module has the following __additional__ requirements:
 
-* _ROCm 3.5+_
+* _ROCm 4.0+_
 *    the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either:
     * _AMD_ backend (using the `clang` compiler)
     * _9.2 <= CUDA < 11_ backend
diff --git a/common/cuda_hip/matrix/csr_common.hpp.inc b/common/cuda_hip/matrix/csr_common.hpp.inc
new file mode 100644
index 00000000000..d22981c1edf
--- /dev/null
+++ b/common/cuda_hip/matrix/csr_common.hpp.inc
@@ -0,0 +1,111 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2022, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void check_unsorted(
+    const IndexType* __restrict__ row_ptrs,
+    const IndexType* __restrict__ col_idxs, IndexType num_rows, bool* flag)
+{
+    __shared__ bool sh_flag;
+    auto block = group::this_thread_block();
+    if (block.thread_rank() == 0) {
+        sh_flag = *flag;
+    }
+    block.sync();
+
+    auto row = thread::get_thread_id_flat<IndexType>();
+    if (row >= num_rows) {
+        return;
+    }
+
+    // fail early
+    if (sh_flag) {
+        for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1] - 1; ++nz) {
+            if (col_idxs[nz] > col_idxs[nz + 1]) {
+                *flag = false;
+                sh_flag = false;
+                return;
+            }
+        }
+    }
+}
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void conjugate(
+    size_type num_nonzeros, ValueType* __restrict__ val)
+{
+    const auto tidx = thread::get_thread_id_flat();
+
+    if (tidx < num_nonzeros) {
+        val[tidx] = conj(val[tidx]);
+    }
+}
+
+
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void check_diagonal_entries(
+    const IndexType num_min_rows_cols,
+    const IndexType* const __restrict__ row_ptrs,
+    const IndexType* const __restrict__ col_idxs,
+    bool* const __restrict__ has_all_diags)
+{
+    constexpr int warp_size = config::warp_size;
+    auto tile_grp =
+        group::tiled_partition<warp_size>(group::this_thread_block());
+    const auto row = thread::get_subwarp_id_flat<warp_size, IndexType>();
+    if (row < num_min_rows_cols) {
+        const auto tid_in_warp = tile_grp.thread_rank();
+        const IndexType row_start = row_ptrs[row];
+        const IndexType num_nz = row_ptrs[row + 1] - row_start;
+        bool row_has_diag_local{false};
+        for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) {
+            if (col_idxs[iz + row_start] == row) {
+                row_has_diag_local = true;
+                break;
+            }
+        }
+        auto row_has_diag = static_cast<bool>(tile_grp.any(row_has_diag_local));
+        if (!row_has_diag) {
+            if (tile_grp.thread_rank() == 0) {
+                *has_all_diags = false;
+            }
+            return;
+        }
+    }
+}
+
+
+}  // namespace kernel
diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
index 075e1607ace..4ed54310797 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -601,36 +601,6 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense(
 }
 
 
-template <typename IndexType>
-__global__ __launch_bounds__(default_block_size) void check_unsorted(
-    const IndexType* __restrict__ row_ptrs,
-    const IndexType* __restrict__ col_idxs, IndexType num_rows, bool* flag)
-{
-    __shared__ bool sh_flag;
-    auto block = group::this_thread_block();
-    if (block.thread_rank() == 0) {
-        sh_flag = *flag;
-    }
-    block.sync();
-
-    auto row = thread::get_thread_id_flat<IndexType>();
-    if (row >= num_rows) {
-        return;
-    }
-
-    // fail early
-    if (sh_flag) {
-        for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1] - 1; ++nz) {
-            if (col_idxs[nz] > col_idxs[nz + 1]) {
-                *flag = false;
-                sh_flag = false;
-                return;
-            }
-        }
-    }
-}
-
-
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void extract_diagonal(
     size_type diag_size, size_type nnz,
@@ -657,29 +627,8 @@ __global__ __launch_bounds__(default_block_size) void extract_diagonal(
 }
 
 
-}  // namespace kernel
-
-
-namespace {
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void conjugate_kernel(
-    size_type num_nonzeros, ValueType* __restrict__ val)
-{
-    const auto tidx = thread::get_thread_id_flat();
-
-    if (tidx < num_nonzeros) {
-        val[tidx] = conj(val[tidx]);
-    }
-}
-
-
-}  //  namespace
-
-
 template <typename IndexType>
-__global__ __launch_bounds__(default_block_size) void row_ptr_permute_kernel(
+__global__ __launch_bounds__(default_block_size) void row_ptr_permute(
     size_type num_rows, const IndexType* __restrict__ permutation,
     const IndexType* __restrict__ in_row_ptrs, IndexType* __restrict__ out_nnz)
 {
@@ -694,11 +643,9 @@ __global__ __launch_bounds__(default_block_size) void row_ptr_permute_kernel(
 
 
 template <typename IndexType>
-__global__
-    __launch_bounds__(default_block_size) void inv_row_ptr_permute_kernel(
-        size_type num_rows, const IndexType* __restrict__ permutation,
-        const IndexType* __restrict__ in_row_ptrs,
-        IndexType* __restrict__ out_nnz)
+__global__ __launch_bounds__(default_block_size) void inv_row_ptr_permute(
+    size_type num_rows, const IndexType* __restrict__ permutation,
+    const IndexType* __restrict__ in_row_ptrs, IndexType* __restrict__ out_nnz)
 {
     auto tid = thread::get_thread_id_flat();
     if (tid >= num_rows) {
@@ -711,7 +658,7 @@ __global__
 
 
 template <int subwarp_size, typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void row_permute_kernel(
+__global__ __launch_bounds__(default_block_size) void row_permute(
     size_type num_rows, const IndexType* __restrict__ permutation,
     const IndexType* __restrict__ in_row_ptrs,
     const IndexType* __restrict__ in_cols,
@@ -737,7 +684,7 @@ __global__ __launch_bounds__(default_block_size) void row_permute_kernel(
 
 
 template <int subwarp_size, typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void inv_row_permute_kernel(
+__global__ __launch_bounds__(default_block_size) void inv_row_permute(
     size_type num_rows, const IndexType* __restrict__ permutation,
     const IndexType* __restrict__ in_row_ptrs,
     const IndexType* __restrict__ in_cols,
@@ -763,7 +710,7 @@ __global__ __launch_bounds__(default_block_size) void inv_row_permute_kernel(
 
 
 template <int subwarp_size, typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void inv_symm_permute_kernel(
+__global__ __launch_bounds__(default_block_size) void inv_symm_permute(
     size_type num_rows, const IndexType* __restrict__ permutation,
     const IndexType* __restrict__ in_row_ptrs,
     const IndexType* __restrict__ in_cols,
@@ -788,9 +735,6 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute_kernel(
 }
 
 
-namespace kernel {
-
-
 template <typename ValueType, typename IndexType>
 __global__
     __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals(
@@ -869,37 +813,105 @@ __global__ __launch_bounds__(default_block_size) void add_scaled_identity(
 }
 
 
-template <typename IndexType>
-__global__ __launch_bounds__(default_block_size) void check_diagonal_entries(
-    const IndexType num_min_rows_cols,
-    const IndexType* const __restrict__ row_ptrs,
-    const IndexType* const __restrict__ col_idxs,
-    bool* const __restrict__ has_all_diags)
-{
-    constexpr int warp_size = config::warp_size;
-    auto tile_grp =
-        group::tiled_partition<warp_size>(group::this_thread_block());
-    const auto row = thread::get_subwarp_id_flat<warp_size, IndexType>();
-    if (row < num_min_rows_cols) {
-        const auto tid_in_warp = tile_grp.thread_rank();
-        const IndexType row_start = row_ptrs[row];
-        const IndexType num_nz = row_ptrs[row + 1] - row_start;
-        bool row_has_diag_local{false};
-        for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) {
-            if (col_idxs[iz + row_start] == row) {
-                row_has_diag_local = true;
-                break;
-            }
-        }
-        auto row_has_diag = static_cast<bool>(tile_grp.any(row_has_diag_local));
-        if (!row_has_diag) {
-            if (tile_grp.thread_rank() == 0) {
-                *has_all_diags = false;
-            }
-            return;
-        }
+}  // namespace kernel
+
+
+template <typename ValueType, typename IndexType>
+void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType>* source, int bs,
+                      Array<IndexType>& block_row_ptr_array,
+                      Array<IndexType>& block_col_idx_array,
+                      Array<ValueType>& block_value_array)
+{
+    using tuple_type = thrust::tuple<IndexType, IndexType>;
+    const auto nnz = source->get_num_stored_elements();
+    Array<IndexType> in_row_idxs{exec, nnz};
+    Array<IndexType> in_col_idxs{exec, nnz};
+    Array<ValueType> in_values{exec, nnz};
+    exec->copy(nnz, source->get_const_col_idxs(), in_col_idxs.get_data());
+    exec->copy(nnz, source->get_const_values(), in_values.get_data());
+    components::convert_ptrs_to_idxs(exec, source->get_const_row_ptrs(),
+                                     source->get_size()[0],
+                                     in_row_idxs.get_data());
+    auto block_row_ptrs = block_row_ptr_array.get_data();
+    auto num_block_rows = block_row_ptr_array.get_num_elems() - 1;
+    if (nnz == 0) {
+        components::fill_array(exec, block_row_ptrs, num_block_rows + 1,
+                               IndexType{});
+        block_col_idx_array.resize_and_reset(0);
+        block_value_array.resize_and_reset(0);
+        return;
     }
+    auto in_rows = in_row_idxs.get_data();
+    auto in_cols = in_col_idxs.get_data();
+    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
+    // due to overly generic assignment operator and constructor leading to
+    // ambiguities. So we need to use our own fake_complex type
+    auto in_vals =
+        reinterpret_cast<device_member_type<ValueType>*>(in_values.get_data());
+    auto in_loc_it = thrust::make_zip_iterator(
+        thrust::make_tuple(thrust::device_pointer_cast(in_rows),
+                           thrust::device_pointer_cast(in_cols)));
+    thrust::sort_by_key(thrust::device, in_loc_it, in_loc_it + nnz,
+                        thrust::device_pointer_cast(in_vals),
+                        [bs] __device__(tuple_type a, tuple_type b) {
+                            return thrust::make_pair(thrust::get<0>(a) / bs,
+                                                     thrust::get<1>(a) / bs) <
+                                   thrust::make_pair(thrust::get<0>(b) / bs,
+                                                     thrust::get<1>(b) / bs);
+                        });
+    // build block pattern
+    auto adj_predicate = [bs, in_rows, in_cols, nnz] __device__(size_type i) {
+        const auto a_block_row = i > 0 ? in_rows[i - 1] / bs : -1;
+        const auto a_block_col = i > 0 ? in_cols[i - 1] / bs : -1;
+        const auto b_block_row = in_rows[i] / bs;
+        const auto b_block_col = in_cols[i] / bs;
+        return (a_block_row != b_block_row) || (a_block_col != b_block_col);
+    };
+    auto iota = thrust::make_counting_iterator(size_type{});
+    // count how many blocks we have by counting how often the block changes
+    auto num_blocks = static_cast<size_type>(
+        thrust::count_if(thrust::device, iota, iota + nnz, adj_predicate));
+    // allocate storage
+    Array<IndexType> block_row_idx_array{exec, num_blocks};
+    Array<size_type> block_ptr_array{exec, num_blocks};
+    block_col_idx_array.resize_and_reset(num_blocks);
+    block_value_array.resize_and_reset(num_blocks * bs * bs);
+    auto row_idxs = block_row_idx_array.get_data();
+    auto col_idxs = block_col_idx_array.get_data();
+    auto values = as_device_type(block_value_array.get_data());
+    auto block_ptrs = block_ptr_array.get_data();
+    auto block_ptr_it = thrust::device_pointer_cast(block_ptrs);
+    // write (block_row, block_col, block_start_idx) tuples for each block
+    thrust::copy_if(thrust::device, iota, iota + nnz, block_ptr_it,
+                    adj_predicate);
+    auto block_output_it = thrust::make_zip_iterator(
+        thrust::make_tuple(thrust::device_pointer_cast(row_idxs),
+                           thrust::device_pointer_cast(col_idxs)));
+    thrust::transform(
+        thrust::device, block_ptr_it, block_ptr_it + num_blocks,
+        block_output_it, [bs, in_rows, in_cols] __device__(size_type i) {
+            return thrust::make_tuple(in_rows[i] / bs, in_cols[i] / bs);
+        });
+    // build row pointers from row indices
+    components::convert_idxs_to_ptrs(exec, block_row_idx_array.get_const_data(),
+                                     block_row_idx_array.get_num_elems(),
+                                     num_block_rows, block_row_ptrs);
+    // fill in values
+    components::fill_array(exec, block_value_array.get_data(),
+                           num_blocks * bs * bs, zero<ValueType>());
+    thrust::for_each_n(
+        thrust::device, iota, num_blocks,
+        [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals,
+         values] __device__(size_type i) {
+            const auto block_begin = block_ptrs[i];
+            const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
+            for (auto nz = block_begin; nz < block_end; nz++) {
+                values[i * bs * bs + (in_cols[nz] % bs) * bs +
+                       (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]);
+            }
+        });
 }
 
-
-}  // namespace kernel
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc
index c4cdc214e76..acb148db555 100644
--- a/common/cuda_hip/matrix/dense_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc
@@ -33,6 +33,172 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace kernel {
 
 
+template <typename ValueType, typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void count_nonzero_blocks_per_row(
+        size_type num_block_rows, size_type num_block_cols, size_type stride,
+        int block_size, const ValueType* __restrict__ source,
+        IndexType* __restrict__ block_row_nnz)
+{
+    const auto brow =
+        thread::get_subwarp_id_flat<config::warp_size, IndexType>();
+
+    if (brow >= num_block_rows) {
+        return;
+    }
+
+    const auto bs_sq = block_size * block_size;
+    const auto num_cols = num_block_cols * block_size;
+    auto warp =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    const auto lane = static_cast<IndexType>(warp.thread_rank());
+    constexpr auto full_mask = ~config::lane_mask_type{};
+    constexpr auto one_mask = config::lane_mask_type{1};
+    bool first_block_nonzero = false;
+    IndexType block_count{};
+    for (IndexType base_col = 0; base_col < num_cols;
+         base_col += config::warp_size) {
+        const auto col = base_col + lane;
+        const auto block_local_col = col % block_size;
+        // which is the first column in the current block?
+        const auto block_base_col = col - block_local_col;
+        // collect nonzero bitmask
+        bool local_nonzero = false;
+        for (int local_row = 0; local_row < block_size; local_row++) {
+            const auto row = local_row + brow * block_size;
+            local_nonzero |=
+                col < num_cols && is_nonzero(source[row * stride + col]);
+        }
+        auto nonzero_mask =
+            warp.ballot(local_nonzero) | (first_block_nonzero ? 1u : 0u);
+        // only consider threads in the current block
+        const auto first_thread = block_base_col - base_col;
+        const auto last_thread = first_thread + block_size;
+        // HIP compiles these assertions in Release, traps unconditionally
+        // assert(first_thread < int(config::warp_size));
+        // assert(last_thread >= 0);
+        // mask off everything below first_thread
+        const auto lower_mask =
+            first_thread < 0 ? full_mask : ~((one_mask << first_thread) - 1u);
+        // mask off everything from last_thread
+        const auto upper_mask = last_thread >= config::warp_size
+                                    ? full_mask
+                                    : ((one_mask << last_thread) - 1u);
+        const auto block_mask = upper_mask & lower_mask;
+        const auto local_mask = nonzero_mask & block_mask;
+        // last column in the block increments the counter
+        block_count +=
+            (block_local_col == block_size - 1 && local_mask) ? 1 : 0;
+        // if we need to store something for the next iteration
+        if ((base_col + config::warp_size) % block_size != 0) {
+            // check whether the last block (incomplete) in this warp is nonzero
+            auto local_block_nonzero_mask = warp.ballot(local_mask != 0u);
+            bool last_block_nonzero =
+                (local_block_nonzero_mask >> (config::warp_size - 1)) != 0u;
+            first_block_nonzero = last_block_nonzero;
+        } else {
+            first_block_nonzero = false;
+        }
+    }
+    block_count = reduce(warp, block_count,
+                         [](IndexType a, IndexType b) { return a + b; });
+    if (lane == 0) {
+        block_row_nnz[brow] = block_count;
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void convert_to_fbcsr(
+    size_type num_block_rows, size_type num_block_cols, size_type stride,
+    int block_size, const ValueType* __restrict__ source,
+    const IndexType* __restrict__ block_row_ptrs,
+    IndexType* __restrict__ block_cols, ValueType* __restrict__ blocks)
+{
+    const auto brow =
+        thread::get_subwarp_id_flat<config::warp_size, IndexType>();
+
+    if (brow >= num_block_rows) {
+        return;
+    }
+
+    const auto bs_sq = block_size * block_size;
+    const auto num_cols = num_block_cols * block_size;
+    auto warp =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    const auto lane = static_cast<IndexType>(warp.thread_rank());
+    constexpr auto full_mask = ~config::lane_mask_type{};
+    constexpr auto one_mask = config::lane_mask_type{1};
+    const auto lane_prefix_mask = (one_mask << warp.thread_rank()) - 1u;
+    bool first_block_nonzero = false;
+    auto block_base_nz = block_row_ptrs[brow];
+    for (IndexType base_col = 0; base_col < num_cols;
+         base_col += config::warp_size) {
+        const auto col = base_col + lane;
+        const auto block_local_col = col % block_size;
+        // which is the first column in the current block?
+        const auto block_base_col = col - block_local_col;
+        // collect nonzero bitmask
+        bool local_nonzero = false;
+        for (int local_row = 0; local_row < block_size; local_row++) {
+            const auto row = local_row + brow * block_size;
+            local_nonzero |=
+                col < num_cols && is_nonzero(source[row * stride + col]);
+        }
+        auto nonzero_mask =
+            warp.ballot(local_nonzero) | (first_block_nonzero ? 1u : 0u);
+        // only consider threads in the current block
+        const auto first_thread = block_base_col - base_col;
+        const auto last_thread = first_thread + block_size;
+        // HIP compiles these assertions in Release, traps unconditionally
+        // assert(first_thread < int(config::warp_size));
+        // assert(last_thread >= 0);
+        // mask off everything below first_thread
+        const auto lower_mask =
+            first_thread < 0 ? full_mask : ~((one_mask << first_thread) - 1u);
+        // mask off everything from last_thread
+        const auto upper_mask = last_thread >= config::warp_size
+                                    ? full_mask
+                                    : ((one_mask << last_thread) - 1u);
+        const auto block_mask = upper_mask & lower_mask;
+        const auto local_mask = nonzero_mask & block_mask;
+        const auto block_nonzero_mask =
+            warp.ballot(local_mask && (block_local_col == block_size - 1));
+
+        // count how many Fbcsr blocks come before the Fbcsr block handled by
+        // the local group of threads
+        const auto block_nz =
+            block_base_nz + popcnt(block_nonzero_mask & lane_prefix_mask);
+        // now in a second sweep, store the actual elements
+        if (local_mask) {
+            if (block_local_col == block_size - 1) {
+                block_cols[block_nz] = col / block_size;
+            }
+            // only if we encountered elements in this column
+            if (local_nonzero) {
+                for (int local_row = 0; local_row < block_size; local_row++) {
+                    const auto row = local_row + brow * block_size;
+                    blocks[local_row + block_local_col * block_size +
+                           block_nz * bs_sq] = source[row * stride + col];
+                }
+            }
+        }
+        // if we need to store something for the next iteration
+        if ((base_col + config::warp_size) % block_size != 0) {
+            // check whether the last block (incomplete) in this warp is nonzero
+            auto local_block_nonzero_mask = warp.ballot(local_mask != 0u);
+            bool last_block_nonzero =
+                (local_block_nonzero_mask >> (config::warp_size - 1)) != 0u;
+            first_block_nonzero = last_block_nonzero;
+        } else {
+            first_block_nonzero = false;
+        }
+        // advance by the completed blocks
+        block_base_nz += popcnt(block_nonzero_mask);
+    }
+}
+
+
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void fill_in_coo(
     size_type num_rows, size_type num_cols, size_type stride,
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
index 51feaf82fe6..b703fc783ce 100644
--- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
@@ -93,9 +93,9 @@ __global__ __launch_bounds__(default_block_size) void convert_to_csr(
     if (block_row == 0 && lane == 0) {
         row_ptrs[0] = 0;
     }
-    if (lane < block_size) {
-        row_ptrs[first_row + lane + 1] =
-            block_row_begin + num_blocks * block_size * (lane + 1);
+    for (auto i = lane; i < block_size; i += config::warp_size) {
+        row_ptrs[first_row + i + 1] =
+            block_row_begin + num_blocks * block_size * (i + 1);
     }
     for (IndexType i = lane; i < num_blocks * block_size * block_size;
          i += config::warp_size) {
@@ -116,25 +116,58 @@ __global__ __launch_bounds__(default_block_size) void convert_to_csr(
 }
 
 
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_dense(
+    const IndexType* block_row_ptrs, const IndexType* block_col_idxs,
+    const ValueType* blocks, ValueType* values, size_type stride,
+    size_type num_block_rows, int block_size)
+{
+    const auto block_row = thread::get_subwarp_id_flat<config::warp_size>();
+    if (block_row >= num_block_rows) {
+        return;
+    }
+    const auto block_begin = block_row_ptrs[block_row];
+    const auto block_end = block_row_ptrs[block_row + 1];
+    const auto num_blocks = block_end - block_begin;
+    const auto block_row_begin = block_begin * block_size * block_size;
+    const auto num_entries = num_blocks * block_size * block_size;
+    const auto bs_sq = block_size * block_size;
+    const auto warp =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    const auto lane = warp.thread_rank();
+    for (IndexType nz = lane; nz < num_entries; nz += config::warp_size) {
+        const auto local_id = nz % bs_sq;
+        const auto block = nz / bs_sq;
+        const auto local_row = local_id % block_size;
+        const auto local_col = local_id / block_size;
+        const auto col =
+            block_col_idxs[block + block_begin] * block_size + local_col;
+        const auto row = block_row * block_size + local_row;
+        values[row * stride + col] = blocks[nz + block_row_begin];
+    }
+}
+
+
 }  // namespace kernel
 
 
 template <typename ValueType, typename IndexType>
 void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
                          device_matrix_data<ValueType, IndexType>& data,
-                         int block_size, Array<IndexType>& row_ptr_array,
-                         Array<IndexType>& col_idx_array,
-                         Array<ValueType>& value_array)
+                         int block_size, Array<IndexType>& block_row_ptr_array,
+                         Array<IndexType>& block_col_idx_array,
+                         Array<ValueType>& block_value_array)
 {
     using tuple_type = thrust::tuple<IndexType, IndexType>;
     const auto nnz = data.get_num_elems();
     const auto bs = block_size;
-    auto row_ptrs = row_ptr_array.get_data();
-    auto num_rows = row_ptr_array.get_num_elems() - 1;
+    auto block_row_ptrs = block_row_ptr_array.get_data();
+    auto num_block_rows = block_row_ptr_array.get_num_elems() - 1;
     if (nnz == 0) {
-        components::fill_array(exec, row_ptrs, num_rows + 1, IndexType{});
-        col_idx_array.resize_and_reset(0);
-        value_array.resize_and_reset(0);
+        components::fill_array(exec, block_row_ptrs, num_block_rows + 1,
+                               IndexType{});
+        block_col_idx_array.resize_and_reset(0);
+        block_value_array.resize_and_reset(0);
         return;
     }
     auto in_rows = data.get_row_idxs();
@@ -168,42 +201,43 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     auto num_blocks = static_cast<size_type>(
         thrust::count_if(thrust::device, iota, iota + nnz, adj_predicate));
     // allocate storage
-    Array<IndexType> row_idx_array{exec, num_blocks};
+    Array<IndexType> block_row_idx_array{exec, num_blocks};
     Array<size_type> block_ptr_array{exec, num_blocks};
-    col_idx_array.resize_and_reset(num_blocks);
-    value_array.resize_and_reset(num_blocks * bs * bs);
-    auto row_idxs = row_idx_array.get_data();
-    auto col_idxs = col_idx_array.get_data();
-    auto values = as_device_type(value_array.get_data());
+    block_col_idx_array.resize_and_reset(num_blocks);
+    block_value_array.resize_and_reset(num_blocks * bs * bs);
+    auto block_row_idxs = block_row_idx_array.get_data();
+    auto block_col_idxs = block_col_idx_array.get_data();
+    auto block_values = as_device_type(block_value_array.get_data());
     auto block_ptrs = block_ptr_array.get_data();
     auto block_ptr_it = thrust::device_pointer_cast(block_ptrs);
     // write (block_row, block_col, block_start_idx) tuples for each block
     thrust::copy_if(thrust::device, iota, iota + nnz, block_ptr_it,
                     adj_predicate);
     auto block_output_it = thrust::make_zip_iterator(
-        thrust::make_tuple(thrust::device_pointer_cast(row_idxs),
-                           thrust::device_pointer_cast(col_idxs)));
+        thrust::make_tuple(thrust::device_pointer_cast(block_row_idxs),
+                           thrust::device_pointer_cast(block_col_idxs)));
     thrust::transform(
         thrust::device, block_ptr_it, block_ptr_it + num_blocks,
         block_output_it, [bs, in_rows, in_cols] __device__(size_type i) {
             return thrust::make_tuple(in_rows[i] / bs, in_cols[i] / bs);
         });
     // build row pointers from row indices
-    components::convert_idxs_to_ptrs(exec, row_idx_array.get_const_data(),
-                                     row_idx_array.get_num_elems(), num_rows,
-                                     row_ptrs);
+    components::convert_idxs_to_ptrs(exec, block_row_idx_array.get_const_data(),
+                                     block_row_idx_array.get_num_elems(),
+                                     num_block_rows, block_row_ptrs);
     // fill in values
-    components::fill_array(exec, value_array.get_data(), num_blocks * bs * bs,
-                           zero<ValueType>());
+    components::fill_array(exec, block_value_array.get_data(),
+                           num_blocks * bs * bs, zero<ValueType>());
     thrust::for_each_n(
         thrust::device, iota, num_blocks,
         [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals,
-         values] __device__(size_type i) {
+         block_values] __device__(size_type i) {
             const auto block_begin = block_ptrs[i];
             const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
             for (auto nz = block_begin; nz < block_end; nz++) {
-                values[i * bs * bs + (in_cols[nz] % bs) * bs +
-                       (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]);
+                block_values[i * bs * bs + (in_cols[nz] % bs) * bs +
+                             (in_rows[nz] % bs)] =
+                    fake_complex_unpack(in_vals[nz]);
             }
         });
 }
diff --git a/core/base/composition.cpp b/core/base/composition.cpp
index 21140fe37e9..8f927fc8baf 100644
--- a/core/base/composition.cpp
+++ b/core/base/composition.cpp
@@ -82,8 +82,8 @@ std::unique_ptr<LinOp> apply_inner_operators(
     auto op_size = operators.back()->get_size();
     auto out_dim = gko::dim<2>{op_size[0], num_rhs};
     auto out_size = out_dim[0] * num_rhs;
-    auto out = Dense::create(
-        exec, out_dim, Array<ValueType>::view(exec, out_size, data), num_rhs);
+    auto out = Dense::create(exec, out_dim,
+                             make_array_view(exec, out_size, data), num_rhs);
     // for operators with initial guess: set initial guess
     if (operators.back()->apply_uses_initial_guess()) {
         if (op_size[0] == op_size[1]) {
@@ -111,8 +111,7 @@ std::unique_ptr<LinOp> apply_inner_operators(
             data + (reversed_storage ? storage_size - out_size : size_type{});
         reversed_storage = !reversed_storage;
         out = Dense::create(exec, out_dim,
-                            Array<ValueType>::view(exec, out_size, out_data),
-                            num_rhs);
+                            make_array_view(exec, out_size, out_data), num_rhs);
         // for operators with initial guess: set initial guess
         if (operators[i]->apply_uses_initial_guess()) {
             if (op_size[0] == op_size[1]) {
diff --git a/core/factorization/par_ic.cpp b/core/factorization/par_ic.cpp
index e54538ad9e3..a2d10271aff 100644
--- a/core/factorization/par_ic.cpp
+++ b/core/factorization/par_ic.cpp
@@ -122,12 +122,10 @@ std::unique_ptr<Composition<ValueType>> ParIc<ValueType, IndexType>::generate(
     // build COO representation of lower factor
     Array<IndexType> l_row_idxs{exec, l_nnz};
     // copy values from l_factor, which are the lower triangular values of A
-    auto l_vals_view =
-        Array<ValueType>::view(exec, l_nnz, l_factor->get_values());
+    auto l_vals_view = make_array_view(exec, l_nnz, l_factor->get_values());
     auto a_vals = Array<ValueType>{exec, l_vals_view};
     auto a_row_idxs = Array<IndexType>{exec, l_nnz};
-    auto a_col_idxs =
-        Array<IndexType>::view(exec, l_nnz, l_factor->get_col_idxs());
+    auto a_col_idxs = make_array_view(exec, l_nnz, l_factor->get_col_idxs());
     auto a_lower_coo =
         CooMatrix::create(exec, matrix_size, std::move(a_vals),
                           std::move(a_col_idxs), std::move(a_row_idxs));
diff --git a/core/factorization/par_ict.cpp b/core/factorization/par_ict.cpp
index cfbdfeee88a..742c8b9811f 100644
--- a/core/factorization/par_ict.cpp
+++ b/core/factorization/par_ict.cpp
@@ -243,9 +243,9 @@ void ParIctState<ValueType, IndexType>::iterate()
         l_builder.get_row_idx_array().resize_and_reset(l_nnz);
         // update arrays that will be aliased
         l_builder.get_col_idx_array() =
-            Array<IndexType>::view(exec, l_nnz, l_new->get_col_idxs());
+            make_array_view(exec, l_nnz, l_new->get_col_idxs());
         l_builder.get_value_array() =
-            Array<ValueType>::view(exec, l_nnz, l_new->get_values());
+            make_array_view(exec, l_nnz, l_new->get_values());
     }
 
     // convert L into COO format
diff --git a/core/factorization/par_ilut.cpp b/core/factorization/par_ilut.cpp
index e54eac93af5..8f2c0edbfa1 100644
--- a/core/factorization/par_ilut.cpp
+++ b/core/factorization/par_ilut.cpp
@@ -277,13 +277,13 @@ void ParIlutState<ValueType, IndexType>::iterate()
         u_csc_builder.get_value_array().resize_and_reset(u_nnz);
         // update arrays that will be aliased
         l_builder.get_col_idx_array() =
-            Array<IndexType>::view(exec, l_nnz, l_new->get_col_idxs());
+            make_array_view(exec, l_nnz, l_new->get_col_idxs());
         u_builder.get_col_idx_array() =
-            Array<IndexType>::view(exec, u_nnz, u_new->get_col_idxs());
+            make_array_view(exec, u_nnz, u_new->get_col_idxs());
         l_builder.get_value_array() =
-            Array<ValueType>::view(exec, l_nnz, l_new->get_values());
+            make_array_view(exec, l_nnz, l_new->get_values());
         u_builder.get_value_array() =
-            Array<ValueType>::view(exec, u_nnz, u_new->get_values());
+            make_array_view(exec, u_nnz, u_new->get_values());
     }
 
     // convert U' into CSC format
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index b2a92cbbe95..94cb35c42c9 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -462,31 +462,30 @@ void Dense<ValueType>::compute_norm1_impl(LinOp* result) const
 template <typename ValueType>
 void Dense<ValueType>::convert_to(Dense<ValueType>* result) const
 {
-    if (this->get_size() && result->get_size() == this->get_size()) {
-        // we need to create a executor-local clone of the target data, that
-        // will be copied back later.
-        auto exec = this->get_executor();
-        auto result_array = make_temporary_output_clone(exec, &result->values_);
-        // create a (value, not pointer to avoid allocation overhead) view
-        // matrix on the array to avoid special-casing cross-executor copies
-        auto tmp_result =
-            Dense{exec, result->get_size(),
-                  Array<ValueType>::view(exec, result_array->get_num_elems(),
-                                         result_array->get_data()),
-                  result->get_stride()};
-        exec->run(dense::make_copy(this, &tmp_result));
-    } else {
-        result->values_ = this->values_;
-        result->stride_ = this->stride_;
-        result->set_size(this->get_size());
+    if (result->get_size() != this->get_size()) {
+        result->resize(this->get_size());
     }
+    // we need to create a temporary clone of the target data to write to
+    auto exec = this->get_executor();
+    auto result_output = make_temporary_clone(exec, &result->values_);
+    // create a (value, not pointer to avoid allocation overhead) view
+    // matrix on the array to avoid special-casing cross-executor copies
+    auto tmp_result =
+        Dense{exec, this->get_size(),
+              make_array_view(exec, result_output->get_num_elems(),
+                              result_output->get_data()),
+              result->get_stride()};
+    exec->run(dense::make_copy(this, &tmp_result));
 }
 
 
 template <typename ValueType>
 void Dense<ValueType>::move_to(Dense<ValueType>* result)
 {
-    this->convert_to(result);
+    result->values_ = std::move(this->values_);
+    result->stride_ = std::exchange(this->stride_, 0);
+    result->set_size(this->get_size());
+    this->resize(gko::dim<2>{});
 }
 
 
@@ -628,6 +627,7 @@ void Dense<ValueType>::convert_impl(Fbcsr<ValueType, IndexType>* result) const
         exec->copy_val_to_host(tmp->get_const_row_ptrs() + row_blocks);
     tmp->col_idxs_.resize_and_reset(nnz_blocks);
     tmp->values_.resize_and_reset(nnz_blocks * bs * bs);
+    tmp->values_.fill(zero<ValueType>());
     tmp->set_size(this->get_size());
     exec->run(dense::make_convert_to_fbcsr(this, tmp.get()));
 }
@@ -1584,6 +1584,64 @@ void Dense<ValueType>::add_scaled_identity_impl(const LinOp* const a,
 }
 
 
+template <typename ValueType>
+std::unique_ptr<typename Dense<ValueType>::real_type>
+Dense<ValueType>::create_real_view()
+{
+    const auto num_rows = this->get_size()[0];
+    constexpr bool complex = is_complex<ValueType>();
+    const auto num_cols =
+        complex ? 2 * this->get_size()[1] : this->get_size()[1];
+    const auto stride = complex ? 2 * this->get_stride() : this->get_stride();
+
+    return Dense<remove_complex<ValueType>>::create(
+        this->get_executor(), dim<2>{num_rows, num_cols},
+        make_array_view(
+            this->get_executor(), num_rows * stride,
+            reinterpret_cast<remove_complex<ValueType>*>(this->get_values())),
+        stride);
+}
+
+
+template <typename ValueType>
+std::unique_ptr<const typename Dense<ValueType>::real_type>
+Dense<ValueType>::create_real_view() const
+{
+    const auto num_rows = this->get_size()[0];
+    constexpr bool complex = is_complex<ValueType>();
+    const auto num_cols =
+        complex ? 2 * this->get_size()[1] : this->get_size()[1];
+    const auto stride = complex ? 2 * this->get_stride() : this->get_stride();
+
+    return Dense<remove_complex<ValueType>>::create_const(
+        this->get_executor(), dim<2>{num_rows, num_cols},
+        make_const_array_view(
+            this->get_executor(), num_rows * stride,
+            reinterpret_cast<const remove_complex<ValueType>*>(
+                this->get_const_values())),
+        stride);
+}
+
+
+template <typename ValueType>
+std::unique_ptr<Dense<ValueType>> Dense<ValueType>::create_submatrix_impl(
+    const span& rows, const span& columns, const size_type stride)
+{
+    row_major_range range_this{this->get_values(), this->get_size()[0],
+                               this->get_size()[1], this->get_stride()};
+    auto range_result = range_this(rows, columns);
+    size_type storage_size =
+        rows.length() > 0
+            ? range_result.length(0) * this->get_stride() - columns.begin
+            : 0;
+    return Dense::create(
+        this->get_executor(),
+        dim<2>{range_result.length(0), range_result.length(1)},
+        make_array_view(this->get_executor(), storage_size, range_result->data),
+        stride);
+}
+
+
 #define GKO_DECLARE_DENSE_MATRIX(_type) class Dense<_type>
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_MATRIX);
 
diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp
index 3325638fbde..a651fa41cec 100644
--- a/core/matrix/fbcsr.cpp
+++ b/core/matrix/fbcsr.cpp
@@ -232,9 +232,12 @@ void Fbcsr<ValueType, IndexType>::read(device_mat_data&& data)
     this->set_size(data.get_size());
     row_ptrs_.resize_and_reset(row_blocks + 1);
     auto exec = this->get_executor();
-    auto local_data = make_temporary_clone(exec, &data);
-    exec->run(fbcsr::make_fill_in_matrix_data(*local_data, bs_, row_ptrs_,
-                                              col_idxs_, values_));
+    {
+        auto local_data = make_temporary_clone(exec, &data);
+        exec->run(fbcsr::make_fill_in_matrix_data(*local_data, bs_, row_ptrs_,
+                                                  col_idxs_, values_));
+    }
+    // this needs to happen after the temporary clone copy-back
     data.empty_out();
 }
 
diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp
index e9f9352bf7b..22c7b61012e 100644
--- a/core/preconditioner/jacobi.cpp
+++ b/core/preconditioner/jacobi.cpp
@@ -273,9 +273,9 @@ void Jacobi<ValueType, IndexType>::generate(const LinOp* system_matrix,
         if (!diag_vt) {
             GKO_NOT_SUPPORTED(system_matrix);
         }
-        auto temp = Array<ValueType>::view(diag_vt->get_executor(),
-                                           diag_vt->get_size()[0],
-                                           diag_vt->get_values());
+        auto temp =
+            make_array_view(diag_vt->get_executor(), diag_vt->get_size()[0],
+                            diag_vt->get_values());
         this->blocks_ = Array<ValueType>(exec, temp.get_num_elems());
         exec->run(jacobi::make_invert_diagonal(temp, this->blocks_));
         this->num_blocks_ = diag_vt->get_size()[0];
diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp
index d0acb5c68cf..d48bd0bea7e 100644
--- a/core/test/base/array.cpp
+++ b/core/test/base/array.cpp
@@ -356,7 +356,7 @@ TYPED_TEST(Array, CanBeResized)
 TYPED_TEST(Array, ViewCannotBeResized)
 {
     TypeParam data[] = {1, 2, 3};
-    auto view = gko::Array<TypeParam>::view(this->exec, 3, data);
+    auto view = gko::make_array_view(this->exec, 3, data);
 
     EXPECT_THROW(view.resize_and_reset(1), gko::NotSupported);
     EXPECT_EQ(view.get_num_elems(), 3);
@@ -416,7 +416,7 @@ TYPED_TEST(Array, ChangesExecutors)
 TYPED_TEST(Array, ViewModifiesOriginalData)
 {
     TypeParam data[] = {1, 2, 3};
-    auto view = gko::Array<TypeParam>::view(this->exec, 3, data);
+    auto view = gko::make_array_view(this->exec, 3, data);
 
     TypeParam new_data[] = {5, 4, 2};
     std::copy(new_data, new_data + 3, view.get_data());
@@ -448,11 +448,11 @@ TYPED_TEST(Array, CopyArrayToArray)
 TYPED_TEST(Array, CopyViewToView)
 {
     TypeParam data[] = {1, 2, 3};
-    auto view = gko::Array<TypeParam>::view(this->exec, 3, data);
+    auto view = gko::make_array_view(this->exec, 3, data);
     TypeParam data2[] = {5, 4, 2};
-    auto view2 = gko::Array<TypeParam>::view(this->exec, 3, data2);
+    auto view2 = gko::make_array_view(this->exec, 3, data2);
     TypeParam data_size4[] = {5, 4, 2, 1};
-    auto view_size4 = gko::Array<TypeParam>::view(this->exec, 4, data_size4);
+    auto view_size4 = gko::make_array_view(this->exec, 4, data_size4);
 
     view = view2;
     view2.get_data()[0] = 2;
@@ -470,7 +470,7 @@ TYPED_TEST(Array, CopyViewToView)
 TYPED_TEST(Array, CopyViewToArray)
 {
     TypeParam data[] = {1, 2, 3, 4};
-    auto view = gko::Array<TypeParam>::view(this->exec, 4, data);
+    auto view = gko::make_array_view(this->exec, 4, data);
     gko::Array<TypeParam> array(this->exec, {5, 4, 2});
 
     array = view;
@@ -488,7 +488,7 @@ TYPED_TEST(Array, CopyViewToArray)
 TYPED_TEST(Array, CopyArrayToView)
 {
     TypeParam data[] = {1, 2, 3};
-    auto view = gko::Array<TypeParam>::view(this->exec, 3, data);
+    auto view = gko::make_array_view(this->exec, 3, data);
     gko::Array<TypeParam> array_size2(this->exec, {5, 4});
     gko::Array<TypeParam> array_size4(this->exec, {5, 4, 2, 1});
 
@@ -525,9 +525,9 @@ TYPED_TEST(Array, MoveArrayToArray)
 TYPED_TEST(Array, MoveViewToView)
 {
     TypeParam data[] = {1, 2, 3, 4};
-    auto view = gko::Array<TypeParam>::view(this->exec, 4, data);
+    auto view = gko::make_array_view(this->exec, 4, data);
     TypeParam data2[] = {5, 4, 2};
-    auto view2 = gko::Array<TypeParam>::view(this->exec, 3, data2);
+    auto view2 = gko::make_array_view(this->exec, 3, data2);
 
     view = std::move(view2);
 
@@ -550,7 +550,7 @@ TYPED_TEST(Array, MoveViewToArray)
 {
     TypeParam data[] = {1, 2, 3, 4};
     gko::Array<TypeParam> array(this->exec, {5, 4, 2});
-    auto view = gko::Array<TypeParam>::view(this->exec, 4, data);
+    auto view = gko::make_array_view(this->exec, 4, data);
 
     array = std::move(view);
 
@@ -572,7 +572,7 @@ TYPED_TEST(Array, MoveViewToArray)
 TYPED_TEST(Array, MoveArrayToView)
 {
     TypeParam data[] = {1, 2, 3};
-    auto view = gko::Array<TypeParam>::view(this->exec, 3, data);
+    auto view = gko::make_array_view(this->exec, 3, data);
     gko::Array<TypeParam> array_size2(this->exec, {5, 4});
     gko::Array<TypeParam> array_size4(this->exec, {5, 4, 2, 1});
     auto size2_ptr = array_size2.get_data();
diff --git a/core/test/matrix/coo.cpp b/core/test/matrix/coo.cpp
index 18438a6dd68..4cc17b145b9 100644
--- a/core/test/matrix/coo.cpp
+++ b/core/test/matrix/coo.cpp
@@ -142,9 +142,9 @@ TYPED_TEST(Coo, CanBeCreatedFromExistingData)
 
     auto mtx = gko::matrix::Coo<value_type, index_type>::create(
         this->exec, gko::dim<2>{3, 2},
-        gko::Array<value_type>::view(this->exec, 4, values),
-        gko::Array<index_type>::view(this->exec, 4, col_idxs),
-        gko::Array<index_type>::view(this->exec, 4, row_idxs));
+        gko::make_array_view(this->exec, 4, values),
+        gko::make_array_view(this->exec, 4, col_idxs),
+        gko::make_array_view(this->exec, 4, row_idxs));
 
     ASSERT_EQ(mtx->get_const_values(), values);
     ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs);
diff --git a/core/test/matrix/csr.cpp b/core/test/matrix/csr.cpp
index 166b1adb86b..85163b95e01 100644
--- a/core/test/matrix/csr.cpp
+++ b/core/test/matrix/csr.cpp
@@ -147,9 +147,9 @@ TYPED_TEST(Csr, CanBeCreatedFromExistingData)
 
     auto mtx = gko::matrix::Csr<value_type, index_type>::create(
         this->exec, gko::dim<2>{3, 2},
-        gko::Array<value_type>::view(this->exec, 4, values),
-        gko::Array<index_type>::view(this->exec, 4, col_idxs),
-        gko::Array<index_type>::view(this->exec, 4, row_ptrs),
+        gko::make_array_view(this->exec, 4, values),
+        gko::make_array_view(this->exec, 4, col_idxs),
+        gko::make_array_view(this->exec, 4, row_ptrs),
         std::make_shared<typename Mtx::load_balance>(2));
 
     ASSERT_EQ(mtx->get_num_srow_elements(), 1);
diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp
index 2c670fe8862..cbb94a5c605 100644
--- a/core/test/matrix/dense.cpp
+++ b/core/test/matrix/dense.cpp
@@ -60,8 +60,7 @@ class Dense : public ::testing::Test {
     static void assert_equal_to_original_mtx(gko::matrix::Dense<value_type>* m)
     {
         ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3));
-        ASSERT_EQ(m->get_stride(), 4);
-        ASSERT_EQ(m->get_num_stored_elements(), 2 * 4);
+        ASSERT_EQ(m->get_num_stored_elements(), 2 * m->get_stride());
         EXPECT_EQ(m->at(0, 0), value_type{1.0});
         EXPECT_EQ(m->at(0, 1), value_type{2.0});
         EXPECT_EQ(m->at(0, 2), value_type{3.0});
@@ -131,7 +130,7 @@ TYPED_TEST(Dense, CanBeConstructedFromExistingData)
 
     auto m = gko::matrix::Dense<TypeParam>::create(
         this->exec, gko::dim<2>{3, 2},
-        gko::Array<value_type>::view(this->exec, 9, data), 3);
+        gko::make_array_view(this->exec, 9, data), 3);
 
     ASSERT_EQ(m->get_const_values(), data);
     ASSERT_EQ(m->at(2, 1), value_type{6.0});
@@ -172,6 +171,7 @@ TYPED_TEST(Dense, CreateWithSameConfigKeepsStride)
 TYPED_TEST(Dense, KnowsItsSizeAndValues)
 {
     this->assert_equal_to_original_mtx(this->mtx.get());
+    ASSERT_EQ(this->mtx->get_stride(), 4);
 }
 
 
@@ -241,6 +241,8 @@ TYPED_TEST(Dense, CanBeCopied)
     this->assert_equal_to_original_mtx(this->mtx.get());
     this->mtx->at(0) = 7;
     this->assert_equal_to_original_mtx(mtx_copy.get());
+    ASSERT_EQ(this->mtx->get_stride(), 4);
+    ASSERT_EQ(mtx_copy->get_stride(), 3);
 }
 
 
@@ -249,14 +251,15 @@ TYPED_TEST(Dense, CanBeMoved)
     auto mtx_copy = gko::matrix::Dense<TypeParam>::create(this->exec);
     mtx_copy->copy_from(std::move(this->mtx));
     this->assert_equal_to_original_mtx(mtx_copy.get());
+    ASSERT_EQ(mtx_copy->get_stride(), 4);
 }
 
 
 TYPED_TEST(Dense, CanBeCloned)
 {
     auto mtx_clone = this->mtx->clone();
-    this->assert_equal_to_original_mtx(
-        dynamic_cast<decltype(this->mtx.get())>(mtx_clone.get()));
+    this->assert_equal_to_original_mtx(mtx_clone.get());
+    ASSERT_EQ(mtx_clone->get_stride(), 3);
 }
 
 
diff --git a/core/test/matrix/diagonal.cpp b/core/test/matrix/diagonal.cpp
index a9fca07aa37..bf8bcc221bf 100644
--- a/core/test/matrix/diagonal.cpp
+++ b/core/test/matrix/diagonal.cpp
@@ -108,7 +108,7 @@ TYPED_TEST(Diagonal, CanBeCreatedFromExistingData)
     value_type values[] = {1.0, 2.0, 3.0};
 
     auto diag = gko::matrix::Diagonal<value_type>::create(
-        this->exec, 3, gko::Array<value_type>::view(this->exec, 3, values));
+        this->exec, 3, gko::make_array_view(this->exec, 3, values));
 
     ASSERT_EQ(diag->get_const_values(), values);
 }
diff --git a/core/test/matrix/ell.cpp b/core/test/matrix/ell.cpp
index e7c9fd622e0..d792e00d302 100644
--- a/core/test/matrix/ell.cpp
+++ b/core/test/matrix/ell.cpp
@@ -146,8 +146,8 @@ TYPED_TEST(Ell, CanBeCreatedFromExistingData)
 
     auto mtx = gko::matrix::Ell<value_type, index_type>::create(
         this->exec, gko::dim<2>{3, 2},
-        gko::Array<value_type>::view(this->exec, 8, values),
-        gko::Array<index_type>::view(this->exec, 8, col_idxs), 2, 4);
+        gko::make_array_view(this->exec, 8, values),
+        gko::make_array_view(this->exec, 8, col_idxs), 2, 4);
 
     ASSERT_EQ(mtx->get_const_values(), values);
     ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs);
diff --git a/core/test/matrix/fbcsr.cpp b/core/test/matrix/fbcsr.cpp
index eddf6d4b753..4dc15755f48 100644
--- a/core/test/matrix/fbcsr.cpp
+++ b/core/test/matrix/fbcsr.cpp
@@ -398,9 +398,9 @@ TYPED_TEST(Fbcsr, CanBeCreatedFromExistingData)
 
     auto mtx = gko::matrix::Fbcsr<value_type, index_type>::create(
         this->exec, gko::dim<2>{nbrows * bs, nbcols * bs}, bs,
-        gko::Array<value_type>::view(this->exec, bnnz * bs * bs, values),
-        gko::Array<index_type>::view(this->exec, bnnz, col_idxs),
-        gko::Array<index_type>::view(this->exec, nbrows + 1, row_ptrs));
+        gko::make_array_view(this->exec, bnnz * bs * bs, values),
+        gko::make_array_view(this->exec, bnnz, col_idxs),
+        gko::make_array_view(this->exec, nbrows + 1, row_ptrs));
 
     ASSERT_EQ(mtx->get_const_values(), values);
     ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs);
diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp
index a5d21ba1782..5214f439ac9 100644
--- a/core/test/matrix/permutation.cpp
+++ b/core/test/matrix/permutation.cpp
@@ -136,7 +136,7 @@ TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingData)
 
     auto m = gko::matrix::Permutation<i_type>::create(
         this->exec, gko::dim<2>{3, 5},
-        gko::Array<i_type>::view(this->exec, 3, data));
+        gko::make_array_view(this->exec, 3, data));
 
     ASSERT_EQ(m->get_const_permutation(), data);
 }
@@ -191,7 +191,7 @@ TYPED_TEST(Permutation, PermutationThrowsforWrongRowPermDimensions)
 
     ASSERT_THROW(gko::matrix::Permutation<i_type>::create(
                      this->exec, gko::dim<2>{4, 2},
-                     gko::Array<i_type>::view(this->exec, 3, data)),
+                     gko::make_array_view(this->exec, 3, data)),
                  gko::ValueMismatch);
 }
 
@@ -203,7 +203,7 @@ TYPED_TEST(Permutation, SettingMaskDoesNotModifyData)
 
     auto m = gko::matrix::Permutation<i_type>::create(
         this->exec, gko::dim<2>{3, 5},
-        gko::Array<i_type>::view(this->exec, 3, data));
+        gko::make_array_view(this->exec, 3, data));
 
     auto mask = m->get_permute_mask();
     ASSERT_EQ(m->get_const_permutation(), data);
@@ -225,7 +225,7 @@ TYPED_TEST(Permutation, PermutationThrowsforWrongColPermDimensions)
 
     ASSERT_THROW(gko::matrix::Permutation<i_type>::create(
                      this->exec, gko::dim<2>{3, 4},
-                     gko::Array<i_type>::view(this->exec, 3, data),
+                     gko::make_array_view(this->exec, 3, data),
                      gko::matrix::column_permute),
                  gko::ValueMismatch);
 }
diff --git a/core/test/matrix/row_gatherer.cpp b/core/test/matrix/row_gatherer.cpp
index 51096a94893..68c3142ca0c 100644
--- a/core/test/matrix/row_gatherer.cpp
+++ b/core/test/matrix/row_gatherer.cpp
@@ -127,7 +127,7 @@ TYPED_TEST(RowGatherer, RowGathererCanBeConstructedFromExistingData)
 
     auto m = gko::matrix::RowGatherer<i_type>::create(
         this->exec, gko::dim<2>{3, 5},
-        gko::Array<i_type>::view(this->exec, 3, data));
+        gko::make_array_view(this->exec, 3, data));
 
     ASSERT_EQ(m->get_const_row_idxs(), data);
 }
@@ -140,7 +140,7 @@ TYPED_TEST(RowGatherer, RowGathererThrowsforWrongRowPermDimensions)
 
     ASSERT_THROW(gko::matrix::RowGatherer<i_type>::create(
                      this->exec, gko::dim<2>{4, 2},
-                     gko::Array<i_type>::view(this->exec, 3, data)),
+                     gko::make_array_view(this->exec, 3, data)),
                  gko::ValueMismatch);
 }
 
diff --git a/core/test/matrix/sparsity_csr.cpp b/core/test/matrix/sparsity_csr.cpp
index ec59661dfe0..d14a8897d19 100644
--- a/core/test/matrix/sparsity_csr.cpp
+++ b/core/test/matrix/sparsity_csr.cpp
@@ -154,8 +154,8 @@ TYPED_TEST(SparsityCsr, CanBeCreatedFromExistingData)
 
     auto mtx = gko::matrix::SparsityCsr<value_type, index_type>::create(
         this->exec, gko::dim<2>{3, 2},
-        gko::Array<index_type>::view(this->exec, 4, col_idxs),
-        gko::Array<index_type>::view(this->exec, 4, row_ptrs), 2.0);
+        gko::make_array_view(this->exec, 4, col_idxs),
+        gko::make_array_view(this->exec, 4, row_ptrs), 2.0);
 
     ASSERT_EQ(mtx->get_const_col_idxs(), col_idxs);
     ASSERT_EQ(mtx->get_const_row_ptrs(), row_ptrs);
diff --git a/core/test/utils/assertions_test.cpp b/core/test/utils/assertions_test.cpp
index 7c8609573f0..5f8d454fac7 100644
--- a/core/test/utils/assertions_test.cpp
+++ b/core/test/utils/assertions_test.cpp
@@ -54,7 +54,7 @@ class MatricesNear : public ::testing::Test {
     template <typename Type, std::size_t size>
     gko::Array<Type> make_view(std::array<Type, size>& array)
     {
-        return gko::Array<Type>::view(exec, size, array.data());
+        return gko::make_array_view(exec, size, array.data());
     }
 
     MatricesNear()
diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernel.cu
index be86588a909..e79042b8eec 100644
--- a/cuda/factorization/par_ilut_approx_filter_kernel.cu
+++ b/cuda/factorization/par_ilut_approx_filter_kernel.cu
@@ -161,9 +161,9 @@ void threshold_filter_approx(syn::value_list<int, subwarp_size>,
         matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
         coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
         coo_builder.get_col_idx_array() =
-            Array<IndexType>::view(exec, new_nnz, new_col_idxs);
+            make_array_view(exec, new_nnz, new_col_idxs);
         coo_builder.get_value_array() =
-            Array<ValueType>::view(exec, new_nnz, new_vals);
+            make_array_view(exec, new_nnz, new_vals);
         new_row_idxs = m_out_coo->get_row_idxs();
     }
     if (num_blocks > 0) {
diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernel.cu
index 6dd83c41835..0451582187f 100644
--- a/cuda/factorization/par_ilut_filter_kernel.cu
+++ b/cuda/factorization/par_ilut_filter_kernel.cu
@@ -117,9 +117,9 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
         matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
         coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
         coo_builder.get_col_idx_array() =
-            Array<IndexType>::view(exec, new_nnz, new_col_idxs);
+            make_array_view(exec, new_nnz, new_col_idxs);
         coo_builder.get_value_array() =
-            Array<ValueType>::view(exec, new_nnz, new_vals);
+            make_array_view(exec, new_nnz, new_vals);
         new_row_idxs = m_out_coo->get_row_idxs();
     }
     if (num_blocks > 0) {
diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernel.cu
index 6655db78296..92bd9e28ab8 100644
--- a/cuda/factorization/par_ilut_select_kernel.cu
+++ b/cuda/factorization/par_ilut_select_kernel.cu
@@ -159,7 +159,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
         if (step > 5) {
             Array<AbsType> cpu_out_array{
                 exec->get_master(),
-                Array<AbsType>::view(exec, bucket.size, tmp_out)};
+                make_array_view(exec, bucket.size, tmp_out)};
             auto begin = cpu_out_array.get_data();
             auto end = begin + bucket.size;
             auto middle = begin + rank;
diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
index d60323c40c0..2b83c4fdc22 100644
--- a/cuda/matrix/csr_kernels.cu
+++ b/cuda/matrix/csr_kernels.cu
@@ -36,6 +36,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <algorithm>
 
 
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+
+
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
@@ -47,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/components/fill_array_kernels.hpp"
+#include "core/components/format_conversion_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/dense_kernels.hpp"
@@ -97,6 +107,7 @@ using spgeam_kernels =
     syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
 
 
+#include "common/cuda_hip/matrix/csr_common.hpp.inc"
 #include "common/cuda_hip/matrix/csr_kernels.hpp.inc"
 
 
@@ -304,8 +315,6 @@ void load_balance_spmv(std::shared_ptr<const CudaExecutor> exec,
                     as_cuda_type(c->get_stride()));
             }
         }
-    } else {
-        GKO_NOT_SUPPORTED(nwarps);
     }
 }
 
@@ -897,16 +906,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
-void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* source, int bs,
-                      Array<IndexType>& row_ptrs, Array<IndexType>& col_idxs,
-                      Array<ValueType>& values) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
-
-
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const CudaExecutor> exec,
                const matrix::Csr<ValueType, IndexType>* orig,
@@ -1006,7 +1005,7 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
             idxBase, alg, buffer);
 #endif
         if (grid_size > 0) {
-            conjugate_kernel<<<grid_size, block_size, 0, 0>>>(
+            kernel::conjugate<<<grid_size, block_size, 0, 0>>>(
                 trans->get_num_stored_elements(),
                 as_cuda_type(trans->get_values()));
         }
@@ -1028,7 +1027,7 @@ void inv_symm_permute(std::shared_ptr<const CudaExecutor> exec,
     auto num_rows = orig->get_size()[0];
     auto count_num_blocks = ceildiv(num_rows, default_block_size);
     if (count_num_blocks > 0) {
-        inv_row_ptr_permute_kernel<<<count_num_blocks, default_block_size>>>(
+        kernel::inv_row_ptr_permute<<<count_num_blocks, default_block_size>>>(
             num_rows, perm, orig->get_const_row_ptrs(),
             permuted->get_row_ptrs());
     }
@@ -1036,7 +1035,7 @@ void inv_symm_permute(std::shared_ptr<const CudaExecutor> exec,
     auto copy_num_blocks =
         ceildiv(num_rows, default_block_size / config::warp_size);
     if (copy_num_blocks > 0) {
-        inv_symm_permute_kernel<config::warp_size>
+        kernel::inv_symm_permute<config::warp_size>
             <<<copy_num_blocks, default_block_size>>>(
                 num_rows, perm, orig->get_const_row_ptrs(),
                 orig->get_const_col_idxs(),
@@ -1059,7 +1058,7 @@ void row_permute(std::shared_ptr<const CudaExecutor> exec,
     auto num_rows = orig->get_size()[0];
     auto count_num_blocks = ceildiv(num_rows, default_block_size);
     if (count_num_blocks > 0) {
-        row_ptr_permute_kernel<<<count_num_blocks, default_block_size>>>(
+        kernel::row_ptr_permute<<<count_num_blocks, default_block_size>>>(
             num_rows, perm, orig->get_const_row_ptrs(),
             row_permuted->get_row_ptrs());
     }
@@ -1067,7 +1066,7 @@ void row_permute(std::shared_ptr<const CudaExecutor> exec,
     auto copy_num_blocks =
         ceildiv(num_rows, default_block_size / config::warp_size);
     if (copy_num_blocks > 0) {
-        row_permute_kernel<config::warp_size>
+        kernel::row_permute<config::warp_size>
             <<<copy_num_blocks, default_block_size>>>(
                 num_rows, perm, orig->get_const_row_ptrs(),
                 orig->get_const_col_idxs(),
@@ -1090,7 +1089,7 @@ void inverse_row_permute(std::shared_ptr<const CudaExecutor> exec,
     auto num_rows = orig->get_size()[0];
     auto count_num_blocks = ceildiv(num_rows, default_block_size);
     if (count_num_blocks > 0) {
-        inv_row_ptr_permute_kernel<<<count_num_blocks, default_block_size>>>(
+        kernel::inv_row_ptr_permute<<<count_num_blocks, default_block_size>>>(
             num_rows, perm, orig->get_const_row_ptrs(),
             row_permuted->get_row_ptrs());
     }
@@ -1098,7 +1097,7 @@ void inverse_row_permute(std::shared_ptr<const CudaExecutor> exec,
     auto copy_num_blocks =
         ceildiv(num_rows, default_block_size / config::warp_size);
     if (copy_num_blocks > 0) {
-        inv_row_permute_kernel<config::warp_size>
+        kernel::inv_row_permute<config::warp_size>
             <<<copy_num_blocks, default_block_size>>>(
                 num_rows, perm, orig->get_const_row_ptrs(),
                 orig->get_const_col_idxs(),
@@ -1247,7 +1246,7 @@ void is_sorted_by_column_index(
     const matrix::Csr<ValueType, IndexType>* to_check, bool* is_sorted)
 {
     *is_sorted = true;
-    auto cpu_array = Array<bool>::view(exec->get_master(), 1, is_sorted);
+    auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted);
     auto gpu_array = Array<bool>{exec, cpu_array};
     auto block_size = default_block_size;
     auto num_rows = static_cast<IndexType>(to_check->get_size()[0]);
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 9537d6595f0..bc55a60ac93 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/fbcsr.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
@@ -291,7 +292,18 @@ template <typename ValueType, typename IndexType>
 void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
                       const matrix::Dense<ValueType>* source,
                       matrix::Fbcsr<ValueType, IndexType>* result)
-    GKO_NOT_IMPLEMENTED;
+{
+    const auto num_block_rows = result->get_num_block_rows();
+    if (num_block_rows > 0) {
+        const auto num_blocks =
+            ceildiv(num_block_rows, default_block_size / config::warp_size);
+        kernel::convert_to_fbcsr<<<num_blocks, default_block_size>>>(
+            num_block_rows, result->get_num_block_cols(), source->get_stride(),
+            result->get_block_size(), as_cuda_type(source->get_const_values()),
+            result->get_const_row_ptrs(), result->get_col_idxs(),
+            as_cuda_type(result->get_values()));
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
@@ -300,8 +312,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void count_nonzero_blocks_per_row(std::shared_ptr<const DefaultExecutor> exec,
                                   const matrix::Dense<ValueType>* source,
-                                  int bs,
-                                  IndexType* result) GKO_NOT_IMPLEMENTED;
+                                  int bs, IndexType* result)
+{
+    const auto num_block_rows = source->get_size()[0] / bs;
+    const auto num_block_cols = source->get_size()[1] / bs;
+    if (num_block_rows > 0) {
+        const auto num_blocks =
+            ceildiv(num_block_rows, default_block_size / config::warp_size);
+        kernel::
+            count_nonzero_blocks_per_row<<<num_blocks, default_block_size>>>(
+                num_block_rows, num_block_cols, source->get_stride(), bs,
+                as_cuda_type(source->get_const_values()), result);
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL);
@@ -407,9 +430,9 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
             auto beta = zero<ValueType>();
             cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, orig->get_size()[0],
                          orig->get_size()[1], &alpha, orig->get_const_values(),
-                         orig->get_stride(), &beta,
-                         static_cast<ValueType*>(nullptr), trans->get_size()[1],
-                         trans->get_values(), trans->get_stride());
+                         orig->get_stride(), &beta, trans->get_values(),
+                         trans->get_stride(), trans->get_values(),
+                         trans->get_stride());
         }
     } else {
         GKO_NOT_IMPLEMENTED;
@@ -432,9 +455,9 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
             auto beta = zero<ValueType>();
             cublas::geam(handle, CUBLAS_OP_C, CUBLAS_OP_N, orig->get_size()[0],
                          orig->get_size()[1], &alpha, orig->get_const_values(),
-                         orig->get_stride(), &beta,
-                         static_cast<ValueType*>(nullptr), trans->get_size()[1],
-                         trans->get_values(), trans->get_stride());
+                         orig->get_stride(), &beta, trans->get_values(),
+                         trans->get_stride(), trans->get_values(),
+                         trans->get_stride());
         }
     } else {
         GKO_NOT_IMPLEMENTED;
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index c906bb7fd1a..58c59af2cc2 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -242,9 +242,7 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
      */
     const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
-        components::fill_array(exec, c->get_values(),
-                               c->get_num_stored_elements(),
-                               zero<OutputValueType>());
+        dense::fill(exec, c, zero<OutputValueType>());
     }
     select_abstract_spmv(
         compiled_kernels(),
diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu
index c9c4782d262..bec6ce29807 100644
--- a/cuda/matrix/fbcsr_kernels.cu
+++ b/cuda/matrix/fbcsr_kernels.cu
@@ -57,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
+#include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/config.hpp"
 #include "cuda/base/cublas_bindings.hpp"
@@ -70,6 +71,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/components/merging.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
 
 
 namespace gko {
@@ -77,20 +79,6 @@ namespace kernels {
 namespace cuda {
 
 
-namespace csr_reuse {
-
-
-constexpr int warps_in_block = 4;
-constexpr int spmv_block_size = warps_in_block * config::warp_size;
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/matrix/csr_kernels.hpp.inc"
-
-
-}  // namespace csr_reuse
-
-
 /**
  * @brief The fixed-size block compressed sparse row matrix format namespace.
  *
@@ -102,7 +90,7 @@ namespace fbcsr {
 constexpr int default_block_size{512};
 
 
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
+#include "common/cuda_hip/matrix/csr_common.hpp.inc"
 #include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc"
 
 
@@ -114,6 +102,9 @@ void dense_transpose(std::shared_ptr<const CudaExecutor> exec,
                      const size_type orig_stride, const ValueType* const orig,
                      const size_type trans_stride, ValueType* const trans)
 {
+    if (nrows == 0) {
+        return;
+    }
     if (cublas::is_supported<ValueType>::value) {
         auto handle = exec->get_cublas_handle();
         {
@@ -121,8 +112,7 @@ void dense_transpose(std::shared_ptr<const CudaExecutor> exec,
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
             cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, nrows, ncols, &alpha,
-                         orig, orig_stride, &beta,
-                         static_cast<ValueType*>(nullptr), nrows, trans,
+                         orig, orig_stride, &beta, trans, trans_stride, trans,
                          trans_stride);
         }
     } else {
@@ -139,6 +129,15 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
           const matrix::Dense<ValueType>* const b,
           matrix::Dense<ValueType>* const c)
 {
+    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
+        // empty output: nothing to do
+        return;
+    }
+    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
+        // empty input: fill output with zero
+        dense::fill(exec, c, zero<ValueType>());
+        return;
+    }
     if (cusparse::is_supported<ValueType, IndexType>::value) {
         auto handle = exec->get_cusparse_handle();
         cusparse::pointer_mode_guard pm_guard(handle);
@@ -153,21 +152,24 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
         const IndexType nb = a->get_num_block_cols();
         const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
         const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
-        assert(nrhs == c->get_size()[1]);
-        if (nrhs == 1) {
+        const auto nrows = a->get_size()[0];
+        const auto ncols = a->get_size()[1];
+        const auto in_stride = b->get_stride();
+        const auto out_stride = c->get_stride();
+        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
             cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb,
                             nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs,
                             b->get_const_values(), &beta, c->get_values());
         } else {
-            auto trans_c =
-                Array<ValueType>(exec, c->get_size()[0] * c->get_size()[1]);
+            const auto trans_stride = nrows;
+            auto trans_c = Array<ValueType>(exec, nrows * nrhs);
             cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
                             &alpha, descr, values, row_ptrs, col_idxs, bs,
-                            b->get_const_values(), nrhs, &beta,
-                            trans_c.get_data(), mb * bs);
-            dense_transpose(exec, nrhs, mb * bs, mb * bs, trans_c.get_data(),
-                            c->get_stride(), c->get_values());
+                            b->get_const_values(), in_stride, &beta,
+                            trans_c.get_data(), trans_stride);
+            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
+                            out_stride, c->get_values());
         }
         cusparse::destroy(descr);
     } else {
@@ -186,6 +188,15 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
                    const matrix::Dense<ValueType>* const beta,
                    matrix::Dense<ValueType>* const c)
 {
+    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
+        // empty output: nothing to do
+        return;
+    }
+    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
+        // empty input: scale output
+        dense::scale(exec, beta, c);
+        return;
+    }
     if (cusparse::is_supported<ValueType, IndexType>::value) {
         auto handle = exec->get_cusparse_handle();
         const auto alphp = alpha->get_const_values();
@@ -199,23 +210,26 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
         const IndexType nb = a->get_num_block_cols();
         const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
         const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
-        assert(nrhs == c->get_size()[1]);
-        if (nrhs == 1) {
+        const auto nrows = a->get_size()[0];
+        const auto ncols = a->get_size()[1];
+        const auto in_stride = b->get_stride();
+        const auto out_stride = c->get_stride();
+        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
             cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb,
                             nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
                             b->get_const_values(), betap, c->get_values());
         } else {
-            auto trans_c =
-                Array<ValueType>(exec, c->get_size()[0] * c->get_size()[1]);
-            dense_transpose(exec, mb * bs, nrhs, c->get_stride(),
-                            c->get_values(), mb * bs, trans_c.get_data());
+            const auto trans_stride = nrows;
+            auto trans_c = Array<ValueType>(exec, nrows * nrhs);
+            dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
+                            trans_stride, trans_c.get_data());
             cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
                             alphp, descr, values, row_ptrs, col_idxs, bs,
-                            b->get_const_values(), nrhs, betap,
-                            trans_c.get_data(), mb * bs);
-            dense_transpose(exec, nrhs, mb * bs, mb * bs, trans_c.get_data(),
-                            c->get_stride(), c->get_values());
+                            b->get_const_values(), in_stride, betap,
+                            trans_c.get_data(), trans_stride);
+            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
+                            out_stride, c->get_values());
         }
         cusparse::destroy(descr);
     } else {
@@ -230,7 +244,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void fill_in_dense(std::shared_ptr<const CudaExecutor> exec,
                    const matrix::Fbcsr<ValueType, IndexType>* source,
-                   matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;
+                   matrix::Dense<ValueType>* result)
+{
+    constexpr auto warps_per_block = default_block_size / config::warp_size;
+    const auto num_blocks =
+        ceildiv(source->get_num_block_rows(), warps_per_block);
+    if (num_blocks > 0) {
+        kernel::fill_in_dense<<<num_blocks, default_block_size>>>(
+            source->get_const_row_ptrs(), source->get_const_col_idxs(),
+            as_cuda_type(source->get_const_values()),
+            as_cuda_type(result->get_values()), result->get_stride(),
+            source->get_num_block_rows(), source->get_block_size());
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
@@ -329,7 +355,7 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
         ceildiv(trans->get_num_stored_elements(), default_block_size);
     transpose(exec, orig, trans);
     if (grid_size > 0) {
-        csr_reuse::conjugate_kernel<<<grid_size, default_block_size>>>(
+        kernel::conjugate<<<grid_size, default_block_size>>>(
             trans->get_num_stored_elements(),
             as_cuda_type(trans->get_values()));
     }
@@ -348,14 +374,14 @@ void is_sorted_by_column_index(
     *is_sorted = true;
     auto gpu_array = Array<bool>(exec, 1);
     // need to initialize the GPU value to true
-    exec->copy_from<bool>(exec->get_master().get(), 1, is_sorted,
-                          gpu_array.get_data());
+    exec->copy_from(exec->get_master().get(), 1, is_sorted,
+                    gpu_array.get_data());
     auto block_size = default_block_size;
     const auto num_brows =
         static_cast<IndexType>(to_check->get_num_block_rows());
     const auto num_blocks = ceildiv(num_brows, block_size);
     if (num_blocks > 0) {
-        csr_reuse::kernel::check_unsorted<<<num_blocks, block_size>>>(
+        kernel::check_unsorted<<<num_blocks, block_size>>>(
             to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(),
             num_brows, gpu_array.get_data());
     }
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index f108eacec05..fb8066dbb61 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -676,16 +676,27 @@ void abstract_classical_spmv(dim3 grid, dim3 block,
                              const size_type b_stride, ValueType* c,
                              const size_type c_stride)
 {
-    queue->submit([&](sycl::handler& cgh) {
-        cgh.parallel_for(
-            sycl_nd_range(grid, block), [=
-        ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
-                                            subgroup_size)]] {
-                abstract_classical_spmv<subgroup_size>(num_rows, val, col_idxs,
-                                                       row_ptrs, b, b_stride, c,
-                                                       c_stride, item_ct1);
-            });
-    });
+    if (subgroup_size > 1) {
+        queue->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl_nd_range(grid, block), [=
+            ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
+                                                subgroup_size)]] {
+                    abstract_classical_spmv<subgroup_size>(
+                        num_rows, val, col_idxs, row_ptrs, b, b_stride, c,
+                        c_stride, item_ct1);
+                });
+        });
+    } else {
+        queue->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(sycl_nd_range(grid, block),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 abstract_classical_spmv<subgroup_size>(
+                                     num_rows, val, col_idxs, row_ptrs, b,
+                                     b_stride, c, c_stride, item_ct1);
+                             });
+        });
+    }
 }
 
 
@@ -718,14 +729,27 @@ void abstract_classical_spmv(dim3 grid, dim3 block,
                              const size_type b_stride, const ValueType* beta,
                              ValueType* c, const size_type c_stride)
 {
-    queue->submit([&](sycl::handler& cgh) {
-        cgh.parallel_for(sycl_nd_range(grid, block),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             abstract_classical_spmv<subgroup_size>(
-                                 num_rows, alpha, val, col_idxs, row_ptrs, b,
-                                 b_stride, beta, c, c_stride, item_ct1);
-                         });
-    });
+    if (subgroup_size > 1) {
+        queue->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl_nd_range(grid, block), [=
+            ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
+                                                subgroup_size)]] {
+                    abstract_classical_spmv<subgroup_size>(
+                        num_rows, alpha, val, col_idxs, row_ptrs, b, b_stride,
+                        beta, c, c_stride, item_ct1);
+                });
+        });
+    } else {
+        queue->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(sycl_nd_range(grid, block),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 abstract_classical_spmv<subgroup_size>(
+                                     num_rows, alpha, val, col_idxs, row_ptrs,
+                                     b, b_stride, beta, c, c_stride, item_ct1);
+                             });
+        });
+    }
 }
 
 
@@ -1125,9 +1149,15 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
 {
     if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
         // empty output: nothing to do
-    } else if (a->get_strategy()->get_name() == "load_balance") {
-        components::fill_array(exec, c->get_values(),
-                               c->get_num_stored_elements(), zero<ValueType>());
+        return;
+    }
+    if (b->get_size()[0] == 0 || a->get_num_stored_elements() == 0) {
+        // empty input: zero output
+        dense::fill(exec, c, zero<ValueType>());
+        return;
+    }
+    if (a->get_strategy()->get_name() == "load_balance") {
+        dense::fill(exec, c, zero<ValueType>());
         const IndexType nwarps = a->get_num_srow_elements();
         if (nwarps > 0) {
             const dim3 csr_block(config::warp_size, warps_in_block, 1);
@@ -1139,8 +1169,6 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
                 a->get_const_col_idxs(), a->get_const_row_ptrs(),
                 a->get_const_srow(), b->get_const_values(), b->get_stride(),
                 c->get_values(), c->get_stride());
-        } else {
-            GKO_NOT_SUPPORTED(nwarps);
         }
     } else if (a->get_strategy()->get_name() == "merge_path") {
         int items_per_thread =
@@ -1219,7 +1247,14 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
 {
     if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
         // empty output: nothing to do
-    } else if (a->get_strategy()->get_name() == "load_balance") {
+        return;
+    }
+    if (b->get_size()[0] == 0 || a->get_num_stored_elements() == 0) {
+        // empty input: scale output
+        dense::scale(exec, beta, c);
+        return;
+    }
+    if (a->get_strategy()->get_name() == "load_balance") {
         dense::scale(exec, beta, c);
 
         const IndexType nwarps = a->get_num_srow_elements();
@@ -1235,8 +1270,6 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
                 a->get_const_col_idxs(), a->get_const_row_ptrs(),
                 a->get_const_srow(), b->get_const_values(), b->get_stride(),
                 c->get_values(), c->get_stride());
-        } else {
-            GKO_NOT_SUPPORTED(nwarps);
         }
     } else if (a->get_strategy()->get_name() == "sparselib" ||
                a->get_strategy()->get_name() == "cusparse") {
diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp
index fbd1063bf13..8be02fb7988 100644
--- a/dpcpp/matrix/ell_kernels.dp.cpp
+++ b/dpcpp/matrix/ell_kernels.dp.cpp
@@ -429,9 +429,7 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
      */
     const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
-        components::fill_array(exec, c->get_values(),
-                               c->get_num_stored_elements(),
-                               zero<OutputValueType>());
+        dense::fill(exec, c, zero<OutputValueType>());
     }
     select_abstract_spmv(
         compiled_kernels(),
diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp
new file mode 100644
index 00000000000..e92c329fba1
--- /dev/null
+++ b/hip/base/hipsparse_block_bindings.hip.hpp
@@ -0,0 +1,164 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2022, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_
+#define GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_
+
+
+#include <hipsparse.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "hip/base/types.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The HIPSPARSE namespace.
+ *
+ * @ingroup hipsparse
+ */
+namespace hipsparse {
+
+
+/// Default storage layout within each small dense block
+constexpr hipsparseDirection_t blockDir = HIPSPARSE_DIRECTION_COLUMN;
+
+
+#define GKO_BIND_HIPSPARSE32_BSRMV(ValueType, HipsparseName)                   \
+    inline void bsrmv(hipsparseHandle_t handle, hipsparseOperation_t transA,   \
+                      int32 mb, int32 nb, int32 nnzb, const ValueType* alpha,  \
+                      const hipsparseMatDescr_t descrA, const ValueType* valA, \
+                      const int32* rowPtrA, const int32* colIndA,              \
+                      int block_size, const ValueType* x,                      \
+                      const ValueType* beta, ValueType* y)                     \
+    {                                                                          \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                          \
+            handle, blockDir, transA, mb, nb, nnzb, as_hiplibs_type(alpha),    \
+            descrA, as_hiplibs_type(valA), rowPtrA, colIndA, block_size,       \
+            as_hiplibs_type(x), as_hiplibs_type(beta), as_hiplibs_type(y)));   \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+
+#define GKO_BIND_HIPSPARSE64_BSRMV(ValueType, HipsparseName)                   \
+    inline void bsrmv(hipsparseHandle_t handle, hipsparseOperation_t transA,   \
+                      int64 mb, int64 nb, int64 nnzb, const ValueType* alpha,  \
+                      const hipsparseMatDescr_t descrA, const ValueType* valA, \
+                      const int64* rowPtrA, const int64* colIndA,              \
+                      int block_size, const ValueType* x,                      \
+                      const ValueType* beta, ValueType* y)                     \
+        GKO_NOT_IMPLEMENTED;                                                   \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+
+GKO_BIND_HIPSPARSE32_BSRMV(float, hipsparseSbsrmv);
+GKO_BIND_HIPSPARSE32_BSRMV(double, hipsparseDbsrmv);
+GKO_BIND_HIPSPARSE32_BSRMV(std::complex<float>, hipsparseCbsrmv);
+GKO_BIND_HIPSPARSE32_BSRMV(std::complex<double>, hipsparseZbsrmv);
+GKO_BIND_HIPSPARSE64_BSRMV(float, hipsparseSbsrmv);
+GKO_BIND_HIPSPARSE64_BSRMV(double, hipsparseDbsrmv);
+GKO_BIND_HIPSPARSE64_BSRMV(std::complex<float>, hipsparseCbsrmv);
+GKO_BIND_HIPSPARSE64_BSRMV(std::complex<double>, hipsparseZbsrmv);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE32_BSRMV(ValueType, detail::not_implemented);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE64_BSRMV(ValueType, detail::not_implemented);
+
+
+#undef GKO_BIND_HIPSPARSE32_BSRMV
+#undef GKO_BIND_HIPSPARSE64_BSRMV
+
+
+#define GKO_BIND_HIPSPARSE32_BSRMM(ValueType, HipsparseName)                   \
+    inline void bsrmm(hipsparseHandle_t handle, hipsparseOperation_t transA,   \
+                      hipsparseOperation_t transB, int32 mb, int32 n,          \
+                      int32 kb, int32 nnzb, const ValueType* alpha,            \
+                      const hipsparseMatDescr_t descrA, const ValueType* valA, \
+                      const int32* rowPtrA, const int32* colIndA,              \
+                      int block_size, const ValueType* B, int32 ldb,           \
+                      const ValueType* beta, ValueType* C, int32 ldc)          \
+    {                                                                          \
+        GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                          \
+            handle, blockDir, transA, transB, mb, n, kb, nnzb,                 \
+            as_hiplibs_type(alpha), descrA, as_hiplibs_type(valA), rowPtrA,    \
+            colIndA, block_size, as_hiplibs_type(B), ldb,                      \
+            as_hiplibs_type(beta), as_hiplibs_type(C), ldc));                  \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+
+#define GKO_BIND_HIPSPARSE64_BSRMM(ValueType, HipsparseName)                  \
+    inline void bsrmm(                                                        \
+        hipsparseHandle_t handle, hipsparseOperation_t transA,                \
+        hipsparseOperation_t transB, int64 mb, int64 n, int64 kb, int64 nnzb, \
+        const ValueType* alpha, const hipsparseMatDescr_t descrA,             \
+        const ValueType* valA, const int64* rowPtrA, const int64* colIndA,    \
+        int block_size, const ValueType* B, int64 ldb, const ValueType* beta, \
+        ValueType* C, int64 ldc) GKO_NOT_IMPLEMENTED;                         \
+    static_assert(true,                                                       \
+                  "This assert is used to counter the false positive extra "  \
+                  "semi-colon warnings")
+
+GKO_BIND_HIPSPARSE32_BSRMM(float, hipsparseSbsrmm);
+GKO_BIND_HIPSPARSE32_BSRMM(double, hipsparseDbsrmm);
+GKO_BIND_HIPSPARSE32_BSRMM(std::complex<float>, hipsparseCbsrmm);
+GKO_BIND_HIPSPARSE32_BSRMM(std::complex<double>, hipsparseZbsrmm);
+GKO_BIND_HIPSPARSE64_BSRMM(float, hipsparseSbsrmm);
+GKO_BIND_HIPSPARSE64_BSRMM(double, hipsparseDbsrmm);
+GKO_BIND_HIPSPARSE64_BSRMM(std::complex<float>, hipsparseCbsrmm);
+GKO_BIND_HIPSPARSE64_BSRMM(std::complex<double>, hipsparseZbsrmm);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE32_BSRMM(ValueType, detail::not_implemented);
+template <typename ValueType>
+GKO_BIND_HIPSPARSE64_BSRMM(ValueType, detail::not_implemented);
+
+
+#undef GKO_BIND_HIPSPARSE32_BSRMM
+#undef GKO_BIND_HIPSPARSE64_BSRMM
+
+
+}  // namespace hipsparse
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_
diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp
index d484af93d2a..f86bed4fa76 100644
--- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp
@@ -165,9 +165,9 @@ void threshold_filter_approx(syn::value_list<int, subwarp_size>,
         matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
         coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
         coo_builder.get_col_idx_array() =
-            Array<IndexType>::view(exec, new_nnz, new_col_idxs);
+            make_array_view(exec, new_nnz, new_col_idxs);
         coo_builder.get_value_array() =
-            Array<ValueType>::view(exec, new_nnz, new_vals);
+            make_array_view(exec, new_nnz, new_vals);
         new_row_idxs = m_out_coo->get_row_idxs();
     }
     if (num_blocks > 0) {
diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernel.hip.cpp
index cf691f58f8b..7b335c08175 100644
--- a/hip/factorization/par_ilut_filter_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp
@@ -120,9 +120,9 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
         matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
         coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
         coo_builder.get_col_idx_array() =
-            Array<IndexType>::view(exec, new_nnz, new_col_idxs);
+            make_array_view(exec, new_nnz, new_col_idxs);
         coo_builder.get_value_array() =
-            Array<ValueType>::view(exec, new_nnz, new_vals);
+            make_array_view(exec, new_nnz, new_vals);
         new_row_idxs = m_out_coo->get_row_idxs();
     }
     if (num_blocks > 0) {
diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernel.hip.cpp
index 8130a76cb18..7e38a6c67a0 100644
--- a/hip/factorization/par_ilut_select_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_select_kernel.hip.cpp
@@ -163,7 +163,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
         if (step > 5) {
             Array<AbsType> cpu_out_array{
                 exec->get_master(),
-                Array<AbsType>::view(exec, bucket.size, tmp_out)};
+                make_array_view(exec, bucket.size, tmp_out)};
             auto begin = cpu_out_array.get_data();
             auto end = begin + bucket.size;
             auto middle = begin + rank;
diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp
index a9fcb49e8e6..5bac002f63c 100644
--- a/hip/matrix/csr_kernels.hip.cpp
+++ b/hip/matrix/csr_kernels.hip.cpp
@@ -37,6 +37,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <hip/hip_runtime.h>
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
 
 
 #include <ginkgo/core/base/array.hpp>
@@ -50,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/components/fill_array_kernels.hpp"
+#include "core/components/format_conversion_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/dense_kernels.hpp"
@@ -99,6 +107,7 @@ using spgeam_kernels =
     syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
 
 
+#include "common/cuda_hip/matrix/csr_common.hpp.inc"
 #include "common/cuda_hip/matrix/csr_kernels.hpp.inc"
 
 
@@ -280,8 +289,7 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
     if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
         // empty output: nothing to do
     } else if (a->get_strategy()->get_name() == "load_balance") {
-        components::fill_array(exec, c->get_values(),
-                               c->get_num_stored_elements(), zero<ValueType>());
+        dense::fill(exec, c, zero<ValueType>());
         const IndexType nwarps = a->get_num_srow_elements();
         if (nwarps > 0) {
             const dim3 csr_block(config::warp_size, warps_in_block, 1);
@@ -296,8 +304,6 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
                 as_hip_type(b->get_const_values()),
                 as_hip_type(b->get_stride()), as_hip_type(c->get_values()),
                 as_hip_type(c->get_stride()));
-        } else {
-            GKO_NOT_SUPPORTED(nwarps);
         }
     } else if (a->get_strategy()->get_name() == "merge_path") {
         int items_per_thread =
@@ -392,8 +398,6 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
                 as_hip_type(b->get_const_values()),
                 as_hip_type(b->get_stride()), as_hip_type(c->get_values()),
                 as_hip_type(c->get_stride()));
-        } else {
-            GKO_NOT_SUPPORTED(nwarps);
         }
     } else if (a->get_strategy()->get_name() == "merge_path") {
         int items_per_thread =
@@ -730,16 +734,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
-void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* source, int bs,
-                      Array<IndexType>& row_ptrs, Array<IndexType>& col_idxs,
-                      Array<ValueType>& values) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
-
-
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const HipExecutor> exec,
                const matrix::Csr<ValueType, IndexType>* orig,
@@ -790,7 +784,7 @@ void conj_transpose(std::shared_ptr<const HipExecutor> exec,
             trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
 
         if (grid_size > 0) {
-            hipLaunchKernelGGL(conjugate_kernel, grid_size, block_size, 0, 0,
+            hipLaunchKernelGGL(kernel::conjugate, grid_size, block_size, 0, 0,
                                trans->get_num_stored_elements(),
                                as_hip_type(trans->get_values()));
         }
@@ -812,7 +806,7 @@ void inv_symm_permute(std::shared_ptr<const HipExecutor> exec,
     auto num_rows = orig->get_size()[0];
     auto count_num_blocks = ceildiv(num_rows, default_block_size);
     if (count_num_blocks > 0) {
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(inv_row_ptr_permute_kernel),
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::inv_row_ptr_permute),
                            count_num_blocks, default_block_size, 0, 0, num_rows,
                            perm, orig->get_const_row_ptrs(),
                            permuted->get_row_ptrs());
@@ -822,7 +816,7 @@ void inv_symm_permute(std::shared_ptr<const HipExecutor> exec,
         ceildiv(num_rows, default_block_size / config::warp_size);
     if (copy_num_blocks > 0) {
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(inv_symm_permute_kernel<config::warp_size>),
+            HIP_KERNEL_NAME(kernel::inv_symm_permute<config::warp_size>),
             copy_num_blocks, default_block_size, 0, 0, num_rows, perm,
             orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
             as_hip_type(orig->get_const_values()), permuted->get_row_ptrs(),
@@ -842,7 +836,7 @@ void row_permute(std::shared_ptr<const HipExecutor> exec, const IndexType* perm,
     auto num_rows = orig->get_size()[0];
     auto count_num_blocks = ceildiv(num_rows, default_block_size);
     if (count_num_blocks > 0) {
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(row_ptr_permute_kernel),
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::row_ptr_permute),
                            count_num_blocks, default_block_size, 0, 0, num_rows,
                            perm, orig->get_const_row_ptrs(),
                            row_permuted->get_row_ptrs());
@@ -852,7 +846,7 @@ void row_permute(std::shared_ptr<const HipExecutor> exec, const IndexType* perm,
         ceildiv(num_rows, default_block_size / config::warp_size);
     if (copy_num_blocks > 0) {
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(row_permute_kernel<config::warp_size>),
+            HIP_KERNEL_NAME(kernel::row_permute<config::warp_size>),
             copy_num_blocks, default_block_size, 0, 0, num_rows, perm,
             orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
             as_hip_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
@@ -874,7 +868,7 @@ void inverse_row_permute(std::shared_ptr<const HipExecutor> exec,
     auto num_rows = orig->get_size()[0];
     auto count_num_blocks = ceildiv(num_rows, default_block_size);
     if (count_num_blocks > 0) {
-        hipLaunchKernelGGL(HIP_KERNEL_NAME(inv_row_ptr_permute_kernel),
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::inv_row_ptr_permute),
                            count_num_blocks, default_block_size, 0, 0, num_rows,
                            perm, orig->get_const_row_ptrs(),
                            row_permuted->get_row_ptrs());
@@ -884,7 +878,7 @@ void inverse_row_permute(std::shared_ptr<const HipExecutor> exec,
         ceildiv(num_rows, default_block_size / config::warp_size);
     if (copy_num_blocks > 0) {
         hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(inv_row_permute_kernel<config::warp_size>),
+            HIP_KERNEL_NAME(kernel::inv_row_permute<config::warp_size>),
             copy_num_blocks, default_block_size, 0, 0, num_rows, perm,
             orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
             as_hip_type(orig->get_const_values()), row_permuted->get_row_ptrs(),
@@ -1027,7 +1021,7 @@ void is_sorted_by_column_index(
     const matrix::Csr<ValueType, IndexType>* to_check, bool* is_sorted)
 {
     *is_sorted = true;
-    auto cpu_array = Array<bool>::view(exec->get_master(), 1, is_sorted);
+    auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted);
     auto gpu_array = Array<bool>{exec, cpu_array};
     auto block_size = default_block_size;
     auto num_rows = static_cast<IndexType>(to_check->get_size()[0]);
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index 6ace089e2ea..4b63be8f905 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/fbcsr.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
@@ -296,7 +297,18 @@ template <typename ValueType, typename IndexType>
 void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
                       const matrix::Dense<ValueType>* source,
                       matrix::Fbcsr<ValueType, IndexType>* result)
-    GKO_NOT_IMPLEMENTED;
+{
+    const auto num_block_rows = result->get_num_block_rows();
+    if (num_block_rows > 0) {
+        const auto num_blocks =
+            ceildiv(num_block_rows, default_block_size / config::warp_size);
+        kernel::convert_to_fbcsr<<<num_blocks, default_block_size>>>(
+            num_block_rows, result->get_num_block_cols(), source->get_stride(),
+            result->get_block_size(), as_hip_type(source->get_const_values()),
+            result->get_const_row_ptrs(), result->get_col_idxs(),
+            as_hip_type(result->get_values()));
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL);
@@ -305,8 +317,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void count_nonzero_blocks_per_row(std::shared_ptr<const DefaultExecutor> exec,
                                   const matrix::Dense<ValueType>* source,
-                                  int bs,
-                                  IndexType* result) GKO_NOT_IMPLEMENTED;
+                                  int bs, IndexType* result)
+{
+    const auto num_block_rows = source->get_size()[0] / bs;
+    const auto num_block_cols = source->get_size()[1] / bs;
+    if (num_block_rows > 0) {
+        const auto num_blocks =
+            ceildiv(num_block_rows, default_block_size / config::warp_size);
+        kernel::
+            count_nonzero_blocks_per_row<<<num_blocks, default_block_size>>>(
+                num_block_rows, num_block_cols, source->get_stride(), bs,
+                as_hip_type(source->get_const_values()), result);
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL);
@@ -417,7 +440,7 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
             hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N,
                           orig->get_size()[0], orig->get_size()[1], &alpha,
                           orig->get_const_values(), orig->get_stride(), &beta,
-                          orig->get_const_values(), trans->get_size()[1],
+                          trans->get_const_values(), trans->get_stride(),
                           trans->get_values(), trans->get_stride());
         }
     } else {
@@ -442,7 +465,7 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
             hipblas::geam(handle, HIPBLAS_OP_C, HIPBLAS_OP_N,
                           orig->get_size()[0], orig->get_size()[1], &alpha,
                           orig->get_const_values(), orig->get_stride(), &beta,
-                          orig->get_const_values(), trans->get_size()[1],
+                          trans->get_values(), trans->get_stride(),
                           trans->get_values(), trans->get_stride());
         }
     } else {
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index 2486a666ab0..a56b9b96d13 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -246,9 +246,7 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
      */
     const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
-        components::fill_array(exec, c->get_values(),
-                               c->get_num_stored_elements(),
-                               zero<OutputValueType>());
+        dense::fill(exec, c, zero<OutputValueType>());
     }
     select_abstract_spmv(
         compiled_kernels(),
diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp
index 33ed4a08e2e..bf707859b02 100644
--- a/hip/matrix/fbcsr_kernels.hip.cpp
+++ b/hip/matrix/fbcsr_kernels.hip.cpp
@@ -54,15 +54,31 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "common/unified/base/kernel_launch.hpp"
+#include "core/base/block_sizes.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
+#include "core/matrix/dense_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/config.hip.hpp"
+#include "hip/base/hipblas_bindings.hip.hpp"
+#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "hip/base/hipsparse_block_bindings.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/pointer_mode_guard.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/atomic.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/merging.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
 
 namespace gko {
 namespace kernels {
 namespace hip {
+
+
 /**
  * @brief The fixed-size block compressed sparse row matrix format namespace.
  *
@@ -74,25 +90,154 @@ namespace fbcsr {
 constexpr int default_block_size{512};
 
 
+#include "common/cuda_hip/matrix/csr_common.hpp.inc"
 #include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc"
 
 
+namespace {
+
+
+template <typename ValueType>
+void dense_transpose(std::shared_ptr<const HipExecutor> exec,
+                     const size_type nrows, const size_type ncols,
+                     const size_type orig_stride, const ValueType* const orig,
+                     const size_type trans_stride, ValueType* const trans)
+{
+    if (nrows == 0) {
+        return;
+    }
+    if (hipblas::is_supported<ValueType>::value) {
+        auto handle = exec->get_hipblas_handle();
+        {
+            hipblas::pointer_mode_guard pm_guard(handle);
+            auto alpha = one<ValueType>();
+            auto beta = zero<ValueType>();
+            hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, nrows, ncols,
+                          &alpha, orig, orig_stride, &beta, trans, trans_stride,
+                          trans, trans_stride);
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+
+}  // namespace
+
+
 template <typename ValueType, typename IndexType>
 void spmv(std::shared_ptr<const HipExecutor> exec,
-          const matrix::Fbcsr<ValueType, IndexType>* a,
-          const matrix::Dense<ValueType>* b,
-          matrix::Dense<ValueType>* c) GKO_NOT_IMPLEMENTED;
+          const matrix::Fbcsr<ValueType, IndexType>* const a,
+          const matrix::Dense<ValueType>* const b,
+          matrix::Dense<ValueType>* const c)
+{
+    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
+        // empty output: nothing to do
+        return;
+    }
+    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
+        // empty input: fill output with zero
+        dense::fill(exec, c, zero<ValueType>());
+        return;
+    }
+    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_hipsparse_handle();
+        hipsparse::pointer_mode_guard pm_guard(handle);
+        const auto alpha = one<ValueType>();
+        const auto beta = zero<ValueType>();
+        auto descr = hipsparse::create_mat_descr();
+        const auto row_ptrs = a->get_const_row_ptrs();
+        const auto col_idxs = a->get_const_col_idxs();
+        const auto values = a->get_const_values();
+        const int bs = a->get_block_size();
+        const IndexType mb = a->get_num_block_rows();
+        const IndexType nb = a->get_num_block_cols();
+        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
+        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
+        const auto nrows = a->get_size()[0];
+        const auto ncols = a->get_size()[1];
+        const auto in_stride = b->get_stride();
+        const auto out_stride = c->get_stride();
+        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
+            hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb,
+                             nnzb, &alpha, descr, values, row_ptrs, col_idxs,
+                             bs, b->get_const_values(), &beta, c->get_values());
+        } else {
+            const auto trans_stride = nrows;
+            auto trans_c = Array<ValueType>(exec, nrows * nrhs);
+            hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                             HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             &alpha, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), in_stride, &beta,
+                             trans_c.get_data(), trans_stride);
+            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
+                            out_stride, c->get_values());
+        }
+        hipsparse::destroy(descr);
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
-                   const matrix::Dense<ValueType>* alpha,
-                   const matrix::Fbcsr<ValueType, IndexType>* a,
-                   const matrix::Dense<ValueType>* b,
-                   const matrix::Dense<ValueType>* beta,
-                   matrix::Dense<ValueType>* c) GKO_NOT_IMPLEMENTED;
+                   const matrix::Dense<ValueType>* const alpha,
+                   const matrix::Fbcsr<ValueType, IndexType>* const a,
+                   const matrix::Dense<ValueType>* const b,
+                   const matrix::Dense<ValueType>* const beta,
+                   matrix::Dense<ValueType>* const c)
+{
+    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
+        // empty output: nothing to do
+        return;
+    }
+    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
+        // empty input: scale output
+        dense::scale(exec, beta, c);
+        return;
+    }
+    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_hipsparse_handle();
+        const auto alphp = alpha->get_const_values();
+        const auto betap = beta->get_const_values();
+        auto descr = hipsparse::create_mat_descr();
+        const auto row_ptrs = a->get_const_row_ptrs();
+        const auto col_idxs = a->get_const_col_idxs();
+        const auto values = a->get_const_values();
+        const int bs = a->get_block_size();
+        const IndexType mb = a->get_num_block_rows();
+        const IndexType nb = a->get_num_block_cols();
+        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
+        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
+        const auto nrows = a->get_size()[0];
+        const auto ncols = a->get_size()[1];
+        const auto in_stride = b->get_stride();
+        const auto out_stride = c->get_stride();
+        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
+            hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb,
+                             nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), betap, c->get_values());
+        } else {
+            const auto trans_stride = nrows;
+            auto trans_c = Array<ValueType>(exec, nrows * nrhs);
+            dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
+                            trans_stride, trans_c.get_data());
+            hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                             HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             alphp, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), in_stride, betap,
+                             trans_c.get_data(), trans_stride);
+            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
+                            out_stride, c->get_values());
+        }
+        hipsparse::destroy(descr);
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
@@ -101,7 +246,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void fill_in_dense(std::shared_ptr<const HipExecutor> exec,
                    const matrix::Fbcsr<ValueType, IndexType>* source,
-                   matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;
+                   matrix::Dense<ValueType>* result)
+{
+    constexpr auto warps_per_block = default_block_size / config::warp_size;
+    const auto num_blocks =
+        ceildiv(source->get_num_block_rows(), warps_per_block);
+    if (num_blocks > 0) {
+        kernel::fill_in_dense<<<num_blocks, default_block_size>>>(
+            source->get_const_row_ptrs(), source->get_const_col_idxs(),
+            as_hip_type(source->get_const_values()),
+            as_hip_type(result->get_values()), result->get_stride(),
+            source->get_num_block_rows(), source->get_block_size());
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
@@ -116,10 +273,8 @@ void convert_to_csr(const std::shared_ptr<const HipExecutor> exec,
     const auto num_blocks =
         ceildiv(source->get_num_block_rows(), warps_per_block);
     if (num_blocks > 0) {
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(kernel::convert_to_csr), num_blocks,
-            default_block_size, 0, 0, source->get_const_row_ptrs(),
-            source->get_const_col_idxs(),
+        kernel::convert_to_csr<<<num_blocks, default_block_size>>>(
+            source->get_const_row_ptrs(), source->get_const_col_idxs(),
             as_hip_type(source->get_const_values()), result->get_row_ptrs(),
             result->get_col_idxs(), as_hip_type(result->get_values()),
             source->get_num_block_rows(), source->get_block_size());
@@ -131,9 +286,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void transpose(std::shared_ptr<const HipExecutor> exec,
-               const matrix::Fbcsr<ValueType, IndexType>* orig,
-               matrix::Fbcsr<ValueType, IndexType>* trans) GKO_NOT_IMPLEMENTED;
+void transpose(const std::shared_ptr<const HipExecutor> exec,
+               const matrix::Fbcsr<ValueType, IndexType>* const orig,
+               matrix::Fbcsr<ValueType, IndexType>* const trans)
+    GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
@@ -152,8 +308,25 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void is_sorted_by_column_index(
     std::shared_ptr<const HipExecutor> exec,
-    const matrix::Fbcsr<ValueType, IndexType>* to_check,
-    bool* is_sorted) GKO_NOT_IMPLEMENTED;
+    const matrix::Fbcsr<ValueType, IndexType>* const to_check,
+    bool* const is_sorted)
+{
+    *is_sorted = true;
+    auto gpu_array = Array<bool>(exec, 1);
+    // need to initialize the GPU value to true
+    exec->copy_from(exec->get_master().get(), 1, is_sorted,
+                    gpu_array.get_data());
+    auto block_size = default_block_size;
+    const auto num_brows =
+        static_cast<IndexType>(to_check->get_num_block_rows());
+    const auto num_blocks = ceildiv(num_brows, block_size);
+    if (num_blocks > 0) {
+        kernel::check_unsorted<<<num_blocks, block_size>>>(
+            to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(),
+            num_brows, gpu_array.get_data());
+    }
+    *is_sorted = exec->copy_val_to_host(gpu_array.get_data());
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp
index 2f95d5a4da7..35ed7985ca2 100644
--- a/include/ginkgo/core/base/array.hpp
+++ b/include/ginkgo/core/base/array.hpp
@@ -755,6 +755,25 @@ Array<ValueType> make_array_view(std::shared_ptr<const Executor> exec,
 }
 
 
+/**
+ * Helper function to create a const array view deducing the value type.
+ *
+ * @param exec  the executor on which the array resides
+ * @param size  the number of elements for the array
+ * @param data  the pointer to the array we create a view on.
+ *
+ * @tparam ValueType  the type of the array elements
+ *
+ * @return `Array<ValueType>::const_view(exec, size, data)`
+ */
+template <typename ValueType>
+detail::ConstArrayView<ValueType> make_const_array_view(
+    std::shared_ptr<const Executor> exec, size_type size, const ValueType* data)
+{
+    return Array<ValueType>::const_view(exec, size, data);
+}
+
+
 namespace detail {
 
 
diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp
index bcaba840fdf..20479295ad7 100644
--- a/include/ginkgo/core/matrix/dense.hpp
+++ b/include/ginkgo/core/matrix/dense.hpp
@@ -855,46 +855,12 @@ class Dense
      * real with a reinterpret_cast with twice the number of columns and
      * double the stride.
      */
-    std::unique_ptr<Dense<remove_complex<ValueType>>> create_real_view()
-    {
-        const auto num_rows = this->get_size()[0];
-        const bool complex = is_complex<ValueType>();
-        const auto num_cols =
-            complex ? 2 * this->get_size()[1] : this->get_size()[1];
-        const auto stride =
-            complex ? 2 * this->get_stride() : this->get_stride();
-
-        return Dense<remove_complex<ValueType>>::create(
-            this->get_executor(), dim<2>{num_rows, num_cols},
-            Array<remove_complex<ValueType>>::view(
-                this->get_executor(), num_rows * stride,
-                reinterpret_cast<remove_complex<ValueType>*>(
-                    this->get_values())),
-            stride);
-    }
+    std::unique_ptr<real_type> create_real_view();
 
     /**
      * @copydoc create_real_view()
      */
-    std::unique_ptr<const Dense<remove_complex<ValueType>>> create_real_view()
-        const
-    {
-        const auto num_rows = this->get_size()[0];
-        const bool complex = is_complex<ValueType>();
-        const auto num_cols =
-            complex ? 2 * this->get_size()[1] : this->get_size()[1];
-        const auto stride =
-            complex ? 2 * this->get_stride() : this->get_stride();
-
-        return Dense<remove_complex<ValueType>>::create(
-            this->get_executor(), dim<2>{num_rows, num_cols},
-            Array<remove_complex<ValueType>>::view(
-                this->get_executor(), num_rows * stride,
-                const_cast<remove_complex<ValueType>*>(
-                    reinterpret_cast<const remove_complex<ValueType>*>(
-                        this->get_const_values()))),
-            stride);
-    }
+    std::unique_ptr<const real_type> create_real_view() const;
 
     /**
      * Creates a constant (immutable) Dense matrix from a constant array.
@@ -1102,24 +1068,8 @@ class Dense
      *        instead of create_submatrix(const span, const span, const
      *        size_type).
      */
-    virtual std::unique_ptr<Dense> create_submatrix_impl(const span& rows,
-                                                         const span& columns,
-                                                         const size_type stride)
-    {
-        row_major_range range_this{this->get_values(), this->get_size()[0],
-                                   this->get_size()[1], this->get_stride()};
-        auto range_result = range_this(rows, columns);
-        // TODO: can result in HUGE padding - which will be copied with the
-        // vector
-        return Dense::create(
-            this->get_executor(),
-            dim<2>{range_result.length(0), range_result.length(1)},
-            Array<ValueType>::view(
-                this->get_executor(),
-                range_result.length(0) * range_this.length(1) - columns.begin,
-                range_result->data),
-            stride);
-    }
+    virtual std::unique_ptr<Dense> create_submatrix_impl(
+        const span& rows, const span& columns, const size_type stride);
 
     void apply_impl(const LinOp* b, LinOp* x) const override;
 
diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp
index 8f0b597686d..90183ec37df 100644
--- a/omp/factorization/par_ilut_kernels.cpp
+++ b/omp/factorization/par_ilut_kernels.cpp
@@ -136,9 +136,9 @@ void abstract_filter(std::shared_ptr<const DefaultExecutor> exec,
         matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
         coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
         coo_builder.get_col_idx_array() =
-            Array<IndexType>::view(exec, new_nnz, new_col_idxs);
+            make_array_view(exec, new_nnz, new_col_idxs);
         coo_builder.get_value_array() =
-            Array<ValueType>::view(exec, new_nnz, new_vals);
+            make_array_view(exec, new_nnz, new_vals);
         new_row_idxs = m_out_coo->get_row_idxs();
     }
 
diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp
index ce647f79935..ea01355cffa 100644
--- a/omp/matrix/fbcsr_kernels.cpp
+++ b/omp/matrix/fbcsr_kernels.cpp
@@ -86,9 +86,10 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
 
 #pragma omp parallel for
     for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) {
-        for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs;
-             ++i) {
-            c->get_values()[i] = zero<ValueType>();
+        for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) {
+            for (IndexType rhs = 0; rhs < nvecs; rhs++) {
+                c->at(row, rhs) = zero<ValueType>();
+            }
         }
         for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1];
              ++inz) {
@@ -130,10 +131,11 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
 
 #pragma omp parallel for
     for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) {
-        for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs;
-             ++i)
-            c->get_values()[i] *= vbeta;
-
+        for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) {
+            for (IndexType rhs = 0; rhs < nvecs; rhs++) {
+                c->at(row, rhs) *= vbeta;
+            }
+        }
         for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1];
              ++inz) {
             for (int ib = 0; ib < bs; ib++) {
@@ -164,7 +166,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     components::soa_to_aos(exec, data, block_ordered);
     const auto in_nnz = data.get_num_elems();
     auto block_ordered_ptr = block_ordered.get_data();
-    std::stable_sort(
+    std::sort(
         block_ordered_ptr, block_ordered_ptr + in_nnz,
         [block_size](auto a, auto b) {
             return std::make_tuple(a.row / block_size, a.column / block_size) <
@@ -248,7 +250,43 @@ template <typename ValueType, typename IndexType>
 void convert_to_csr(const std::shared_ptr<const OmpExecutor> exec,
                     const matrix::Fbcsr<ValueType, IndexType>* const source,
                     matrix::Csr<ValueType, IndexType>* const result)
-    GKO_NOT_IMPLEMENTED;
+{
+    const auto nbrows = source->get_num_block_rows();
+    const auto bs = source->get_block_size();
+    const auto block_row_ptrs = source->get_const_row_ptrs();
+    const auto block_col_idxs = source->get_const_col_idxs();
+    const auto row_ptrs = result->get_row_ptrs();
+    const auto col_idxs = result->get_col_idxs();
+    const auto vals = result->get_values();
+    auto sizes =
+        gko::to_std_array<acc::size_type>(block_row_ptrs[nbrows], bs, bs);
+    const auto block_vals =
+        acc::range<acc::block_col_major<const ValueType, 3>>(
+            sizes, source->get_const_values());
+#pragma omp parallel for
+    for (IndexType block_row = 0; block_row < nbrows; block_row++) {
+        const auto block_row_begin = block_row_ptrs[block_row];
+        const auto block_row_end = block_row_ptrs[block_row + 1];
+        const auto block_row_size = block_row_end - block_row_begin;
+        const auto row_size = block_row_size * bs;
+        const auto row_base = block_row_begin * bs * bs;
+        for (int local_row = 0; local_row < bs; local_row++) {
+            const auto row = block_row * bs + local_row;
+            row_ptrs[row] = row_base + row_size * local_row;
+            for (auto block = block_row_begin; block < block_row_end; block++) {
+                const auto block_base =
+                    row_ptrs[row] + bs * (block - block_row_begin);
+                for (int local_col = 0; local_col < bs; local_col++) {
+                    const auto col = block_col_idxs[block] * bs + local_col;
+                    const auto out_idx = block_base + local_col;
+                    col_idxs[out_idx] = col;
+                    vals[out_idx] = block_vals(block, local_row, local_col);
+                }
+            }
+        }
+    }
+    row_ptrs[result->get_size()[0]] = source->get_num_stored_elements();
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp
index 56cebc91366..691de096660 100644
--- a/reference/factorization/par_ilut_kernels.cpp
+++ b/reference/factorization/par_ilut_kernels.cpp
@@ -137,9 +137,9 @@ void abstract_filter(std::shared_ptr<const DefaultExecutor> exec,
         matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
         coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
         coo_builder.get_col_idx_array() =
-            Array<IndexType>::view(exec, new_nnz, new_col_idxs);
+            make_array_view(exec, new_nnz, new_col_idxs);
         coo_builder.get_value_array() =
-            Array<ValueType>::view(exec, new_nnz, new_vals);
+            make_array_view(exec, new_nnz, new_vals);
         new_row_idxs = m_out_coo->get_row_idxs();
     }
 
diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp
index 9b093a97b68..deca89b976a 100644
--- a/reference/matrix/fbcsr_kernels.cpp
+++ b/reference/matrix/fbcsr_kernels.cpp
@@ -85,9 +85,10 @@ void spmv(const std::shared_ptr<const ReferenceExecutor>,
         to_std_array<acc::size_type>(nbnz, bs, bs), a->get_const_values()};
 
     for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) {
-        for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs;
-             ++i) {
-            c->get_values()[i] = zero<ValueType>();
+        for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) {
+            for (IndexType rhs = 0; rhs < nvecs; rhs++) {
+                c->at(row, rhs) = zero<ValueType>();
+            }
         }
         for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1];
              ++inz) {
@@ -128,9 +129,11 @@ void advanced_spmv(const std::shared_ptr<const ReferenceExecutor>,
         to_std_array<acc::size_type>(nbnz, bs, bs), a->get_const_values()};
 
     for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) {
-        for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs;
-             ++i)
-            c->get_values()[i] *= vbeta;
+        for (IndexType row = ibrow * bs; row < (ibrow + 1) * bs; ++row) {
+            for (IndexType rhs = 0; rhs < nvecs; rhs++) {
+                c->at(row, rhs) *= vbeta;
+            }
+        }
 
         for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1];
              ++inz) {
diff --git a/reference/test/components/precision_conversion_kernels.cpp b/reference/test/components/precision_conversion_kernels.cpp
index a82b9a2ae5c..4ff17753098 100644
--- a/reference/test/components/precision_conversion_kernels.cpp
+++ b/reference/test/components/precision_conversion_kernels.cpp
@@ -128,7 +128,7 @@ TEST_F(PrecisionConversion, ConvertsRealFromView)
     gko::Array<double> tmp{ref};
     gko::Array<float> out{ref};
 
-    tmp = gko::Array<float>::view(ref, vals.get_num_elems(), vals.get_data());
+    tmp = gko::make_array_view(ref, vals.get_num_elems(), vals.get_data());
     out = tmp;
 
     GKO_ASSERT_ARRAY_EQ(vals, out);
diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp
index bdb0f30059d..40773359229 100644
--- a/reference/test/matrix/permutation.cpp
+++ b/reference/test/matrix/permutation.cpp
@@ -81,8 +81,7 @@ TYPED_TEST(Permutation, AppliesRowPermutationToDense)
     i_type rdata[] = {1, 0};
 
     auto perm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{2},
-        gko::Array<i_type>::view(this->exec, 2, rdata));
+        this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata));
 
     perm->apply(x.get(), y.get());
     // clang-format off
@@ -108,8 +107,7 @@ TYPED_TEST(Permutation, AppliesColPermutationToDense)
     i_type rdata[] = {1, 0};
 
     auto perm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{2},
-        gko::Array<i_type>::view(this->exec, 2, rdata),
+        this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata),
         gko::matrix::column_permute);
 
     perm->apply(x.get(), y.get());
@@ -138,11 +136,9 @@ TYPED_TEST(Permutation, AppliesRowAndColPermutationToDense)
     i_type rdata[] = {1, 0};
 
     auto rperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{2},
-        gko::Array<i_type>::view(this->exec, 2, rdata));
+        this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata));
     auto cperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{2},
-        gko::Array<i_type>::view(this->exec, 2, cdata),
+        this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, cdata),
         gko::matrix::column_permute);
 
     rperm->apply(x.get(), y1.get());
@@ -170,8 +166,7 @@ TYPED_TEST(Permutation, AppliesRowAndColPermutationToDenseWithOneArray)
     i_type data[] = {1, 0};
 
     auto perm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{2},
-        gko::Array<i_type>::view(this->exec, 2, data),
+        this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, data),
         gko::matrix::row_permute | gko::matrix::column_permute);
 
     perm->apply(x.get(), y1.get());
@@ -200,12 +195,10 @@ TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDense)
     i_type rdata[] = {1, 2, 0};
 
     auto rperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, rdata),
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata),
         gko::matrix::row_permute | gko::matrix::inverse_permute);
     auto cperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, cdata),
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata),
         gko::matrix::inverse_permute | gko::matrix::column_permute);
 
     rperm->apply(x.get(), y1.get());
@@ -234,8 +227,7 @@ TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDenseWithOneArray)
     i_type data[] = {1, 2, 0};
 
     auto perm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, data),
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, data),
         gko::matrix::column_permute | gko::matrix::row_permute |
             gko::matrix::inverse_permute);
 
@@ -264,8 +256,7 @@ TYPED_TEST(Permutation, AppliesInverseRowPermutationToDense)
     i_type rdata[] = {1, 2, 0};
 
     auto rperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, rdata),
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata),
         gko::matrix::row_permute | gko::matrix::inverse_permute);
 
     rperm->apply(x.get(), y.get());
@@ -293,8 +284,7 @@ TYPED_TEST(Permutation, AppliesInverseColPermutationToDense)
     i_type cdata[] = {1, 2, 0};
 
     auto cperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, cdata),
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata),
         gko::matrix::inverse_permute | gko::matrix::column_permute);
 
     cperm->apply(x.get(), y.get());
@@ -323,8 +313,7 @@ TYPED_TEST(Permutation, AppliesRowPermutationToCsr)
     i_type rdata[] = {1, 2, 0};
 
     auto perm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, rdata));
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata));
 
     perm->apply(x.get(), y.get());
     // clang-format off
@@ -352,8 +341,7 @@ TYPED_TEST(Permutation, AppliesColPermutationToCsr)
     i_type cdata[] = {1, 2, 0};
 
     auto perm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, cdata),
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata),
         gko::matrix::column_permute);
 
     perm->apply(x.get(), y.get());
@@ -384,11 +372,9 @@ TYPED_TEST(Permutation, AppliesRowAndColPermutationToCsr)
     i_type rdata[] = {1, 2, 0};
 
     auto rperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, rdata));
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata));
     auto cperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, cdata),
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata),
         gko::matrix::column_permute);
 
     rperm->apply(x.get(), y1.get());
@@ -417,8 +403,7 @@ TYPED_TEST(Permutation, AppliesInverseRowPermutationToCsr)
     i_type rdata[] = {1, 2, 0};
 
     auto rperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, rdata),
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata),
         gko::matrix::row_permute | gko::matrix::inverse_permute);
 
     rperm->apply(x.get(), y.get());
@@ -446,8 +431,7 @@ TYPED_TEST(Permutation, AppliesInverseColPermutationToCsr)
     i_type cdata[] = {1, 2, 0};
 
     auto cperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, cdata),
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata),
         gko::matrix::inverse_permute | gko::matrix::column_permute);
 
     cperm->apply(x.get(), y.get());
@@ -477,12 +461,10 @@ TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToCsr)
     i_type rdata[] = {1, 2, 0};
 
     auto rperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, rdata),
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata),
         gko::matrix::row_permute | gko::matrix::inverse_permute);
     auto cperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3},
-        gko::Array<i_type>::view(this->exec, 3, cdata),
+        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata),
         gko::matrix::inverse_permute | gko::matrix::column_permute);
 
     rperm->apply(x.get(), y1.get());
diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp
index c87874776c3..30563438c65 100644
--- a/test/matrix/matrix.cpp
+++ b/test/matrix/matrix.cpp
@@ -81,6 +81,8 @@ struct SimpleMatrixTest {
     {}
 
     static void check_property(const std::unique_ptr<matrix_type>&) {}
+
+    static bool supports_strides() { return true; }
 };
 
 struct DenseWithDefaultStride
@@ -200,36 +202,33 @@ struct CsrWithAutomaticalStrategy
 struct Ell : SimpleMatrixTest<gko::matrix::Ell<matrix_value_type, int>> {};
 
 
-struct FbcsrBlocksize1
-    : SimpleMatrixTest<gko::matrix::Fbcsr<matrix_value_type, int>> {
+template <int block_size>
+struct Fbcsr : SimpleMatrixTest<gko::matrix::Fbcsr<matrix_value_type, int>> {
     static bool preserves_zeros() { return false; }
 
     static std::unique_ptr<matrix_type> create(
         std::shared_ptr<gko::Executor> exec, gko::dim<2> size)
     {
-        return matrix_type::create(exec, size, 0, 1);
+        size[0] = gko::ceildiv(size[0], block_size) * block_size;
+        size[1] = gko::ceildiv(size[1], block_size) * block_size;
+        return matrix_type::create(exec, size, 0, block_size);
     }
 
     static void check_property(const std::unique_ptr<matrix_type>& mtx)
     {
-        ASSERT_EQ(mtx->get_block_size(), 1);
+        ASSERT_EQ(mtx->get_block_size(), block_size);
     }
-};
-
-struct FbcsrBlocksize2
-    : SimpleMatrixTest<gko::matrix::Fbcsr<matrix_value_type, int>> {
-    static bool preserves_zeros() { return false; }
 
-    static std::unique_ptr<matrix_type> create(
-        std::shared_ptr<gko::Executor> exec, gko::dim<2> size)
+    static void modify_data(gko::matrix_data<matrix_value_type, int>& data)
     {
-        return matrix_type::create(exec, size, 0, 2);
+        data.size[0] = gko::ceildiv(data.size[0], block_size) * block_size;
+        data.size[1] = gko::ceildiv(data.size[1], block_size) * block_size;
     }
 
-    static void check_property(const std::unique_ptr<matrix_type>& mtx)
-    {
-        ASSERT_EQ(mtx->get_block_size(), 2);
-    }
+#ifdef GKO_COMPILING_HIP
+    // Fbcsr support in rocSPARSE is buggy w.r.t. strides
+    static bool supports_strides() { return false; }
+#endif
 };
 
 
@@ -440,12 +439,11 @@ class Matrix : public ::testing::Test {
     }
 
     template <typename VecType = Vec>
-    test_pair<VecType> gen_in_vec(const test_pair<Mtx>& mtx, int nrhs,
-                                  int stride)
+    test_pair<VecType> gen_in_vec(const test_pair<Mtx>& mtx, int nrhs)
     {
         auto size = gko::dim<2>{mtx.ref->get_size()[1],
                                 static_cast<gko::size_type>(nrhs)};
-        auto result = VecType::create(ref, size, stride);
+        auto result = VecType::create(ref, size);
         result->read(gen_dense_data<typename VecType::value_type,
                                     typename Mtx::index_type>(size));
         return {std::move(result), exec};
@@ -466,12 +464,11 @@ class Matrix : public ::testing::Test {
     }
 
     template <typename VecType = Vec>
-    test_pair<VecType> gen_out_vec(const test_pair<Mtx>& mtx, int nrhs,
-                                   int stride)
+    test_pair<VecType> gen_out_vec(const test_pair<Mtx>& mtx, int nrhs)
     {
         auto size = gko::dim<2>{mtx.ref->get_size()[0],
                                 static_cast<gko::size_type>(nrhs)};
-        auto result = VecType::create(ref, size, stride);
+        auto result = VecType::create(ref, size);
         result->read(gen_dense_data<typename VecType::value_type,
                                     typename Mtx::index_type>(size));
         return {std::move(result), exec};
@@ -484,9 +481,10 @@ class Matrix : public ::testing::Test {
     template <typename TestFunction>
     void forall_matrix_data_scenarios(TestFunction fn)
     {
-        auto guarded_fn = [&](auto mtx) {
+        auto guarded_fn = [&](auto data) {
             try {
-                fn(std::move(mtx));
+                Config::modify_data(data);
+                fn(std::move(data));
             } catch (std::exception& e) {
                 FAIL() << e.what();
             }
@@ -507,6 +505,14 @@ class Matrix : public ::testing::Test {
             SCOPED_TRACE("Zero matrix (200x100)");
             guarded_fn(gen_mtx_data(200, 100, 0, 0));
         }
+        {
+            SCOPED_TRACE("Small Sparse Matrix with variable row nnz (10x10)");
+            guarded_fn(gen_mtx_data(10, 10, 1, 5));
+        }
+        {
+            SCOPED_TRACE("Small Dense Matrix (10x10)");
+            guarded_fn(gen_mtx_data(10, 10, 10, 10));
+        }
         {
             SCOPED_TRACE("Sparse Matrix with some zeros rows (200x100)");
             guarded_fn(gen_mtx_data(200, 100, 0, 50));
@@ -567,6 +573,34 @@ class Matrix : public ::testing::Test {
         });
     }
 
+    template <typename VecType = Vec, typename MtxType, typename TestFunction>
+    void run_strided(const test_pair<MtxType>& mtx, int rhs, int in_stride,
+                     int out_stride, TestFunction fn)
+    {
+        // create slightly bigger vectors
+        auto in_padded = gen_in_vec<VecType>(mtx, in_stride);
+        auto out_padded = gen_out_vec<VecType>(mtx, out_stride);
+        const auto in_rows = gko::span(0, mtx.ref->get_size()[1]);
+        const auto out_rows = gko::span(0, mtx.ref->get_size()[0]);
+        const auto cols = gko::span(0, rhs);
+        const auto out_pad_cols = gko::span(rhs, out_stride);
+        // create views of the padding and in/out vectors
+        auto out_padding = test_pair<VecType>{
+            out_padded.ref->create_submatrix(out_rows, out_pad_cols),
+            out_padded.dev->create_submatrix(out_rows, out_pad_cols)};
+        auto orig_padding = out_padding.ref->clone();
+        auto in =
+            test_pair<VecType>{in_padded.ref->create_submatrix(in_rows, cols),
+                               in_padded.dev->create_submatrix(in_rows, cols)};
+        auto out = test_pair<VecType>{
+            out_padded.ref->create_submatrix(out_rows, cols),
+            out_padded.dev->create_submatrix(out_rows, cols)};
+        fn(std::move(in), std::move(out));
+        // check that padding was unmodified
+        GKO_ASSERT_MTX_NEAR(out_padding.ref, orig_padding, 0.0);
+        GKO_ASSERT_MTX_NEAR(out_padding.dev, orig_padding, 0.0);
+    }
+
     template <typename VecType = Vec, typename MtxType, typename TestFunction>
     void forall_vector_scenarios(const test_pair<MtxType>& mtx, TestFunction fn)
     {
@@ -579,53 +613,48 @@ class Matrix : public ::testing::Test {
         };
         {
             SCOPED_TRACE("Multivector with 0 columns");
-            guarded_fn(gen_in_vec<VecType>(mtx, 0, 0),
-                       gen_out_vec<VecType>(mtx, 0, 0));
+            guarded_fn(gen_in_vec<VecType>(mtx, 0),
+                       gen_out_vec<VecType>(mtx, 0));
         }
         {
             SCOPED_TRACE("Single vector");
-            guarded_fn(gen_in_vec<VecType>(mtx, 1, 1),
-                       gen_out_vec<VecType>(mtx, 1, 1));
+            guarded_fn(gen_in_vec<VecType>(mtx, 1),
+                       gen_out_vec<VecType>(mtx, 1));
         }
-        {
+        if (Config::supports_strides()) {
             SCOPED_TRACE("Single strided vector");
-            guarded_fn(gen_in_vec<VecType>(mtx, 1, 2),
-                       gen_out_vec<VecType>(mtx, 1, 3));
+            run_strided(mtx, 1, 2, 3, guarded_fn);
         }
         if (!gko::is_complex<value_type>()) {
             // check application of real matrix to complex vector
             // viewed as interleaved real/imag vector
             using complex_vec = gko::to_complex<VecType>;
-            {
+            if (Config::supports_strides()) {
                 SCOPED_TRACE("Single strided complex vector");
-                guarded_fn(gen_in_vec<complex_vec>(mtx, 1, 2),
-                           gen_out_vec<complex_vec>(mtx, 1, 3));
+                run_strided<complex_vec>(mtx, 1, 2, 3, guarded_fn);
             }
-            {
+            if (Config::supports_strides()) {
                 SCOPED_TRACE("Strided complex multivector with 2 columns");
-                guarded_fn(gen_in_vec<complex_vec>(mtx, 2, 3),
-                           gen_out_vec<complex_vec>(mtx, 2, 4));
+                run_strided<complex_vec>(mtx, 2, 3, 4, guarded_fn);
             }
         }
         {
             SCOPED_TRACE("Multivector with 2 columns");
-            guarded_fn(gen_in_vec<VecType>(mtx, 2, 2),
-                       gen_out_vec<VecType>(mtx, 2, 2));
+            guarded_fn(gen_in_vec<VecType>(mtx, 2),
+                       gen_out_vec<VecType>(mtx, 2));
         }
-        {
+        if (Config::supports_strides()) {
             SCOPED_TRACE("Strided multivector with 2 columns");
-            guarded_fn(gen_in_vec<VecType>(mtx, 2, 3),
-                       gen_out_vec<VecType>(mtx, 2, 4));
+            run_strided(mtx, 2, 3, 4, guarded_fn);
         }
         {
             SCOPED_TRACE("Multivector with 40 columns");
-            guarded_fn(gen_in_vec<VecType>(mtx, 40, 40),
-                       gen_out_vec<VecType>(mtx, 40, 40));
+            guarded_fn(gen_in_vec<VecType>(mtx, 40),
+                       gen_out_vec<VecType>(mtx, 40));
         }
-        {
+        if (Config::supports_strides()) {
             SCOPED_TRACE("Strided multivector with 40 columns");
-            guarded_fn(gen_in_vec<VecType>(mtx, 40, 43),
-                       gen_out_vec<VecType>(mtx, 40, 45));
+            run_strided(mtx, 40, 43, 45, guarded_fn);
         }
     }
 
@@ -637,18 +666,21 @@ class Matrix : public ::testing::Test {
 
 using MatrixTypes = ::testing::Types<
     DenseWithDefaultStride, DenseWithCustomStride, Coo, CsrWithDefaultStrategy,
-    // The strategies have issues with zero rows
-    /*
-    #if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \
-        defined(GKO_COMPILING_DPCPP)
-        CsrWithClassicalStrategy, CsrWithMergePathStrategy,
-        CsrWithSparselibStrategy, CsrWithLoadBalanceStrategy,
-        CsrWithAutomaticalStrategy,
-    #endif
-    */
+#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \
+    defined(GKO_COMPILING_DPCPP)
+    CsrWithClassicalStrategy, CsrWithMergePathStrategy,
+    CsrWithSparselibStrategy, CsrWithLoadBalanceStrategy,
+    CsrWithAutomaticalStrategy,
+#endif
     Ell,
-    // Fbcsr is slightly broken
-    /*FbcsrBlocksize1, FbcsrBlocksize2,*/
+#ifdef GKO_COMPILING_OMP
+    // CUDA doesn't allow blocksize 1
+    Fbcsr<1>,
+#endif
+#if defined(GKO_COMPILING_OMP) || defined(GKO_COMPILING_CUDA) || \
+    defined(GKO_COMPILING_HIP)
+    Fbcsr<2>, Fbcsr<3>,
+#endif
     SellpDefaultParameters, Sellp32Factor2, HybridDefaultStrategy,
     HybridColumnLimitStrategy, HybridImbalanceLimitStrategy,
     HybridImbalanceBoundedLimitStrategy, HybridMinStorageStrategy,
@@ -771,13 +803,30 @@ TYPED_TEST(Matrix, ConvertToDenseIsEquivalentToRef)
     using Mtx = typename TestFixture::Mtx;
     using Dense = gko::matrix::Dense<typename Mtx::value_type>;
     this->forall_matrix_scenarios([&](auto mtx) {
-        auto ref_result = Dense::create(this->ref);
-        auto dev_result = Dense::create(this->exec);
+        const auto size = mtx.ref->get_size();
+        const auto stride = size[1] + 5;
+        const auto padded_size = gko::dim<2>{size[0], stride};
+        auto ref_padded = Dense::create(this->ref, padded_size);
+        auto dev_padded = Dense::create(this->exec, padded_size);
+        ref_padded->fill(12345);
+        dev_padded->fill(12345);
+        const auto rows = gko::span{0, size[0]};
+        const auto cols = gko::span{0, size[1]};
+        const auto pad_cols = gko::span{size[1], stride};
+        auto ref_result = ref_padded->create_submatrix(rows, cols);
+        auto dev_result = dev_padded->create_submatrix(rows, cols);
+        auto ref_padding = ref_padded->create_submatrix(rows, pad_cols);
+        auto dev_padding = dev_padded->create_submatrix(rows, pad_cols);
+        auto orig_padding = ref_padding->clone();
 
         mtx.ref->convert_to(ref_result.get());
         mtx.dev->convert_to(dev_result.get());
 
         GKO_ASSERT_MTX_NEAR(ref_result, dev_result, 0.0);
+        ASSERT_EQ(ref_result->get_stride(), stride);
+        ASSERT_EQ(dev_result->get_stride(), stride);
+        GKO_ASSERT_MTX_NEAR(ref_padding, orig_padding, 0.0);
+        GKO_ASSERT_MTX_NEAR(dev_padding, orig_padding, 0.0);
     });
 }
 
@@ -788,10 +837,13 @@ TYPED_TEST(Matrix, ConvertFromDenseIsEquivalentToRef)
     using Mtx = typename TestFixture::Mtx;
     using Dense = gko::matrix::Dense<typename Mtx::value_type>;
     this->forall_matrix_data_scenarios([&](auto data) {
-        auto ref_src = Dense::create(this->ref);
-        auto dev_src = Dense::create(this->exec);
+        const auto stride = data.size[0] + 2;
+        auto ref_src = Dense::create(this->ref, data.size, stride);
+        auto dev_src = Dense::create(this->exec, data.size, stride);
         ref_src->read(data);
         dev_src->read(data);
+        ASSERT_EQ(ref_src->get_stride(), stride);
+        ASSERT_EQ(dev_src->get_stride(), stride);
         auto ref_result = TestConfig::create(this->ref, data.size);
         auto dev_result = TestConfig::create(this->exec, data.size);
 
@@ -813,7 +865,6 @@ TYPED_TEST(Matrix, ReadWriteRoundtrip)
         auto new_mtx = TestConfig::create(this->exec, data.size);
         gko::matrix_data<value_type, index_type> out_data;
 
-        TestConfig::modify_data(data);
         new_mtx->read(data);
         new_mtx->write(out_data);