diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
new file mode 100644
index 00000000000..f18d0aafc88
--- /dev/null
+++ b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
@@ -0,0 +1,110 @@
+/*************************************************************
+Copyright (c) 2017-2022, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+namespace kernel {
+
+
+template <size_type subwarp_size, typename MatrixValueType,
+          typename input_accessor, typename output_accessor,
+          typename IndexType, typename Closure>
+__device__ void device_classical_spmv(const size_type num_rows,
+                                      const MatrixValueType* __restrict__ val,
+                                      const IndexType* __restrict__ col_idxs,
+                                      const IndexType* __restrict__ row_ptrs,
+                                      acc::range<input_accessor> b,
+                                      acc::range<output_accessor> c,
+                                      Closure scale)
+{
+    using arithmetic_type = typename output_accessor::arithmetic_type;
+    auto subwarp_tile =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const auto subrow = thread::get_subwarp_num_flat<subwarp_size, IndexType>();
+    const auto subid = subwarp_tile.thread_rank();
+    const IndexType column_id = blockIdx.y;
+    const arithmetic_type value = val[0];
+    auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    for (; row < num_rows; row += subrow) {
+        const auto ind_end = row_ptrs[row + 1];
+        arithmetic_type temp_val = zero<arithmetic_type>();
+        for (auto ind = row_ptrs[row] + subid; ind < ind_end;
+             ind += subwarp_size) {
+            temp_val += value * b(col_idxs[ind], column_id);
+        }
+        auto subwarp_result =
+            reduce(subwarp_tile, temp_val,
+                   [](const arithmetic_type& a, const arithmetic_type& b) {
+                       return a + b;
+                   });
+        if (subid == 0) {
+            c(row, column_id) = scale(subwarp_result, c(row, column_id));
+        }
+    }
+}
+
+
+template <size_type subwarp_size, typename MatrixValueType,
+          typename input_accessor, typename output_accessor,
+          typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
+    const size_type num_rows, const MatrixValueType* __restrict__ val,
+    const IndexType* __restrict__ col_idxs,
+    const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
+    acc::range<output_accessor> c)
+{
+    using type = typename output_accessor::arithmetic_type;
+    device_classical_spmv<subwarp_size>(
+        num_rows, val, col_idxs, row_ptrs, b, c,
+        [](const type& x, const type& y) { return x; });
+}
+
+
+template <size_type subwarp_size, typename MatrixValueType,
+          typename input_accessor, typename output_accessor,
+          typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
+    const size_type num_rows, const MatrixValueType* __restrict__ alpha,
+    const MatrixValueType* __restrict__ val,
+    const IndexType* __restrict__ col_idxs,
+    const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
+    const typename output_accessor::storage_type* __restrict__ beta,
+    acc::range<output_accessor> c)
+{
+    using type = typename output_accessor::arithmetic_type;
+    const type alpha_val = alpha[0];
+    const type beta_val = beta[0];
+    device_classical_spmv<subwarp_size>(
+        num_rows, val, col_idxs, row_ptrs, b, c,
+        [&alpha_val, &beta_val](const type& x, const type& y) {
+            return alpha_val * x + beta_val * y;
+        });
+}
+
+
+} // namespace kernel
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 2f7e7e9fe2a..5ffb7aad173 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -452,8 +452,9 @@ GKO_STUB_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
 namespace sparsity_csr {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
+GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_COUNT_NUM_DIAGONAL_ELEMENTS_KERNEL);
diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp
index 6ef49c6f1fc..6e3ecc5c319 100644
--- a/core/matrix/sparsity_csr.cpp
+++ b/core/matrix/sparsity_csr.cpp
@@ -76,7 +76,7 @@ template <typename ValueType, typename IndexType>
 void SparsityCsr<ValueType, IndexType>::apply_impl(const LinOp* b,
                                                    LinOp* x) const
 {
-    precision_dispatch_real_complex<ValueType>(
+
mixed_precision_dispatch_real_complex( [this](auto dense_b, auto dense_x) { this->get_executor()->run( sparsity_csr::make_spmv(this, dense_b, dense_x)); @@ -91,12 +91,15 @@ void SparsityCsr::apply_impl(const LinOp* alpha, const LinOp* beta, LinOp* x) const { - precision_dispatch_real_complex( - [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { + mixed_precision_dispatch_real_complex( + [this, alpha, beta](auto dense_b, auto dense_x) { + auto dense_alpha = make_temporary_conversion(alpha); + auto dense_beta = make_temporary_conversion< + typename std::decay_t::value_type>(beta); this->get_executor()->run(sparsity_csr::make_advanced_spmv( - dense_alpha, this, dense_b, dense_beta, dense_x)); + dense_alpha.get(), this, dense_b, dense_beta.get(), dense_x)); }, - alpha, b, beta, x); + b, x); } diff --git a/core/matrix/sparsity_csr_kernels.hpp b/core/matrix/sparsity_csr_kernels.hpp index 5bb81585230..6933b61313d 100644 --- a/core/matrix/sparsity_csr_kernels.hpp +++ b/core/matrix/sparsity_csr_kernels.hpp @@ -48,18 +48,22 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(ValueType, IndexType) \ - void spmv(std::shared_ptr exec, \ - const matrix::SparsityCsr* a, \ - const matrix::Dense* b, matrix::Dense* c) - -#define GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType) \ - void advanced_spmv(std::shared_ptr exec, \ - const matrix::Dense* alpha, \ - const matrix::SparsityCsr* a, \ - const matrix::Dense* b, \ - const matrix::Dense* beta, \ - matrix::Dense* c) +#define GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(MatrixValueType, InputValueType, \ + OutputValueType, IndexType) \ + void spmv(std::shared_ptr exec, \ + const matrix::SparsityCsr* a, \ + const matrix::Dense* b, \ + matrix::Dense* c) + +#define GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL( \ + MatrixValueType, InputValueType, OutputValueType, IndexType) \ + void advanced_spmv( \ + std::shared_ptr exec, \ + const matrix::Dense* alpha, \ + const matrix::SparsityCsr* a, \ + const matrix::Dense* b, \ + const matrix::Dense* beta, \ + matrix::Dense* c) #define GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType) \ void fill_in_dense(std::shared_ptr exec, \ @@ -98,10 +102,14 @@ namespace kernels { bool* is_sorted) #define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL(MatrixValueType, InputValueType, \ + OutputValueType, IndexType); \ + template \ + GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL( \ + MatrixValueType, InputValueType, OutputValueType, IndexType); \ template \ GKO_DECLARE_SPARSITY_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \ template \ diff --git a/core/multigrid/amgx_pgm.cpp b/core/multigrid/amgx_pgm.cpp index 4233e45a2ff..dfc25749a95 100644 --- a/core/multigrid/amgx_pgm.cpp +++ b/core/multigrid/amgx_pgm.cpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "core/base/utils.hpp" @@ -161,6 +162,16 @@ void AmgxPgm::generate() one())); // TODO: implement the restrict_op from aggregation. 
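The hunk below builds the AMGX-PGM restriction operator as a SparsityCsr that shares the transposed prolongation's pattern: only row pointers and column indices are copied, and the single stored value of 1 stands in for a full value array. A minimal sketch of the same pattern-copy idea, assuming a hypothetical helper `pattern_of` and an existing Csr matrix `csr` (neither is part of the patch):

```cpp
#include <ginkgo/ginkgo.hpp>

// Hypothetical helper: build a SparsityCsr mirroring the sparsity pattern of
// an existing Csr matrix. Mirrors the copy_from calls used in the hunk below.
template <typename ValueType, typename IndexType>
std::shared_ptr<gko::matrix::SparsityCsr<ValueType, IndexType>> pattern_of(
    std::shared_ptr<const gko::Executor> exec,
    const gko::matrix::Csr<ValueType, IndexType>* csr)
{
    auto pattern =
        gko::share(gko::matrix::SparsityCsr<ValueType, IndexType>::create(
            exec, csr->get_size(), csr->get_num_stored_elements()));
    // copy row pointers and column indices; the single stored value stays 1
    exec->copy_from(exec.get(), csr->get_size()[0] + 1,
                    csr->get_const_row_ptrs(), pattern->get_row_ptrs());
    exec->copy_from(exec.get(), csr->get_num_stored_elements(),
                    csr->get_const_col_idxs(), pattern->get_col_idxs());
    return pattern;
}
```

Since an aggregation-based restriction has only unit entries, storing the pattern alone loses no information while saving one value array of size nnz and enabling the cheaper SparsityCsr SpMV.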
auto restrict_op = gko::as(share(prolong_csr->transpose())); + auto restrict_sparsity = + share(matrix::SparsityCsr::create( + exec, restrict_op->get_size(), + restrict_op->get_num_stored_elements())); + exec->copy_from(exec.get(), static_cast(num_agg + 1), + restrict_op->get_const_row_ptrs(), + restrict_sparsity->get_row_ptrs()); + exec->copy_from(exec.get(), restrict_op->get_num_stored_elements(), + restrict_op->get_const_col_idxs(), + restrict_sparsity->get_col_idxs()); // Construct the coarse matrix // TODO: use less memory footprint to improve it @@ -170,7 +181,8 @@ void AmgxPgm::generate() amgxpgm_op->apply(prolong_csr.get(), tmp.get()); restrict_op->apply(tmp.get(), coarse_matrix.get()); - this->set_multigrid_level(prolong_row_gather, coarse_matrix, restrict_op); + this->set_multigrid_level(prolong_row_gather, coarse_matrix, + restrict_sparsity); } diff --git a/core/test/matrix/fbcsr_sample.hpp b/core/test/matrix/fbcsr_sample.hpp index f8b09fa701c..0cb410c2bd7 100644 --- a/core/test/matrix/fbcsr_sample.hpp +++ b/core/test/matrix/fbcsr_sample.hpp @@ -114,10 +114,11 @@ class FbcsrSample { static_cast(bs)}, v); - if (mtx->get_size()[0] % bs != 0) + if (mtx->get_size()[0] % bs != 0) { throw gko::BadDimension(__FILE__, __LINE__, __func__, "test fbcsr", mtx->get_size()[0], mtx->get_size()[1], "block size does not divide the size!"); + } for (index_type ibrow = 0; ibrow < mtx->get_num_block_rows(); ibrow++) { const index_type* const browptr = mtx->get_row_ptrs(); @@ -125,11 +126,13 @@ class FbcsrSample { inz++) { const index_type bcolind = mtx->get_col_idxs()[inz]; const value_type base = (ibrow + 1) * (bcolind + 1); - for (int ival = 0; ival < bs; ival++) - for (int jval = 0; jval < bs; jval++) + for (int ival = 0; ival < bs; ival++) { + for (int jval = 0; jval < bs; jval++) { vals(inz, ival, jval) = base + static_cast>( ival * bs + jval); + } + } } } @@ -175,10 +178,12 @@ class FbcsrSample { gko::Array colids(exec, nbnz); gko::Array rowptrs(exec, nbrows + 1); const std::unique_ptr fbmat = generate_fbcsr(); - for (index_type i = 0; i < nbrows + 1; i++) + for (index_type i = 0; i < nbrows + 1; i++) { rowptrs.get_data()[i] = fbmat->get_const_row_ptrs()[i]; - for (index_type i = 0; i < nbnz; i++) + } + for (index_type i = 0; i < nbnz; i++) { colids.get_data()[i] = fbmat->get_const_col_idxs()[i]; + } return SparCsr::create(exec, gko::dim<2>{nbrows, nbcols}, colids, rowptrs); } @@ -302,7 +307,9 @@ class FbcsrSample2 { gko::Array c(exec, {0, 0, 3, 2}); gko::Array vals(exec, nnz); value_type* const v = vals.get_data(); - for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset; + for (IndexType i = 0; i < nnz; i++) { + v[i] = 0.15 + fbcsr_test_offset; + } v[0] = 1; v[1] = 3; @@ -328,7 +335,9 @@ class FbcsrSample2 { exec, {0, 1, 0, 1, 0, 1, 6, 7, 0, 1, 6, 7, 4, 5, 4, 5}); gko::Array vals(exec, nnz); value_type* const v = vals.get_data(); - for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset; + for (IndexType i = 0; i < nnz; i++) { + v[i] = 0.15 + fbcsr_test_offset; + } v[0] = 1; v[1] = 2; v[2] = 3; @@ -401,7 +410,9 @@ class FbcsrSampleSquare { gko::Array r(exec, {0, 1, 2}); gko::Array vals(exec, nnz); value_type* const v = vals.get_data(); - for (IndexType i = 0; i < nnz; i++) v[i] = i; + for (IndexType i = 0; i < nnz; i++) { + v[i] = i; + } return Fbcsr::create(exec, gko::dim<2>{static_cast(nrows), @@ -447,7 +458,9 @@ class FbcsrSampleComplex { gko::Array c(exec, {0, 0, 3, 2}); gko::Array vals(exec, nnz); value_type* const v = vals.get_data(); - for (IndexType i = 0; i 
< nnz; i++) v[i] = 0.15 + fbcsr_test_offset; + for (IndexType i = 0; i < nnz; i++) { + v[i] = 0.15 + fbcsr_test_offset; + } using namespace std::complex_literals; v[0] = 1.0 + 1.15i; @@ -474,7 +487,9 @@ class FbcsrSampleComplex { exec, {0, 1, 0, 1, 0, 1, 6, 7, 0, 1, 6, 7, 4, 5, 4, 5}); gko::Array vals(exec, nnz); value_type* const v = vals.get_data(); - for (IndexType i = 0; i < nnz; i++) v[i] = 0.15 + fbcsr_test_offset; + for (IndexType i = 0; i < nnz; i++) { + v[i] = 0.15 + fbcsr_test_offset; + } using namespace std::complex_literals; v[0] = 1.0 + 1.15i; diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu index 30a778572ad..71ac5d9c664 100644 --- a/cuda/matrix/sparsity_csr_kernels.cu +++ b/cuda/matrix/sparsity_csr_kernels.cu @@ -36,6 +36,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "accessor/cuda_helper.hpp" +#include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/math.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" + + namespace gko { namespace kernels { namespace cuda { @@ -47,25 +60,115 @@ namespace cuda { namespace sparsity_csr { -template +constexpr int classical_overweight = 32; +constexpr int spmv_block_size = 128; +constexpr int warps_in_block = 4; + + +using classical_kernels = syn::value_list; + + +#include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc" + + +namespace host_kernel { + + +template +void classical_spmv(syn::value_list, + std::shared_ptr exec, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) +{ + using arithmetic_type = + highest_precision; + using input_accessor = + gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; + using output_accessor = + gko::acc::reduced_row_major<2, arithmetic_type, OutputValueType>; + + const auto nwarps = exec->get_num_warps_per_sm() * + exec->get_num_multiprocessor() * classical_overweight; + const auto gridx = + std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), + int64(nwarps / warps_in_block)); + const dim3 grid(gridx, b->get_size()[1]); + const auto block = spmv_block_size; + + const auto b_vals = gko::acc::range( + std::array{ + {static_cast(b->get_size()[0]), + static_cast(b->get_size()[1])}}, + b->get_const_values(), + std::array{ + {static_cast(b->get_stride())}}); + auto c_vals = gko::acc::range( + std::array{ + {static_cast(c->get_size()[0]), + static_cast(c->get_size()[1])}}, + c->get_values(), + std::array{ + {static_cast(c->get_stride())}}); + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (alpha == nullptr && beta == nullptr) { + kernel::abstract_classical_spmv<<>>( + a->get_size()[0], as_cuda_type(a->get_const_value()), + a->get_const_col_idxs(), as_cuda_type(a->get_const_row_ptrs()), + acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals)); + } else if (alpha != nullptr && beta != nullptr) { + kernel::abstract_classical_spmv<<>>( + a->get_size()[0], as_cuda_type(alpha->get_const_values()), + as_cuda_type(a->get_const_value()), a->get_const_col_idxs(), + as_cuda_type(a->get_const_row_ptrs()), 
acc::as_cuda_range(b_vals), + as_cuda_type(beta->get_const_values()), acc::as_cuda_range(c_vals)); + } else { + GKO_KERNEL_NOT_FOUND; + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); + + +} // namespace host_kernel + +template void spmv(std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c) +{ + host_kernel::select_classical_spmv( + classical_kernels(), [](int compiled_info) { return true; }, + syn::value_list(), syn::type_list<>(), exec, a, b, c); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + const matrix::Dense* alpha, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + const matrix::Dense* beta, + matrix::Dense* c) +{ + host_kernel::select_classical_spmv( + classical_kernels(), [](int compiled_info) { return true; }, + syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, beta); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); diff --git a/cuda/test/matrix/CMakeLists.txt b/cuda/test/matrix/CMakeLists.txt index be2ebd7ef03..0c0442f61e0 100644 --- a/cuda/test/matrix/CMakeLists.txt +++ b/cuda/test/matrix/CMakeLists.txt @@ -1,9 +1,9 @@ ginkgo_create_test(coo_kernels) ginkgo_create_test(csr_kernels) -ginkgo_create_test(fbcsr_kernels) ginkgo_create_test(dense_kernels) ginkgo_create_test(diagonal_kernels) ginkgo_create_test(ell_kernels) +ginkgo_create_test(fbcsr_kernels) ginkgo_create_test(fft_kernels) ginkgo_create_test(hybrid_kernels) ginkgo_create_test(sellp_kernels) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 2a7120ffa0b..97c01e0fac5 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -237,11 +237,14 @@ void simple_apply(std::shared_ptr exec, matrix::Dense* c) { using namespace oneapi::mkl; - oneapi::mkl::blas::row_major::gemm( - *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], one(), - a->get_const_values(), a->get_stride(), b->get_const_values(), - b->get_stride(), zero(), c->get_values(), c->get_stride()); + if (b->get_stride() != 0 && c->get_stride() != 0) { + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, transpose::nontrans, + c->get_size()[0], c->get_size()[1], a->get_size()[1], + one(), a->get_const_values(), a->get_stride(), + b->get_const_values(), b->get_stride(), zero(), + c->get_values(), c->get_stride()); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); @@ -254,13 +257,15 @@ void apply(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { using namespace oneapi::mkl; - oneapi::mkl::blas::row_major::gemm( - *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], - exec->copy_val_to_host(alpha->get_const_values()), - a->get_const_values(), a->get_stride(), b->get_const_values(), - b->get_stride(), exec->copy_val_to_host(beta->get_const_values()), - c->get_values(), 
c->get_stride()); + if (b->get_stride() != 0 && c->get_stride() != 0) { + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, transpose::nontrans, + c->get_size()[0], c->get_size()[1], a->get_size()[1], + exec->copy_val_to_host(alpha->get_const_values()), + a->get_const_values(), a->get_stride(), b->get_const_values(), + b->get_stride(), exec->copy_val_to_host(beta->get_const_values()), + c->get_values(), c->get_stride()); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp index 81ffcb87cd3..61610538a1c 100644 --- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp +++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp @@ -33,9 +33,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/sparsity_csr_kernels.hpp" +#include + + #include +#include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/uninitialized_array.hpp" + + namespace gko { namespace kernels { namespace dpcpp { @@ -47,25 +61,232 @@ namespace dpcpp { namespace sparsity_csr { -template +constexpr int classical_overweight = 32; +constexpr int spmv_block_size = 128; + + +using classical_kernels = syn::value_list; + + +namespace kernel { + + +template +void device_classical_spmv(const size_type num_rows, + const MatrixValueType* __restrict__ val, + const IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ row_ptrs, + acc::range b, + acc::range c, Closure scale, + sycl::nd_item<3> item_ct1) +{ + using arithmetic_type = typename output_accessor::arithmetic_type; + auto subgroup_tile = group::tiled_partition( + group::this_thread_block(item_ct1)); + const auto subrow = thread::get_subwarp_num_flat(item_ct1); + const auto subid = subgroup_tile.thread_rank(); + const IndexType column_id = item_ct1.get_group(1); + const arithmetic_type value = static_cast(val[0]); + auto row = thread::get_subwarp_id_flat(item_ct1); + for (; row < num_rows; row += subrow) { + const auto ind_end = row_ptrs[row + 1]; + arithmetic_type temp_val = zero(); + for (auto ind = row_ptrs[row] + subid; ind < ind_end; + ind += subgroup_size) { + temp_val += value * b(col_idxs[ind], column_id); + } + auto subgroup_result = ::gko::kernels::dpcpp::reduce( + subgroup_tile, temp_val, + [](const arithmetic_type& a, const arithmetic_type& b) { + return a + b; + }); + // TODO: check the barrier + subgroup_tile.sync(); + if (subid == 0) { + c(row, column_id) = scale(subgroup_result, c(row, column_id)); + } + } +} + + +template +void abstract_classical_spmv(const size_type num_rows, + const MatrixValueType* __restrict__ val, + const IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ row_ptrs, + acc::range b, + acc::range c, + sycl::nd_item<3> item_ct1) +{ + using type = typename output_accessor::arithmetic_type; + device_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, c, + [](const type& x, const type& y) { return x; }, item_ct1); +} + +template +void abstract_classical_spmv( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const size_type num_rows, const MatrixValueType* val, + const IndexType* col_idxs, const 
IndexType* row_ptrs, + acc::range b, acc::range c) +{ + // only subgroup = 1, so does not need sycl::reqd_sub_group_size + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + abstract_classical_spmv(num_rows, val, col_idxs, + row_ptrs, b, c, item_ct1); + }); +} + + +template +void abstract_classical_spmv( + const size_type num_rows, const MatrixValueType* __restrict__ alpha, + const MatrixValueType* __restrict__ val, + const IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ row_ptrs, acc::range b, + const typename output_accessor::storage_type* __restrict__ beta, + acc::range c, sycl::nd_item<3> item_ct1) +{ + using type = typename output_accessor::arithmetic_type; + const type alpha_val = static_cast(alpha[0]); + const type beta_val = static_cast(beta[0]); + device_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, c, + [&alpha_val, &beta_val](const type& x, const type& y) { + return alpha_val * x + beta_val * y; + }, + item_ct1); +} + +template +void abstract_classical_spmv( + dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, + const size_type num_rows, const MatrixValueType* alpha, + const MatrixValueType* val, const IndexType* col_idxs, + const IndexType* row_ptrs, acc::range b, + const typename output_accessor::storage_type* beta, + acc::range c) +{ + // only subgroup = 1, so does not need sycl::reqd_sub_group_size + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, beta, c, item_ct1); + }); +} + + +} // namespace kernel + + +namespace host_kernel { + + +template +void classical_spmv(syn::value_list, + std::shared_ptr exec, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) +{ + using arithmetic_type = + highest_precision; + using input_accessor = + gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; + using output_accessor = + gko::acc::reduced_row_major<2, arithmetic_type, OutputValueType>; + constexpr int threads_per_cu = 7; + const auto num_subgroup = + exec->get_num_computing_units() * threads_per_cu * classical_overweight; + const auto nsg_in_group = spmv_block_size / subgroup_size; + const auto gridx = + std::min(ceildiv(a->get_size()[0], spmv_block_size / subgroup_size), + int64(num_subgroup / nsg_in_group)); + const dim3 grid(gridx, b->get_size()[1]); + const auto block = spmv_block_size; + + const auto b_vals = gko::acc::range( + std::array{ + {static_cast(b->get_size()[0]), + static_cast(b->get_size()[1])}}, + b->get_const_values(), + std::array{ + {static_cast(b->get_stride())}}); + auto c_vals = gko::acc::range( + std::array{ + {static_cast(c->get_size()[0]), + static_cast(c->get_size()[1])}}, + c->get_values(), + std::array{ + {static_cast(c->get_stride())}}); + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (alpha == nullptr && beta == nullptr) { + kernel::abstract_classical_spmv( + grid, block, 0, exec->get_queue(), a->get_size()[0], + a->get_const_value(), a->get_const_col_idxs(), + a->get_const_row_ptrs(), b_vals, c_vals); + } else if (alpha != nullptr && beta != nullptr) { + kernel::abstract_classical_spmv( + grid, block, 0, exec->get_queue(), a->get_size()[0], + alpha->get_const_values(), a->get_const_value(), + a->get_const_col_idxs(), a->get_const_row_ptrs(), b_vals, + beta->get_const_values(), 
c_vals); + } else { + GKO_KERNEL_NOT_FOUND; + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); + + +} // namespace host_kernel + + +template void spmv(std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c) +{ + host_kernel::select_classical_spmv( + classical_kernels(), [](int compiled_info) { return true; }, + syn::value_list(), syn::type_list<>(), exec, a, b, c); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + const matrix::Dense* alpha, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + const matrix::Dense* beta, + matrix::Dense* c) +{ + host_kernel::select_classical_spmv( + classical_kernels(), [](int compiled_info) { return true; }, + syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, beta); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp index f7e4d7643e2..68daf706fbc 100644 --- a/hip/matrix/sparsity_csr_kernels.hip.cpp +++ b/hip/matrix/sparsity_csr_kernels.hip.cpp @@ -33,9 +33,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/sparsity_csr_kernels.hpp" +#include + + #include +#include "accessor/hip_helper.hpp" +#include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" + + namespace gko { namespace kernels { namespace hip { @@ -47,25 +63,119 @@ namespace hip { namespace sparsity_csr { -template +constexpr int classical_overweight = 32; +constexpr int spmv_block_size = 256; +constexpr int warps_in_block = 4; + + +using classical_kernels = syn::value_list; + + +#include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc" + + +namespace host_kernel { + + +template +void classical_spmv(syn::value_list, + std::shared_ptr exec, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c, + const matrix::Dense* alpha = nullptr, + const matrix::Dense* beta = nullptr) +{ + using arithmetic_type = + highest_precision; + using input_accessor = + gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; + using output_accessor = + gko::acc::reduced_row_major<2, arithmetic_type, OutputValueType>; + + const auto nwarps = exec->get_num_warps_per_sm() * + exec->get_num_multiprocessor() * classical_overweight; + const auto gridx = + std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), + int64(nwarps / warps_in_block)); + const dim3 grid(gridx, b->get_size()[1]); + const auto block = spmv_block_size; + + const auto b_vals = gko::acc::range( + std::array{ + {static_cast(b->get_size()[0]), + 
static_cast(b->get_size()[1])}}, + b->get_const_values(), + std::array{ + {static_cast(b->get_stride())}}); + auto c_vals = gko::acc::range( + std::array{ + {static_cast(c->get_size()[0]), + static_cast(c->get_size()[1])}}, + c->get_values(), + std::array{ + {static_cast(c->get_stride())}}); + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (alpha == nullptr && beta == nullptr) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::abstract_classical_spmv), + grid, block, 0, 0, a->get_size()[0], + as_hip_type(a->get_const_value()), a->get_const_col_idxs(), + as_hip_type(a->get_const_row_ptrs()), acc::as_hip_range(b_vals), + acc::as_hip_range(c_vals)); + } else if (alpha != nullptr && beta != nullptr) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::abstract_classical_spmv), + grid, block, 0, 0, a->get_size()[0], + as_hip_type(alpha->get_const_values()), + as_hip_type(a->get_const_value()), a->get_const_col_idxs(), + as_hip_type(a->get_const_row_ptrs()), acc::as_hip_range(b_vals), + as_hip_type(beta->get_const_values()), acc::as_hip_range(c_vals)); + } else { + GKO_KERNEL_NOT_FOUND; + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); + + +} // namespace host_kernel + +template void spmv(std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c) +{ + host_kernel::select_classical_spmv( + classical_kernels(), [](int compiled_info) { return true; }, + syn::value_list(), syn::type_list<>(), exec, a, b, c); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + const matrix::Dense* alpha, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + const matrix::Dense* beta, + matrix::Dense* c) +{ + host_kernel::select_classical_spmv( + classical_kernels(), [](int compiled_info) { return true; }, + syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, beta); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); diff --git a/include/ginkgo/core/matrix/sparsity_csr.hpp b/include/ginkgo/core/matrix/sparsity_csr.hpp index 07cf2c6e689..9c1c909c25a 100644 --- a/include/ginkgo/core/matrix/sparsity_csr.hpp +++ b/include/ginkgo/core/matrix/sparsity_csr.hpp @@ -253,7 +253,9 @@ class SparsityCsr col_idxs_(exec, num_nonzeros), row_ptrs_(exec, size[0] + 1), value_(exec, {one()}) - {} + { + row_ptrs_.fill(0); + } /** * Creates a SparsityCsr matrix from already allocated (and initialized) row diff --git a/omp/matrix/sparsity_csr_kernels.cpp b/omp/matrix/sparsity_csr_kernels.cpp index 3ed0bc507ce..92db204f534 100644 --- a/omp/matrix/sparsity_csr_kernels.cpp +++ b/omp/matrix/sparsity_csr_kernels.cpp @@ -46,7 +46,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
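The OpenMP and reference kernels in the following hunks share one accumulation scheme: every product is formed in `highest_precision<InputValueType, OutputValueType>` (the more precise of the two types) and rounded only once on the final store. A self-contained sketch of that scheme, using raw CSR arrays and a hypothetical `accumulate_row` helper rather than the kernels' real signatures:

```cpp
// ArithmeticType plays the role of highest_precision<InputValueType,
// OutputValueType>, e.g. double when InputValueType = float and
// OutputValueType = double.
template <typename InputValueType, typename OutputValueType,
          typename ArithmeticType>
void accumulate_row(const int* row_ptrs, const int* col_idxs, int row,
                    ArithmeticType val, const InputValueType* b,
                    OutputValueType* c)
{
    auto temp_val = ArithmeticType{};  // zero accumulator in wide precision
    for (auto k = row_ptrs[row]; k < row_ptrs[row + 1]; ++k) {
        // widen each input entry before multiplying, as the kernels below do
        temp_val += val * static_cast<ArithmeticType>(b[col_idxs[k]]);
    }
    // round once when writing back to the (possibly narrower) output type
    c[row] = static_cast<OutputValueType>(temp_val);
}
```

Accumulating in the wider type before the single narrowing store keeps the rounding error of a mixed-precision apply at one final rounding per output entry instead of one per nonzero.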
#include +#include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -61,64 +63,71 @@ namespace omp { namespace sparsity_csr { -template +template void spmv(std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, matrix::Dense* c) + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c) { + using arithmetic_type = + highest_precision; auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto val = a->get_const_value()[0]; + const auto val = static_cast(a->get_const_value()[0]); #pragma omp parallel for for (size_type row = 0; row < a->get_size()[0]; ++row) { for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) = zero(); - } - for (size_type k = row_ptrs[row]; - k < static_cast(row_ptrs[row + 1]); ++k) { - auto col = col_idxs[k]; - for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) += val * b->at(col, j); + auto temp_val = gko::zero(); + for (size_type k = row_ptrs[row]; + k < static_cast(row_ptrs[row + 1]); ++k) { + temp_val += + val * static_cast(b->at(col_idxs[k], j)); } + c->at(row, j) = static_cast(temp_val); } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) + const matrix::Dense* alpha, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + const matrix::Dense* beta, + matrix::Dense* c) { + using arithmetic_type = + highest_precision; auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto valpha = alpha->at(0, 0); - auto vbeta = beta->at(0, 0); - auto val = a->get_const_value()[0]; + const auto valpha = static_cast(alpha->at(0, 0)); + const auto vbeta = static_cast(beta->at(0, 0)); + const auto val = static_cast(a->get_const_value()[0]); #pragma omp parallel for for (size_type row = 0; row < a->get_size()[0]; ++row) { for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) *= vbeta; - } - for (size_type k = row_ptrs[row]; - k < static_cast(row_ptrs[row + 1]); ++k) { - auto col = col_idxs[k]; - for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) += valpha * val * b->at(col, j); + auto temp_val = gko::zero(); + for (size_type k = row_ptrs[row]; + k < static_cast(row_ptrs[row + 1]); ++k) { + temp_val += + val * static_cast(b->at(col_idxs[k], j)); } + c->at(row, j) = static_cast( + vbeta * static_cast(c->at(row, j)) + + valpha * temp_val); } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); diff --git a/omp/test/matrix/CMakeLists.txt b/omp/test/matrix/CMakeLists.txt index d2843b7e459..0c0442f61e0 100644 --- a/omp/test/matrix/CMakeLists.txt +++ b/omp/test/matrix/CMakeLists.txt @@ -7,4 +7,3 @@ ginkgo_create_test(fbcsr_kernels) ginkgo_create_test(fft_kernels) ginkgo_create_test(hybrid_kernels) ginkgo_create_test(sellp_kernels) -ginkgo_create_test(sparsity_csr_kernels) diff --git a/omp/test/matrix/sparsity_csr_kernels.cpp b/omp/test/matrix/sparsity_csr_kernels.cpp deleted file mode 100644 index 3075b12d483..00000000000 --- a/omp/test/matrix/sparsity_csr_kernels.cpp +++ /dev/null @@ 
-1,297 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include - - -#include -#include -#include - - -#include - - -#include -#include -#include - - -#include "core/matrix/sparsity_csr_kernels.hpp" -#include "core/test/utils.hpp" - - -namespace { - - -class SparsityCsr : public ::testing::Test { -protected: - using Mtx = gko::matrix::SparsityCsr<>; - using Vec = gko::matrix::Dense<>; - using ComplexVec = gko::matrix::Dense>; - using ComplexMtx = gko::matrix::SparsityCsr>; - - SparsityCsr() : mtx_size(532, 231), rand_engine(42) {} - - void SetUp() - { - ref = gko::ReferenceExecutor::create(); - omp = gko::OmpExecutor::create(); - } - - void TearDown() - { - if (omp != nullptr) { - ASSERT_NO_THROW(omp->synchronize()); - } - } - - template - std::unique_ptr gen_mtx(int num_rows, int num_cols, - int min_nnz_row) - { - return gko::test::generate_random_matrix( - num_rows, num_cols, - std::uniform_int_distribution<>(min_nnz_row, num_cols), - std::uniform_real_distribution<>(0.0, 1.0), rand_engine, ref); - } - - void set_up_apply_data(int num_vectors = 1) - { - mtx = gen_mtx(mtx_size[0], mtx_size[1], 1); - complex_mtx = gen_mtx(mtx_size[0], mtx_size[1], 1); - expected = gen_mtx(mtx_size[0], num_vectors, 1); - y = gen_mtx(mtx_size[1], num_vectors, 1); - alpha = gko::initialize({2.0}, ref); - beta = gko::initialize({-1.0}, ref); - dmtx = gko::clone(omp, mtx); - complex_dmtx = gko::clone(omp, complex_mtx); - dresult = gko::clone(omp, expected); - dy = gko::clone(omp, y); - dalpha = gko::clone(omp, alpha); - dbeta = gko::clone(omp, beta); - } - - struct matrix_pair { - std::unique_ptr ref; - std::unique_ptr omp; - }; - - matrix_pair gen_unsorted_mtx() - { - constexpr int min_nnz_per_row = 2; // Must be at least 2 - auto local_mtx_ref = - gen_mtx(mtx_size[0], mtx_size[1], min_nnz_per_row); - for (size_t row = 0; row < mtx_size[0]; ++row) { - const auto row_ptrs = local_mtx_ref->get_const_row_ptrs(); - const auto 
start_row = row_ptrs[row]; - auto col_idx = local_mtx_ref->get_col_idxs() + start_row; - const auto nnz_in_this_row = row_ptrs[row + 1] - row_ptrs[row]; - auto swap_idx_dist = - std::uniform_int_distribution<>(0, nnz_in_this_row - 1); - // shuffle `nnz_in_this_row / 2` times - for (size_t perm = 0; perm < nnz_in_this_row; perm += 2) { - const auto idx1 = swap_idx_dist(rand_engine); - const auto idx2 = swap_idx_dist(rand_engine); - std::swap(col_idx[idx1], col_idx[idx2]); - } - } - auto local_mtx_omp = gko::clone(omp, local_mtx_ref); - - return {std::move(local_mtx_ref), std::move(local_mtx_omp)}; - } - - std::shared_ptr ref; - std::shared_ptr omp; - - const gko::dim<2> mtx_size; - std::default_random_engine rand_engine; - - std::unique_ptr mtx; - std::unique_ptr complex_mtx; - std::unique_ptr expected; - std::unique_ptr y; - std::unique_ptr alpha; - std::unique_ptr beta; - - std::unique_ptr dmtx; - std::unique_ptr complex_dmtx; - std::unique_ptr dresult; - std::unique_ptr dy; - std::unique_ptr dalpha; - std::unique_ptr dbeta; -}; - - -TEST_F(SparsityCsr, SimpleApplyIsEquivalentToRef) -{ - set_up_apply_data(); - - mtx->apply(y.get(), expected.get()); - dmtx->apply(dy.get(), dresult.get()); - - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); -} - - -TEST_F(SparsityCsr, AdvancedApplyIsEquivalentToRef) -{ - set_up_apply_data(); - - mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); - dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); - - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); -} - - -TEST_F(SparsityCsr, SimpleApplyToDenseMatrixIsEquivalentToRef) -{ - set_up_apply_data(3); - - mtx->apply(y.get(), expected.get()); - dmtx->apply(dy.get(), dresult.get()); - - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); -} - - -TEST_F(SparsityCsr, AdvancedApplyToDenseMatrixIsEquivalentToRef) -{ - set_up_apply_data(3); - - mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); - dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); - - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); -} - - -TEST_F(SparsityCsr, TransposeIsEquivalentToRef) -{ - set_up_apply_data(); - - auto trans = mtx->transpose(); - auto d_trans = dmtx->transpose(); - - GKO_ASSERT_MTX_NEAR(static_cast(d_trans.get()), - static_cast(trans.get()), 0.0); -} - - -TEST_F(SparsityCsr, CountsNumberOfDiagElementsIsEqualToRef) -{ - set_up_apply_data(); - gko::size_type num_diags = 0; - gko::size_type d_num_diags = 0; - - gko::kernels::reference::sparsity_csr::count_num_diagonal_elements( - ref, mtx.get(), &num_diags); - gko::kernels::omp::sparsity_csr::count_num_diagonal_elements( - omp, dmtx.get(), &d_num_diags); - - ASSERT_EQ(d_num_diags, num_diags); -} - - -TEST_F(SparsityCsr, RemovesDiagElementsKernelIsEquivalentToRef) -{ - set_up_apply_data(); - gko::size_type num_diags = 0; - gko::kernels::reference::sparsity_csr::count_num_diagonal_elements( - ref, mtx.get(), &num_diags); - auto tmp = - Mtx::create(ref, mtx->get_size(), mtx->get_num_nonzeros() - num_diags); - auto d_tmp = Mtx::create(omp, dmtx->get_size(), - dmtx->get_num_nonzeros() - num_diags); - - gko::kernels::reference::sparsity_csr::remove_diagonal_elements( - ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), tmp.get()); - gko::kernels::omp::sparsity_csr::remove_diagonal_elements( - omp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), - d_tmp.get()); - - GKO_ASSERT_MTX_NEAR(tmp.get(), d_tmp.get(), 0.0); -} - - -TEST_F(SparsityCsr, RecognizeSortedMatrixIsEquivalentToRef) -{ - set_up_apply_data(); - bool is_sorted_omp{}; - bool 
is_sorted_ref{}; - - is_sorted_ref = mtx->is_sorted_by_column_index(); - is_sorted_omp = dmtx->is_sorted_by_column_index(); - - ASSERT_EQ(is_sorted_ref, is_sorted_omp); -} - - -TEST_F(SparsityCsr, RecognizeUnsortedMatrixIsEquivalentToRef) -{ - auto uns_mtx = gen_unsorted_mtx(); - bool is_sorted_omp{}; - bool is_sorted_ref{}; - - is_sorted_ref = uns_mtx.ref->is_sorted_by_column_index(); - is_sorted_omp = uns_mtx.omp->is_sorted_by_column_index(); - - ASSERT_EQ(is_sorted_ref, is_sorted_omp); -} - - -TEST_F(SparsityCsr, SortSortedMatrixIsEquivalentToRef) -{ - set_up_apply_data(); - - mtx->sort_by_column_index(); - dmtx->sort_by_column_index(); - - // Values must be unchanged, therefore, tolerance is `0` - GKO_ASSERT_MTX_NEAR(mtx, dmtx, 0); -} - - -TEST_F(SparsityCsr, SortUnsortedMatrixIsEquivalentToRef) -{ - auto uns_mtx = gen_unsorted_mtx(); - - uns_mtx.ref->sort_by_column_index(); - uns_mtx.omp->sort_by_column_index(); - - // Values must be unchanged, therefore, tolerance is `0` - GKO_ASSERT_MTX_NEAR(uns_mtx.ref, uns_mtx.omp, 0); -} - - -} // namespace diff --git a/reference/matrix/sparsity_csr_kernels.cpp b/reference/matrix/sparsity_csr_kernels.cpp index 71e63f18554..dfb7bc38bfc 100644 --- a/reference/matrix/sparsity_csr_kernels.cpp +++ b/reference/matrix/sparsity_csr_kernels.cpp @@ -43,7 +43,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -58,62 +60,70 @@ namespace reference { namespace sparsity_csr { -template +template void spmv(std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, matrix::Dense* c) + const matrix::SparsityCsr* a, + const matrix::Dense* b, + matrix::Dense* c) { + using arithmetic_type = + highest_precision; auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto val = a->get_const_value()[0]; + const auto val = static_cast(a->get_const_value()[0]); for (size_type row = 0; row < a->get_size()[0]; ++row) { for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) = zero(); - } - for (size_type k = row_ptrs[row]; - k < static_cast(row_ptrs[row + 1]); ++k) { - auto col = col_idxs[k]; - for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) += val * b->at(col, j); + auto temp_val = gko::zero(); + for (size_type k = row_ptrs[row]; + k < static_cast(row_ptrs[row + 1]); ++k) { + temp_val += + val * static_cast(b->at(col_idxs[k], j)); } + c->at(row, j) = static_cast(temp_val); } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) + const matrix::Dense* alpha, + const matrix::SparsityCsr* a, + const matrix::Dense* b, + const matrix::Dense* beta, + matrix::Dense* c) { + using arithmetic_type = + highest_precision; + auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto valpha = alpha->at(0, 0); - auto vbeta = beta->at(0, 0); - auto val = a->get_const_value()[0]; + const auto valpha = static_cast(alpha->at(0, 0)); + const auto vbeta = static_cast(beta->at(0, 0)); + const auto val = static_cast(a->get_const_value()[0]); for (size_type row = 0; 
row < a->get_size()[0]; ++row) { for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) *= vbeta; - } - for (size_type k = row_ptrs[row]; - k < static_cast(row_ptrs[row + 1]); ++k) { - auto col = col_idxs[k]; - for (size_type j = 0; j < c->get_size()[1]; ++j) { - c->at(row, j) += valpha * val * b->at(col, j); + auto temp_val = gko::zero(); + for (size_type k = row_ptrs[row]; + k < static_cast(row_ptrs[row + 1]); ++k) { + temp_val += + val * static_cast(b->at(col_idxs[k], j)); } + c->at(row, j) = static_cast( + vbeta * static_cast(c->at(row, j)) + + valpha * temp_val); } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); diff --git a/reference/test/multigrid/amgx_pgm_kernels.cpp b/reference/test/multigrid/amgx_pgm_kernels.cpp index 9dc075d79a3..6f71d43ab56 100644 --- a/reference/test/multigrid/amgx_pgm_kernels.cpp +++ b/reference/test/multigrid/amgx_pgm_kernels.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -68,6 +69,7 @@ class AmgxPgm : public ::testing::Test { typename std::tuple_element<1, decltype(ValueIndexType())>::type; using Mtx = gko::matrix::Csr; using Vec = gko::matrix::Dense; + using SparsityCsr = gko::matrix::SparsityCsr; using MgLevel = gko::multigrid::AmgxPgm; using RowGatherer = gko::matrix::RowGatherer; using VT = value_type; @@ -511,6 +513,7 @@ TYPED_TEST(AmgxPgm, GenerateMgLevel) using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; using Mtx = typename TestFixture::Mtx; + using SparsityCsr = typename TestFixture::SparsityCsr; using RowGatherer = typename TestFixture::RowGatherer; auto prolong_op = gko::share(Mtx::create(this->exec, gko::dim<2>{5, 2}, 0)); // 0-2-4, 1-3 @@ -526,7 +529,7 @@ TYPED_TEST(AmgxPgm, GenerateMgLevel) auto expected_row_gather = gko::Array(this->exec, {0, 1, 0, 1, 0}); - GKO_ASSERT_MTX_NEAR(gko::as(coarse_fine->get_restrict_op()), + GKO_ASSERT_MTX_NEAR(gko::as(coarse_fine->get_restrict_op()), restrict_op, r::value); GKO_ASSERT_MTX_NEAR(gko::as(coarse_fine->get_coarse_op()), this->coarse, r::value); @@ -539,6 +542,7 @@ TYPED_TEST(AmgxPgm, GenerateMgLevelOnUnsortedMatrix) using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; using Mtx = typename TestFixture::Mtx; + using SparsityCsr = typename TestFixture::SparsityCsr; using MgLevel = typename TestFixture::MgLevel; using RowGatherer = typename TestFixture::RowGatherer; auto mglevel_sort = MgLevel::build() @@ -572,7 +576,7 @@ TYPED_TEST(AmgxPgm, GenerateMgLevelOnUnsortedMatrix) auto expected_row_gather = gko::Array(this->exec, {0, 1, 0, 1, 0}); - GKO_ASSERT_MTX_NEAR(gko::as(coarse_fine->get_restrict_op()), + GKO_ASSERT_MTX_NEAR(gko::as(coarse_fine->get_restrict_op()), restrict_op, r::value); GKO_ASSERT_MTX_NEAR(gko::as(coarse_fine->get_coarse_op()), this->coarse, r::value); diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 951f1f1d3d4..f64b4d9673b 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include "core/test/utils.hpp" @@ -74,6 +75,11 @@ struct SimpleMatrixTest { return matrix_type::create(exec->get_master(), size); } + static void modify_data( + gko::matrix_data& data) + {} + static void check_property(const std::unique_ptr&) {} }; @@ -348,6 +354,19 @@ struct HybridAutomaticStrategy }; +struct SparsityCsr + : SimpleMatrixTest> { + static void modify_data(gko::matrix_data& data) + { + using entry_type = + gko::matrix_data::nonzero_type; + for (auto& entry : data.nonzeros) { + entry.value = gko::one(); + } + } +}; + + template struct test_pair { std::unique_ptr ref; @@ -633,7 +652,7 @@ using MatrixTypes = ::testing::Types< SellpDefaultParameters, Sellp32Factor2, HybridDefaultStrategy, HybridColumnLimitStrategy, HybridImbalanceLimitStrategy, HybridImbalanceBoundedLimitStrategy, HybridMinStorageStrategy, - HybridAutomaticStrategy>; + HybridAutomaticStrategy, SparsityCsr>; TYPED_TEST_SUITE(Matrix, MatrixTypes, TypenameNameGenerator); @@ -794,6 +813,7 @@ TYPED_TEST(Matrix, ReadWriteRoundtrip) auto new_mtx = TestConfig::create(this->exec, data.size); gko::matrix_data out_data; + TestConfig::modify_data(data); new_mtx->read(data); new_mtx->write(out_data); diff --git a/test/multigrid/amgx_pgm_kernels.cpp b/test/multigrid/amgx_pgm_kernels.cpp index 4694f0375ee..46e13b94d10 100644 --- a/test/multigrid/amgx_pgm_kernels.cpp +++ b/test/multigrid/amgx_pgm_kernels.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -71,6 +72,7 @@ class AmgxPgm : public ::testing::Test { using index_type = gko::int32; using Mtx = gko::matrix::Dense; using Csr = gko::matrix::Csr; + using SparsityCsr = gko::matrix::SparsityCsr; using RowGatherer = gko::matrix::RowGatherer; using Diag = gko::matrix::Diagonal; @@ -315,8 +317,8 @@ TEST_F(AmgxPgm, GenerateMgLevelIsEquivalentToRef) d_row_gatherer->get_executor(), d_row_gatherer->get_size()[0], d_row_gatherer->get_const_row_idxs()); - GKO_ASSERT_MTX_NEAR(gko::as(d_mg_level->get_restrict_op()), - gko::as(mg_level->get_restrict_op()), + GKO_ASSERT_MTX_NEAR(gko::as(d_mg_level->get_restrict_op()), + gko::as(mg_level->get_restrict_op()), r::value); GKO_ASSERT_MTX_NEAR(gko::as(d_mg_level->get_coarse_op()), gko::as(mg_level->get_coarse_op()), @@ -348,8 +350,8 @@ TEST_F(AmgxPgm, GenerateMgLevelIsEquivalentToRefOnUnsortedMatrix) d_row_gatherer->get_executor(), d_row_gatherer->get_size()[0], d_row_gatherer->get_const_row_idxs()); - GKO_ASSERT_MTX_NEAR(gko::as(d_mg_level->get_restrict_op()), - gko::as(mg_level->get_restrict_op()), + GKO_ASSERT_MTX_NEAR(gko::as(d_mg_level->get_restrict_op()), + gko::as(mg_level->get_restrict_op()), r::value); GKO_ASSERT_MTX_NEAR(gko::as(d_mg_level->get_coarse_op()), gko::as(mg_level->get_coarse_op()),
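Taken together, the patch lets a SparsityCsr apply to Dense operands whose precision differs from the matrix value type. A usage sketch under the assumption that this dispatch is in place (executor choice, sizes, and data are illustrative only):

```cpp
#include <ginkgo/ginkgo.hpp>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    // pattern-only matrix: every stored location has the implicit value 1
    auto mtx = gko::matrix::SparsityCsr<double, gko::int32>::create(exec);
    gko::matrix_data<double, gko::int32> data{
        {2, 2}, {{0, 0, 1.0}, {0, 1, 1.0}, {1, 1, 1.0}}};
    mtx->read(data);
    // single-precision input and output vectors
    auto b = gko::initialize<gko::matrix::Dense<float>>({1.0f, 2.0f}, exec);
    auto x = gko::matrix::Dense<float>::create(exec, gko::dim<2>{2, 1});
    // dispatches to the float-in/float-out instantiation of the new
    // mixed-precision spmv kernels instead of throwing a type mismatch
    mtx->apply(b.get(), x.get());
}
```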