diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index dccd16332e2..28685fdf232 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -186,13 +186,14 @@ function(ginkgo_create_common_test test_name)
         string(SUBSTRING ${exec} 0 1 exec_initial)
         string(SUBSTRING ${exec} 1 -1 exec_tail)
         string(TOUPPER ${exec_initial} exec_initial)
+        string(TOUPPER ${exec} exec_upper)
         set(exec_type ${exec_initial}${exec_tail}Executor)
         # set up actual test
         set(test_target_name ${test_target_name}_${exec})
         add_executable(${test_target_name} ${test_name}.cpp)
         target_compile_features(${test_target_name} PUBLIC cxx_std_14)
         target_compile_options(${test_target_name} PRIVATE ${GINKGO_COMPILER_FLAGS})
-        target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec})
+        target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec} GKO_COMPILING_${exec_upper})
         target_link_libraries(${test_target_name} PRIVATE ${common_test_ADDITIONAL_LIBRARIES})
         # use float for DPC++ if necessary
         if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE)
diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc
index 28eb7dd7332..c4cdc214e76 100644
--- a/common/cuda_hip/matrix/dense_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc
@@ -95,6 +95,35 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(
 }
 
 
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_sparsity_csr(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType* __restrict__ source, IndexType* __restrict__ row_ptrs,
+    IndexType* __restrict__ col_idxs)
+{
+    const auto row = thread::get_subwarp_id_flat<config::warp_size>();
+
+    if (row < num_rows) {
+        auto warp = group::tiled_partition<config::warp_size>(
+            group::this_thread_block());
+        auto lane_prefix_mask =
+            (config::lane_mask_type(1) << warp.thread_rank()) - 1;
+        auto base_out_idx = row_ptrs[row];
+        for (size_type i = 0; i < num_cols; i += config::warp_size) {
+            const auto col = i + warp.thread_rank();
+            const auto pred =
+                col < num_cols ? is_nonzero(source[stride * row + col]) : false;
+            const auto mask = warp.ballot(pred);
+            const auto out_idx = base_out_idx + popcnt(mask & lane_prefix_mask);
+            if (pred) {
+                col_idxs[out_idx] = col;
+            }
+            base_out_idx += popcnt(mask);
+        }
+    }
+}
+
+
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void fill_in_ell(
     size_type num_rows, size_type num_cols, size_type source_stride,
diff --git a/common/cuda_hip/matrix/hybrid_kernels.hpp.inc b/common/cuda_hip/matrix/hybrid_kernels.hpp.inc
deleted file mode 100644
index 04d52f6b0dc..00000000000
--- a/common/cuda_hip/matrix/hybrid_kernels.hpp.inc
+++ /dev/null
@@ -1,97 +0,0 @@
-/*************************************************************
-Copyright (c) 2017-2022, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. 
Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - - -namespace { - - -template -struct hybrid_tuple_unpack_functor { - using device_entry = device_type>; - device_entry __device__ - operator()(thrust::tuple e) const - { - return thrust::get<1>(e); - } -}; - - -} // anonymous namespace - - -template -void split_matrix_data( - std::shared_ptr exec, - const Array>& data, - const int64* row_ptrs, size_type ell_limit, size_type num_rows, - Array>& ell_data, - Array>& coo_data) -{ - using device_entry = device_type>; - auto iota = thrust::make_counting_iterator(size_type{}); - auto data_it = - thrust::device_pointer_cast(as_device_type(data.get_const_data())); - const auto nnz = data.get_num_elems(); - auto enumerated_data_it = - thrust::make_zip_iterator(thrust::make_tuple(iota, data_it)); - auto ell_predicate = [row_ptrs, ell_limit] __device__( - thrust::tuple e) { - const auto row_begin = row_ptrs[thrust::get<1>(e).row]; - const auto local_nz = thrust::get<0>(e) - row_begin; - return local_nz < ell_limit; - }; - auto coo_predicate = [row_ptrs, ell_limit] __device__( - thrust::tuple e) { - const auto row_begin = row_ptrs[thrust::get<1>(e).row]; - const auto local_nz = thrust::get<0>(e) - row_begin; - return local_nz >= ell_limit; - }; - const auto ell_nnz = static_cast( - thrust::count_if(thrust::device, enumerated_data_it, - enumerated_data_it + nnz, ell_predicate)); - const auto coo_nnz = nnz - ell_nnz; - ell_data.resize_and_reset(ell_nnz); - coo_data.resize_and_reset(coo_nnz); - auto ell_data_it = thrust::make_transform_output_iterator( - thrust::device_pointer_cast(as_device_type(ell_data.get_data())), - hybrid_tuple_unpack_functor{}); - auto coo_data_it = thrust::make_transform_output_iterator( - thrust::device_pointer_cast(as_device_type(coo_data.get_data())), - hybrid_tuple_unpack_functor{}); - thrust::copy_if(thrust::device, enumerated_data_it, - enumerated_data_it + nnz, ell_data_it, ell_predicate); - thrust::copy_if(thrust::device, enumerated_data_it, - enumerated_data_it + nnz, coo_data_it, coo_predicate); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); diff --git a/common/unified/matrix/hybrid_kernels.cpp b/common/unified/matrix/hybrid_kernels.cpp index 8f654e7d157..3ed8de8b1c5 100644 --- a/common/unified/matrix/hybrid_kernels.cpp +++ b/common/unified/matrix/hybrid_kernels.cpp @@ -75,6 +75,50 @@ void compute_row_nnz(std::shared_ptr exec, } +template +void fill_in_matrix_data(std::shared_ptr exec, + const device_matrix_data& data, + const int64* row_ptrs, const int64* 
coo_row_ptrs, + matrix::Hybrid* result) +{ + using device_value_type = device_type; + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto row_ptrs, auto data, auto ell_stride, + auto ell_max_nnz, auto ell_cols, auto ell_vals, + auto coo_row_ptrs, auto coo_row_idxs, auto coo_col_idxs, + auto coo_vals) { + const auto row_begin = row_ptrs[row]; + const auto row_size = row_ptrs[row + 1] - row_begin; + for (int64 i = 0; i < ell_max_nnz; i++) { + const auto out_idx = row + ell_stride * i; + const auto in_idx = i + row_begin; + const bool use = i < row_size; + ell_cols[out_idx] = use ? data[in_idx].column : 0; + ell_vals[out_idx] = use ? unpack_member(data[in_idx].value) + : zero(); + } + const auto coo_begin = coo_row_ptrs[row]; + for (int64 i = ell_max_nnz; i < row_size; i++) { + const auto in_idx = i + row_begin; + const auto out_idx = + coo_begin + i - static_cast(ell_max_nnz); + coo_row_idxs[out_idx] = row; + coo_col_idxs[out_idx] = data[in_idx].column; + coo_vals[out_idx] = unpack_member(data[in_idx].value); + } + }, + data.size[0], row_ptrs, data.nonzeros, result->get_ell_stride(), + result->get_ell_num_stored_elements_per_row(), + result->get_ell_col_idxs(), result->get_ell_values(), coo_row_ptrs, + result->get_coo_row_idxs(), result->get_coo_col_idxs(), + result->get_coo_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL); + + template void convert_to_csr(std::shared_ptr exec, const matrix::Hybrid* source, diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 647c0c7ed12..462406ef4b5 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -579,7 +579,7 @@ namespace hybrid { GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL); GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 0b5781e7a5d..d84cbd9c003 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -182,10 +182,10 @@ template void Coo::convert_to(Dense* result) const { auto exec = this->get_executor(); - result->resize(this->get_size()); - result->fill(zero()); - exec->run(coo::make_fill_in_dense( - this, make_temporary_output_clone(exec, result).get())); + auto tmp_result = make_temporary_output_clone(exec, result); + tmp_result->resize(this->get_size()); + tmp_result->fill(zero()); + exec->run(coo::make_fill_in_dense(this, tmp_result.get())); } diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 54c735f89b6..31ba7d7c8bb 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -211,10 +211,10 @@ template void Csr::convert_to(Dense* result) const { auto exec = this->get_executor(); - result->resize(this->get_size()); - result->fill(zero()); - exec->run(csr::make_fill_in_dense( - this, make_temporary_output_clone(exec, result).get())); + auto tmp_result = make_temporary_output_clone(exec, result); + tmp_result->resize(this->get_size()); + tmp_result->fill(zero()); + exec->run(csr::make_fill_in_dense(this, tmp_result.get())); } @@ -230,15 +230,22 @@ void Csr::convert_to( Hybrid* result) const { auto exec = this->get_executor(); - Array row_nnz{exec, this->get_size()[0]}; - Array coo_row_ptrs{exec, this->get_size()[0] + 1}; - exec->run(csr::make_convert_ptrs_to_sizes( - 
this->get_const_row_ptrs(), this->get_size()[0], row_nnz.get_data())); + const auto num_rows = this->get_size()[0]; + const auto num_cols = this->get_size()[1]; + Array row_nnz{exec, num_rows}; + Array coo_row_ptrs{exec, num_rows + 1}; + exec->run(csr::make_convert_ptrs_to_sizes(this->get_const_row_ptrs(), + num_rows, row_nnz.get_data())); size_type ell_lim{}; size_type coo_nnz{}; result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_nnz); + if (ell_lim > num_cols) { + // TODO remove temporary fix after ELL gains true structural zeros + ell_lim = num_cols; + } exec->run(csr::make_compute_hybrid_coo_row_ptrs(row_nnz, ell_lim, coo_row_ptrs.get_data())); + coo_nnz = exec->copy_val_to_host(coo_row_ptrs.get_const_data() + num_rows); auto tmp = make_temporary_clone(exec, result); tmp->resize(this->get_size(), ell_lim, coo_nnz); exec->run(csr::make_convert_to_hybrid(this, coo_row_ptrs.get_const_data(), diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 93ec5578700..b89d036ee73 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -557,18 +557,23 @@ template void Dense::convert_impl(Hybrid* result) const { auto exec = this->get_executor(); - Array row_nnz{exec, this->get_size()[0]}; - Array coo_row_ptrs{exec, this->get_size()[0] + 1}; + const auto num_rows = this->get_size()[0]; + const auto num_cols = this->get_size()[1]; + Array row_nnz{exec, num_rows}; + Array coo_row_ptrs{exec, num_rows + 1}; exec->run(dense::make_count_nonzeros_per_row(this, row_nnz.get_data())); size_type ell_lim{}; size_type coo_nnz{}; result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_nnz); + if (ell_lim > num_cols) { + // TODO remove temporary fix after ELL gains true structural zeros + ell_lim = num_cols; + } exec->run(dense::make_compute_hybrid_coo_row_ptrs(row_nnz, ell_lim, coo_row_ptrs.get_data())); + coo_nnz = exec->copy_val_to_host(coo_row_ptrs.get_const_data() + num_rows); auto tmp = make_temporary_clone(exec, result); - tmp->ell_->resize(this->get_size(), ell_lim); - tmp->coo_->resize(this->get_size(), coo_nnz); - tmp->set_size(this->get_size()); + tmp->resize(this->get_size(), ell_lim, coo_nnz); exec->run(dense::make_convert_to_hybrid(this, coo_row_ptrs.get_const_data(), tmp.get())); } diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 684fb7e627d..0a9f22f8acc 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -132,10 +132,10 @@ template void Ell::convert_to(Dense* result) const { auto exec = this->get_executor(); - result->resize(this->get_size()); - result->fill(zero()); - exec->run(ell::make_fill_in_dense( - this, make_temporary_output_clone(exec, result).get())); + auto tmp_result = make_temporary_output_clone(exec, result); + tmp_result->resize(this->get_size()); + tmp_result->fill(zero()); + exec->run(ell::make_fill_in_dense(this, tmp_result.get())); } diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index 688d7aa4b9f..e0aa4b8df3e 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -155,10 +155,10 @@ void Fbcsr::convert_to( Dense* const result) const { auto exec = this->get_executor(); - result->resize(this->get_size()); - result->fill(zero()); - exec->run(fbcsr::make_fill_in_dense( - this, make_temporary_output_clone(exec, result).get())); + auto tmp_result = make_temporary_output_clone(exec, result); + tmp_result->resize(this->get_size()); + tmp_result->fill(zero()); + exec->run(fbcsr::make_fill_in_dense(this, tmp_result.get())); } diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index 
42cdce0f162..6850c71a4bb 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -63,12 +63,13 @@ namespace { GKO_REGISTER_OPERATION(build_row_ptrs, components::build_row_ptrs); GKO_REGISTER_OPERATION(compute_row_nnz, hybrid::compute_row_nnz); -GKO_REGISTER_OPERATION(split_matrix_data, hybrid::split_matrix_data); +GKO_REGISTER_OPERATION(fill_in_matrix_data, hybrid::fill_in_matrix_data); GKO_REGISTER_OPERATION(ell_fill_in_dense, ell::fill_in_dense); GKO_REGISTER_OPERATION(coo_fill_in_dense, coo::fill_in_dense); GKO_REGISTER_OPERATION(ell_extract_diagonal, ell::extract_diagonal); GKO_REGISTER_OPERATION(coo_extract_diagonal, coo::extract_diagonal); GKO_REGISTER_OPERATION(ell_count_nonzeros_per_row, ell::count_nonzeros_per_row); +GKO_REGISTER_OPERATION(compute_coo_row_ptrs, hybrid::compute_coo_row_ptrs); GKO_REGISTER_OPERATION(convert_idxs_to_ptrs, components::convert_idxs_to_ptrs); GKO_REGISTER_OPERATION(convert_to_csr, hybrid::convert_to_csr); GKO_REGISTER_OPERATION(fill_array, components::fill_array); @@ -209,9 +210,11 @@ template void Hybrid::read(const device_mat_data& data) { auto exec = this->get_executor(); + const auto num_rows = data.size[0]; + const auto num_cols = data.size[1]; auto local_data = make_temporary_clone(exec, &data.nonzeros); - Array row_ptrs{exec, data.size[0] + 1}; - exec->run(hybrid::make_build_row_ptrs(*local_data, data.size[0], + Array row_ptrs{exec, num_rows + 1}; + exec->run(hybrid::make_build_row_ptrs(*local_data, num_rows, row_ptrs.get_data())); Array row_nnz{exec, data.size[0]}; exec->run(hybrid::make_compute_row_nnz(row_ptrs, row_nnz.get_data())); @@ -219,15 +222,17 @@ void Hybrid::read(const device_mat_data& data) size_type coo_nnz{}; this->get_strategy()->compute_hybrid_config(row_nnz, &ell_max_nnz, &coo_nnz); - auto ell_nnz = data.nonzeros.get_num_elems() - coo_nnz; - device_mat_data ell_data{exec, data.size, ell_nnz}; - device_mat_data coo_data{exec, data.size, coo_nnz}; - exec->run(hybrid::make_split_matrix_data( - data.nonzeros, row_ptrs.get_const_data(), ell_max_nnz, data.size[0], - ell_data.nonzeros, coo_data.nonzeros)); - this->set_size(data.size); - ell_->read(ell_data); - coo_->read(coo_data); + if (ell_max_nnz > num_cols) { + // TODO remove temporary fix after ELL gains true structural zeros + ell_max_nnz = num_cols; + } + Array coo_row_ptrs{exec, num_rows + 1}; + exec->run(hybrid::make_compute_coo_row_ptrs(row_nnz, ell_max_nnz, + coo_row_ptrs.get_data())); + coo_nnz = exec->copy_val_to_host(coo_row_ptrs.get_const_data() + num_rows); + this->resize(data.size, ell_max_nnz, coo_nnz); + exec->run(hybrid::make_fill_in_matrix_data( + data, row_ptrs.get_const_data(), coo_row_ptrs.get_const_data(), this)); } diff --git a/core/matrix/hybrid_kernels.hpp b/core/matrix/hybrid_kernels.hpp index 9e9cc08f88d..ca298c9eabd 100644 --- a/core/matrix/hybrid_kernels.hpp +++ b/core/matrix/hybrid_kernels.hpp @@ -56,13 +56,12 @@ namespace kernels { const Array& row_nnz, \ size_type ell_lim, int64* coo_row_ptrs) -#define GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL(ValueType, IndexType) \ - void split_matrix_data( \ - std::shared_ptr exec, \ - const Array>& data, \ - const int64* row_ptrs, size_type num_rows, size_type ell_limit, \ - Array>& ell_data, \ - Array>& coo_data) +#define GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType) \ + void fill_in_matrix_data( \ + std::shared_ptr exec, \ + const device_matrix_data& data, \ + const int64* row_ptrs, const int64* coo_row_ptrs, \ + matrix::Hybrid* result) #define 
GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ void convert_to_csr(std::shared_ptr exec, \ @@ -72,12 +71,12 @@ namespace kernels { matrix::Csr* result) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ; \ - GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL; \ - template \ - GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL(ValueType, IndexType); \ - template \ +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ; \ + GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL; \ + template \ + GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ + template \ GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index b5e70c23647..bbdf22df95e 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -127,10 +127,10 @@ template void Sellp::convert_to(Dense* result) const { auto exec = this->get_executor(); - result->resize(this->get_size()); - result->fill(zero()); - exec->run(sellp::make_fill_in_dense( - this, make_temporary_output_clone(exec, result).get())); + auto tmp_result = make_temporary_output_clone(exec, result); + tmp_result->resize(this->get_size()); + tmp_result->fill(zero()); + exec->run(sellp::make_fill_in_dense(this, tmp_result.get())); } diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp index 8d32985abf1..ea02f43b7c2 100644 --- a/core/matrix/sparsity_csr.cpp +++ b/core/matrix/sparsity_csr.cpp @@ -127,10 +127,10 @@ void SparsityCsr::convert_to( Dense* result) const { auto exec = this->get_executor(); - auto tmp = make_temporary_output_clone(exec, result); - tmp->resize(this->get_size()); - tmp->fill(zero()); - exec->run(sparsity_csr::make_fill_in_dense(this, tmp.get())); + auto tmp_result = make_temporary_output_clone(exec, result); + tmp_result->resize(this->get_size()); + tmp_result->fill(zero()); + exec->run(sparsity_csr::make_fill_in_dense(this, tmp_result.get())); } diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 4945056790b..2ace138b485 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -95,7 +95,6 @@ target_sources(ginkgo_cuda matrix/ell_kernels.cu matrix/fbcsr_kernels.cu matrix/fft_kernels.cu - matrix/hybrid_kernels.cu matrix/sellp_kernels.cu matrix/sparsity_csr_kernels.cu multigrid/amgx_pgm_kernels.cu diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu index 0c7540ca7b1..29291c6c93e 100644 --- a/cuda/matrix/coo_kernels.cu +++ b/cuda/matrix/coo_kernels.cu @@ -114,7 +114,7 @@ void spmv2(std::shared_ptr exec, const dim3 coo_block(config::warp_size, warps_in_block, 1); const auto nwarps = host_kernel::calculate_nwarps(exec, nnz); - if (nwarps > 0) { + if (nwarps > 0 && b_ncols > 0) { if (b_ncols < 4) { const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); int num_lines = ceildiv(nnz, nwarps * config::warp_size); @@ -152,7 +152,7 @@ void advanced_spmv2(std::shared_ptr exec, const dim3 coo_block(config::warp_size, warps_in_block, 1); const auto b_ncols = b->get_size()[1]; - if (nwarps > 0) { + if (nwarps > 0 && b_ncols > 0) { if (b_ncols < 4) { int num_lines = ceildiv(nnz, nwarps * config::warp_size); const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 60d2c119a03..7d2fc8defc7 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -300,7 +300,23 @@ template void convert_to_sparsity_csr(std::shared_ptr exec, const matrix::Dense* source, 
matrix::SparsityCsr* result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_ptrs = result->get_row_ptrs(); + auto col_idxs = result->get_col_idxs(); + + auto stride = source->get_stride(); + + const auto grid_dim = + ceildiv(num_rows, default_block_size / config::warp_size); + if (grid_dim > 0) { + kernel::fill_in_sparsity_csr<<>>( + num_rows, num_cols, stride, + as_cuda_type(source->get_const_values()), row_ptrs, col_idxs); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); diff --git a/cuda/matrix/hybrid_kernels.cu b/cuda/matrix/hybrid_kernels.cu deleted file mode 100644 index f1b05c4f605..00000000000 --- a/cuda/matrix/hybrid_kernels.cu +++ /dev/null @@ -1,64 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include "core/matrix/hybrid_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/unified/base/kernel_launch.hpp" -#include "cuda/base/types.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Hybrid matrix format namespace. - * - * @ingroup hybrid - */ -namespace hybrid { - - -#include "common/cuda_hip/matrix/hybrid_kernels.hpp.inc" - - -} // namespace hybrid -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp index 8b3ab2630f3..ff787f2be0c 100644 --- a/cuda/test/matrix/dense_kernels.cpp +++ b/cuda/test/matrix/dense_kernels.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
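// Illustrative sketch (editorial, not part of the patch): the fill_in_sparsity_csr
// kernel launched by the CUDA/HIP convert_to_sparsity_csr functions above compacts
// the nonzero columns of each dense row with a warp ballot: every lane tests one
// column, the ballot mask records which lanes found a nonzero, and a popcount over
// the lower-ranked lanes gives each lane its write offset within the row. A minimal
// sequential model of that indexing, with hypothetical names and the 32-wide warp
// emulated by a plain loop, could look as follows; it is a sketch, not Ginkgo code.

#include <bitset>
#include <cstdint>
#include <vector>

// Writes the column indices of all nonzeros of one dense row into col_idxs,
// starting at base_out (that row's CSR row pointer). Returns how many it wrote.
inline int compact_row_columns(const double* row_vals, int num_cols,
                               int base_out, std::vector<int>& col_idxs)
{
    constexpr int warp_size = 32;  // stands in for config::warp_size
    int written = 0;
    for (int chunk = 0; chunk < num_cols; chunk += warp_size) {
        // "ballot": one bit per lane, set if that lane's column holds a nonzero
        std::uint32_t mask = 0;
        for (int lane = 0; lane < warp_size; lane++) {
            const int col = chunk + lane;
            if (col < num_cols && row_vals[col] != 0.0) {
                mask |= std::uint32_t{1} << lane;
            }
        }
        // every set lane writes at base + already written + popcount(lower lanes)
        for (int lane = 0; lane < warp_size; lane++) {
            const auto lane_bit = std::uint32_t{1} << lane;
            if (mask & lane_bit) {
                const auto lower = std::bitset<32>(mask & (lane_bit - 1)).count();
                col_idxs[base_out + written + static_cast<int>(lower)] =
                    chunk + lane;
            }
        }
        written += static_cast<int>(std::bitset<32>(mask).count());
    }
    return written;
}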
#include #include #include +#include #include "core/components/fill_array_kernels.hpp" @@ -437,6 +438,32 @@ TEST_F(Dense, MoveToCsrIsEquivalentToRef) } +TEST_F(Dense, ConvertToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(cuda); + + x->convert_to(sparsity_mtx.get()); + dx->convert_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); +} + + +TEST_F(Dense, MoveToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(cuda); + + x->move_to(sparsity_mtx.get()); + dx->move_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); +} + + TEST_F(Dense, ConvertToEllIsEquivalentToRef) { set_up_apply_data(); diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index f896ac396a8..3a18b0930d1 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -35,7 +35,6 @@ target_sources(ginkgo_dpcpp matrix/diagonal_kernels.dp.cpp matrix/ell_kernels.dp.cpp matrix/fft_kernels.dp.cpp - matrix/hybrid_kernels.dp.cpp matrix/sellp_kernels.dp.cpp matrix/sparsity_csr_kernels.dp.cpp multigrid/amgx_pgm_kernels.dp.cpp diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 626fd17dab0..cfd435d1dc1 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include @@ -84,316 +85,6 @@ constexpr int default_block_size = 256; namespace kernel { -template -void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, - const int64* __restrict__ row_ptrs, - const ValueType* __restrict__ source, - IndexType* __restrict__ row_idxs, - IndexType* __restrict__ col_idxs, - ValueType* __restrict__ values, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - if (tidx < num_rows) { - size_type write_to = row_ptrs[tidx]; - - for (size_type i = 0; i < num_cols; i++) { - if (is_nonzero(source[stride * tidx + i])) { - values[write_to] = source[stride * tidx + i]; - col_idxs[write_to] = i; - row_idxs[write_to] = tidx; - write_to++; - } - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo) - - -template -void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, - const ValueType* __restrict__ work, - IndexType* __restrict__ result, - sycl::nd_item<3> item_ct1) -{ - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - const auto row_idx = thread::get_subwarp_id_flat(item_ct1); - auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - - if (row_idx < num_rows) { - IndexType part_result{}; - for (auto i = warp_tile.thread_rank(); i < num_cols; i += sg_size) { - if (is_nonzero(work[stride * row_idx + i])) { - part_result += 1; - } - } - result[row_idx] = ::gko::kernels::dpcpp::reduce( - warp_tile, part_result, - [](const size_type& a, const size_type& b) { return a + b; }); - } -} - -GKO_ENABLE_DEFAULT_HOST_CONFIG(count_nnz_per_row, count_nnz_per_row) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(count_nnz_per_row, count_nnz_per_row) -GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row, - kcfg_1d_list) - - -template -void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, - 
const ValueType* __restrict__ source, - IndexType* __restrict__ row_ptrs, - IndexType* __restrict__ col_idxs, - ValueType* __restrict__ values, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - - if (tidx < num_rows) { - auto write_to = row_ptrs[tidx]; - for (size_type i = 0; i < num_cols; i++) { - if (is_nonzero(source[stride * tidx + i])) { - values[write_to] = source[stride * tidx + i]; - col_idxs[write_to] = i; - write_to++; - } - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr) - - -template -void fill_in_ell(size_type num_rows, size_type num_cols, - size_type source_stride, const ValueType* __restrict__ source, - size_type max_nnz_per_row, size_type result_stride, - IndexType* __restrict__ col_ptrs, - ValueType* __restrict__ values, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - if (tidx < num_rows) { - IndexType col_idx = 0; - for (size_type col = 0; col < num_cols; col++) { - if (is_nonzero(source[tidx * source_stride + col])) { - col_ptrs[col_idx * result_stride + tidx] = col; - values[col_idx * result_stride + tidx] = - source[tidx * source_stride + col]; - col_idx++; - } - } - for (size_type j = col_idx; j < max_nnz_per_row; j++) { - col_ptrs[j * result_stride + tidx] = 0; - values[j * result_stride + tidx] = zero(); - } - } else if (tidx < result_stride) { - for (size_type j = 0; j < max_nnz_per_row; j++) { - col_ptrs[j * result_stride + tidx] = 0; - values[j * result_stride + tidx] = zero(); - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell) - - -template -void calculate_slice_lengths(size_type num_rows, size_type slice_size, - int slice_num, size_type stride_factor, - const size_type* __restrict__ nnz_per_row, - size_type* __restrict__ slice_lengths, - size_type* __restrict__ slice_sets, - sycl::nd_item<3> item_ct1) -{ - constexpr auto sg_size = cfg; - const auto sliceid = item_ct1.get_group(2); - const auto tid_in_warp = item_ct1.get_local_id(2); - const bool runable = sliceid * slice_size + tid_in_warp < num_rows; - size_type thread_result = 0; - for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { - thread_result = - (i + slice_size * sliceid < num_rows) - ? 
max(thread_result, nnz_per_row[sliceid * slice_size + i]) - : thread_result; - } - - auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - auto warp_result = ::gko::kernels::dpcpp::reduce( - warp_tile, thread_result, - [](const size_type& a, const size_type& b) { return max(a, b); }); - - if (tid_in_warp == 0 && runable) { - auto slice_length = ceildiv(warp_result, stride_factor) * stride_factor; - slice_lengths[sliceid] = slice_length; - slice_sets[sliceid] = slice_length; - } -} - -GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths, - calculate_slice_lengths) -GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call, - calculate_slice_lengths, subgroup_list) - - -template -void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, - size_type stride, const ValueType* __restrict__ source, - size_type* __restrict__ slice_lengths, - size_type* __restrict__ slice_sets, - IndexType* __restrict__ col_idxs, - ValueType* __restrict__ vals, sycl::nd_item<3> item_ct1) -{ - const auto global_row = thread::get_thread_id_flat(item_ct1); - const auto row = global_row % slice_size; - const auto sliceid = global_row / slice_size; - - if (global_row < num_rows) { - size_type sellp_ind = slice_sets[sliceid] * slice_size + row; - - for (size_type col = 0; col < num_cols; col++) { - auto val = source[global_row * stride + col]; - if (is_nonzero(val)) { - col_idxs[sellp_ind] = col; - vals[sellp_ind] = val; - sellp_ind += slice_size; - } - } - for (size_type i = sellp_ind; - i < - (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row; - i += slice_size) { - col_idxs[i] = 0; - vals[i] = zero(); - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp) - - -template -void reduce_max_nnz(size_type size, const size_type* __restrict__ nnz_per_row, - size_type* __restrict__ result, sycl::nd_item<3> item_ct1, - uint8_t* dpct_local) -{ - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - auto block_max = (size_type*)dpct_local; - - reduce_array( - size, nnz_per_row, block_max, item_ct1, - [](const size_type& x, const size_type& y) { return max(x, y); }); - - if (item_ct1.get_local_id(2) == 0) { - result[item_ct1.get_group(2)] = block_max[0]; - } -} - -template -void reduce_max_nnz(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type size, - const size_type* nnz_per_row, size_type* result) -{ - queue->submit([&](sycl::handler& cgh) { - sycl::accessor - dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - reduce_max_nnz(size, nnz_per_row, result, item_ct1, - dpct_local_acc_ct1.get_pointer().get()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz); -GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, - kcfg_1d_list) - - -template -void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, - size_type stride_factor, - const size_type* __restrict__ nnz_per_row, - size_type* __restrict__ result, - sycl::nd_item<3> item_ct1) -{ - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - const auto warpid = thread::get_subwarp_id_flat(item_ct1); - const auto tid_in_warp = warp_tile.thread_rank(); - const auto slice_num = ceildiv(num_rows, slice_size); - - size_type 
thread_result = 0; - for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { - if (warpid * slice_size + i < num_rows) { - thread_result = - max(thread_result, nnz_per_row[warpid * slice_size + i]); - } - } - - auto warp_result = ::gko::kernels::dpcpp::reduce( - warp_tile, thread_result, - [](const size_type& a, const size_type& b) { return max(a, b); }); - - if (tid_in_warp == 0 && warpid < slice_num) { - result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; - } -} - -GKO_ENABLE_DEFAULT_HOST_CONFIG(reduce_max_nnz_per_slice, - reduce_max_nnz_per_slice) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz_per_slice, - reduce_max_nnz_per_slice) -GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call, - reduce_max_nnz_per_slice, kcfg_1d_list) - - -template -void reduce_total_cols(size_type num_slices, - const size_type* __restrict__ max_nnz_per_slice, - size_type* __restrict__ result, - sycl::nd_item<3> item_ct1, uint8_t* dpct_local) -{ - auto block_result = (size_type*)dpct_local; - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - reduce_array( - num_slices, max_nnz_per_slice, block_result, item_ct1, - [](const size_type& x, const size_type& y) { return x + y; }); - - if (item_ct1.get_local_id(2) == 0) { - result[item_ct1.get_group(2)] = block_result[0]; - } -} - -template -void reduce_total_cols(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_slices, - const size_type* max_nnz_per_slice, size_type* result) -{ - queue->submit([&](sycl::handler& cgh) { - sycl::accessor - dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - reduce_total_cols(num_slices, max_nnz_per_slice, result, - item_ct1, - dpct_local_acc_ct1.get_pointer().get()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_total_cols, - reduce_total_cols); -GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_total_cols_call, reduce_total_cols, - kcfg_1d_list) - template void transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, @@ -540,29 +231,30 @@ void convert_to_coo(std::shared_ptr exec, const int64* row_ptrs, matrix::Coo* result) { - auto num_rows = result->get_size()[0]; - auto num_cols = result->get_size()[1]; - - auto row_idxs = result->get_row_idxs(); - auto col_idxs = result->get_col_idxs(); - auto values = result->get_values(); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto in_vals = source->get_const_values(); + const auto stride = source->get_stride(); - auto stride = source->get_stride(); + auto rows = result->get_row_idxs(); + auto cols = result->get_col_idxs(); + auto vals = result->get_values(); - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + const auto row = static_cast(item[0]); + auto write_to = row_ptrs[row]; + + for (size_type col = 0; col < num_cols; col++) { + if (is_nonzero(in_vals[stride * row + col])) { + vals[write_to] = in_vals[stride * row + col]; + cols[write_to] = static_cast(col); + rows[write_to] = static_cast(row); + write_to++; + } + } }); - const auto wg_size = 
KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - size_type grid_dim = ceildiv(num_rows, wg_size); - - kernel::fill_in_coo(grid_dim, wg_size, 0, exec->get_queue(), num_rows, - num_cols, stride, row_ptrs, source->get_const_values(), - row_idxs, col_idxs, values); + }); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -574,30 +266,29 @@ void convert_to_csr(std::shared_ptr exec, const matrix::Dense* source, matrix::Csr* result) { - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - auto num_rows = result->get_size()[0]; - auto num_cols = result->get_size()[1]; - - auto row_ptrs = result->get_row_ptrs(); - auto col_idxs = result->get_col_idxs(); - auto values = result->get_values(); - - auto stride = source->get_stride(); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto in_vals = source->get_const_values(); + const auto stride = source->get_stride(); - size_type grid_dim = ceildiv(num_rows, wg_size); + const auto row_ptrs = result->get_const_row_ptrs(); + auto cols = result->get_col_idxs(); + auto vals = result->get_values(); - kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, num_cols, stride, source->get_const_values(), - row_ptrs, col_idxs, values); + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + const auto row = static_cast(item[0]); + auto write_to = row_ptrs[row]; + + for (size_type col = 0; col < num_cols; col++) { + if (is_nonzero(in_vals[stride * row + col])) { + vals[write_to] = in_vals[stride * row + col]; + cols[write_to] = static_cast(col); + write_to++; + } + } + }); + }); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -609,29 +300,34 @@ void convert_to_ell(std::shared_ptr exec, const matrix::Dense* source, matrix::Ell* result) { - auto num_rows = result->get_size()[0]; - auto num_cols = result->get_size()[1]; - auto max_nnz_per_row = result->get_num_stored_elements_per_row(); - - auto col_ptrs = result->get_col_idxs(); - auto values = result->get_values(); - - auto source_stride = source->get_stride(); - auto result_stride = result->get_stride(); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto max_nnz_per_row = result->get_num_stored_elements_per_row(); + const auto in_vals = source->get_const_values(); + const auto in_stride = source->get_stride(); - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); + auto cols = result->get_col_idxs(); + auto vals = result->get_values(); + const auto stride = result->get_stride(); + + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + const auto row = static_cast(item[0]); + size_type col_idx = 0; + for (size_type col = 0; col < num_cols; col++) { + if (is_nonzero(in_vals[row * in_stride + col])) { + cols[col_idx * stride + row] = col; + vals[col_idx * stride + row] = + in_vals[row * in_stride + col]; + col_idx++; + } + } + for (; 
col_idx < max_nnz_per_row; col_idx++) { + cols[col_idx * stride + row] = 0; + vals[col_idx * stride + row] = zero(); + } }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - auto grid_dim = ceildiv(result_stride, wg_size); - kernel::fill_in_ell(grid_dim, wg_size, 0, exec->get_queue(), num_rows, - num_cols, source_stride, source->get_const_values(), - max_nnz_per_row, result_stride, col_ptrs, values); + }); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -659,10 +355,56 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void convert_to_hybrid( - std::shared_ptr exec, - const matrix::Dense* source, const int64* coo_row_ptrs, - matrix::Hybrid* result) GKO_NOT_IMPLEMENTED; +void convert_to_hybrid(std::shared_ptr exec, + const matrix::Dense* source, + const int64* coo_row_ptrs, + matrix::Hybrid* result) +{ + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto ell_lim = result->get_ell_num_stored_elements_per_row(); + const auto in_vals = source->get_const_values(); + const auto in_stride = source->get_stride(); + const auto ell_stride = result->get_ell_stride(); + auto ell_cols = result->get_ell_col_idxs(); + auto ell_vals = result->get_ell_values(); + auto coo_rows = result->get_coo_row_idxs(); + auto coo_cols = result->get_coo_col_idxs(); + auto coo_vals = result->get_coo_values(); + + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + const auto row = static_cast(item[0]); + size_type ell_count = 0; + size_type col = 0; + auto ell_idx = row; + for (; col < num_cols && ell_count < ell_lim; col++) { + const auto val = in_vals[row * in_stride + col]; + if (is_nonzero(val)) { + ell_vals[ell_idx] = val; + ell_cols[ell_idx] = static_cast(col); + ell_count++; + ell_idx += ell_stride; + } + } + for (; ell_count < ell_lim; ell_count++) { + ell_vals[ell_idx] = zero(); + ell_cols[ell_idx] = 0; + ell_idx += ell_stride; + } + auto coo_idx = coo_row_ptrs[row]; + for (; col < num_cols; col++) { + const auto val = in_vals[row * in_stride + col]; + if (is_nonzero(val)) { + coo_vals[coo_idx] = val; + coo_cols[coo_idx] = static_cast(col); + coo_rows[coo_idx] = static_cast(row); + coo_idx++; + } + } + }); + }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); @@ -673,50 +415,38 @@ void convert_to_sellp(std::shared_ptr exec, const matrix::Dense* source, matrix::Sellp* result) { - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto stride = source->get_stride(); const auto num_rows = result->get_size()[0]; const auto num_cols = result->get_size()[1]; + const auto stride = source->get_stride(); + const auto in_vals = source->get_const_values(); + const auto slice_sets = result->get_const_slice_sets(); + const auto slice_size = result->get_slice_size(); auto vals = result->get_values(); auto col_idxs = result->get_col_idxs(); - auto slice_lengths = result->get_slice_lengths(); - auto slice_sets = result->get_slice_sets(); - - const auto slice_size = result->get_slice_size(); - const auto stride_factor = result->get_stride_factor(); - const int slice_num = ceildiv(num_rows, slice_size); 
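// Illustrative sketch (editorial, not part of the patch): the unified
// fill_in_matrix_data kernel and the SYCL convert_to_hybrid kernel above split each
// row the same way: the first ell_lim nonzeros go into the ELL block (stored
// column-major, so entry k of row r lives at k * ell_stride + r, padded with
// explicit zeros), and the remainder goes into the COO block starting at
// coo_row_ptrs[r]. A sequential model of that split for one dense row, with
// hypothetical names outside Ginkgo's API, might read:

#include <cstdint>
#include <vector>

struct hybrid_row_output {
    std::vector<std::int64_t>& ell_cols;  // size: ell_lim * ell_stride
    std::vector<double>& ell_vals;        // size: ell_lim * ell_stride
    std::vector<std::int64_t>& coo_rows;
    std::vector<std::int64_t>& coo_cols;
    std::vector<double>& coo_vals;
};

inline void split_dense_row(const double* row_vals, std::int64_t num_cols,
                            std::int64_t row, std::int64_t ell_lim,
                            std::int64_t ell_stride, std::int64_t coo_begin,
                            hybrid_row_output out)
{
    std::int64_t ell_count = 0;
    std::int64_t coo_idx = coo_begin;
    for (std::int64_t col = 0; col < num_cols; col++) {
        const auto val = row_vals[col];
        if (val == 0.0) {
            continue;
        }
        if (ell_count < ell_lim) {
            // ELL storage is column-major: consecutive entries of a row are
            // ell_stride apart
            out.ell_cols[ell_count * ell_stride + row] = col;
            out.ell_vals[ell_count * ell_stride + row] = val;
            ell_count++;
        } else {
            // overflow of this row goes to the COO part
            out.coo_rows[coo_idx] = row;
            out.coo_cols[coo_idx] = col;
            out.coo_vals[coo_idx] = val;
            coo_idx++;
        }
    }
    // pad the unused ELL slots of this row with explicit zeros
    for (; ell_count < ell_lim; ell_count++) {
        out.ell_cols[ell_count * ell_stride + row] = 0;
        out.ell_vals[ell_count * ell_stride + row] = 0.0;
    }
}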
- - auto nnz_per_row = Array(exec, num_rows); - count_nonzeros_per_row(exec, source, nnz_per_row.get_data()); - - auto grid_dim = slice_num; - if (grid_dim > 0) { - kernel::calculate_slice_lengths_call( - sg_size, grid_dim, sg_size, 0, exec->get_queue(), num_rows, - slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), - slice_lengths, slice_sets); - } - - components::prefix_sum(exec, slice_sets, slice_num + 1); - - grid_dim = ceildiv(num_rows, wg_size); - if (grid_dim > 0) { - kernel::fill_in_sellp(grid_dim, wg_size, 0, exec->get_queue(), num_rows, - num_cols, slice_size, stride, - source->get_const_values(), slice_lengths, - slice_sets, col_idxs, vals); - } + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + const auto row = static_cast(item[0]); + const auto local_row = row % slice_size; + const auto slice = row / slice_size; + const auto slice_end = slice_sets[slice + 1] * slice_size; + auto out_idx = slice_sets[slice] * slice_size + local_row; + + for (size_type col = 0; col < num_cols; col++) { + const auto val = in_vals[row * stride + col]; + if (is_nonzero(val)) { + col_idxs[out_idx] = static_cast(col); + vals[out_idx] = val; + out_idx += slice_size; + } + } + for (; out_idx < slice_end; out_idx += slice_size) { + col_idxs[out_idx] = 0; + vals[out_idx] = zero(); + } + }); + }); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -727,7 +457,29 @@ template void convert_to_sparsity_csr(std::shared_ptr exec, const matrix::Dense* source, matrix::SparsityCsr* result) - GKO_NOT_IMPLEMENTED; +{ + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto in_vals = source->get_const_values(); + const auto stride = source->get_stride(); + + const auto row_ptrs = result->get_const_row_ptrs(); + auto cols = result->get_col_idxs(); + + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + const auto row = static_cast(item[0]); + auto write_to = row_ptrs[row]; + + for (size_type col = 0; col < num_cols; col++) { + if (is_nonzero(in_vals[stride * row + col])) { + cols[write_to] = static_cast(col); + write_to++; + } + } + }); + }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index e8976754b98..6afc78f0d8f 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
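// Illustrative sketch (editorial, not part of the patch): the SYCL convert_to_sellp
// kernel above relies on the SELL-P storage rule that the k-th stored element of
// global row r sits at (slice_sets[r / slice_size] + k) * slice_size +
// (r % slice_size), with zero padding up to slice_sets[slice + 1] * slice_size.
// Two small helpers capturing just that index arithmetic (hypothetical names,
// outside Ginkgo's API) would be:

#include <cstdint>

// Storage index of the k-th stored element of row `row` in a SELL-P matrix with
// the given slice size and prefix-summed slice_sets array.
inline std::int64_t sellp_index(std::int64_t row, std::int64_t k,
                                std::int64_t slice_size,
                                const std::int64_t* slice_sets)
{
    const auto slice = row / slice_size;
    const auto local_row = row % slice_size;
    return (slice_sets[slice] + k) * slice_size + local_row;
}

// Number of stored elements per row of that slice, including zero padding.
inline std::int64_t sellp_slice_length(std::int64_t row, std::int64_t slice_size,
                                       const std::int64_t* slice_sets)
{
    const auto slice = row / slice_size;
    return slice_sets[slice + 1] - slice_sets[slice];
}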
#include #include #include +#include #include "core/components/fill_array_kernels.hpp" @@ -470,6 +471,32 @@ TEST_F(Dense, MoveToCsrIsEquivalentToRef) } +TEST_F(Dense, ConvertToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto sparsity_mtx = gko::matrix::SparsityCsr::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr::create(dpcpp); + + x->convert_to(sparsity_mtx.get()); + dx->convert_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); +} + + +TEST_F(Dense, MoveToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto sparsity_mtx = gko::matrix::SparsityCsr::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr::create(dpcpp); + + x->move_to(sparsity_mtx.get()); + dx->move_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); +} + + TEST_F(Dense, ConvertToEllIsEquivalentToRef) { set_up_apply_data(); diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 55b2c4605b8..c860d6abb2a 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -179,7 +179,6 @@ set(GINKGO_HIP_SOURCES matrix/diagonal_kernels.hip.cpp matrix/ell_kernels.hip.cpp matrix/fbcsr_kernels.hip.cpp - matrix/hybrid_kernels.hip.cpp matrix/sellp_kernels.hip.cpp matrix/sparsity_csr_kernels.hip.cpp multigrid/amgx_pgm_kernels.hip.cpp diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 188deeb7a0f..63e42fbea5b 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -308,7 +308,24 @@ template void convert_to_sparsity_csr(std::shared_ptr exec, const matrix::Dense* source, matrix::SparsityCsr* result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_ptrs = result->get_row_ptrs(); + auto col_idxs = result->get_col_idxs(); + + auto stride = source->get_stride(); + + const auto grid_dim = + ceildiv(num_rows, default_block_size / config::warp_size); + if (grid_dim > 0) { + hipLaunchKernelGGL(kernel::fill_in_sparsity_csr, grid_dim, + default_block_size, 0, 0, num_rows, num_cols, stride, + as_hip_type(source->get_const_values()), + as_hip_type(row_ptrs), as_hip_type(col_idxs)); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); diff --git a/hip/matrix/hybrid_kernels.hip.cpp b/hip/matrix/hybrid_kernels.hip.cpp deleted file mode 100644 index acd067ea28b..00000000000 --- a/hip/matrix/hybrid_kernels.hip.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include "core/matrix/hybrid_kernels.hpp" - - -#include -#include -#include -#include -#include -#include - - -#include "common/unified/base/kernel_launch.hpp" -#include "hip/base/types.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Hybrid matrix format namespace. - * - * @ingroup hybrid - */ -namespace hybrid { - - -#include "common/cuda_hip/matrix/hybrid_kernels.hpp.inc" - - -} // namespace hybrid -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/test/matrix/dense_kernels.hip.cpp b/hip/test/matrix/dense_kernels.hip.cpp index e6978a68fa1..b9c3899a5fe 100644 --- a/hip/test/matrix/dense_kernels.hip.cpp +++ b/hip/test/matrix/dense_kernels.hip.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "core/components/fill_array_kernels.hpp" @@ -433,6 +434,32 @@ TEST_F(Dense, MoveToCsrIsEquivalentToRef) } +TEST_F(Dense, ConvertToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(hip); + + x->convert_to(sparsity_mtx.get()); + dx->convert_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); +} + + +TEST_F(Dense, MoveToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(hip); + + x->move_to(sparsity_mtx.get()); + dx->move_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); +} + + TEST_F(Dense, ConvertToEllIsEquivalentToRef) { set_up_apply_data(); diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 79b29e053c3..179e6ee18a7 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -22,7 +22,6 @@ target_sources(ginkgo_omp matrix/ell_kernels.cpp matrix/fbcsr_kernels.cpp matrix/fft_kernels.cpp - matrix/hybrid_kernels.cpp matrix/sellp_kernels.cpp matrix/sparsity_csr_kernels.cpp multigrid/amgx_pgm_kernels.cpp diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index 56735caa436..dd47a2b3ed7 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -283,8 +283,7 @@ void convert_to_hybrid(std::shared_ptr exec, { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; - auto strategy = result->get_strategy(); - auto ell_lim = strategy->get_ell_num_stored_elements_per_row(); + auto ell_lim = result->get_ell_num_stored_elements_per_row(); auto coo_val = result->get_coo_values(); auto coo_col = result->get_coo_col_idxs(); auto coo_row 
= result->get_coo_row_idxs(); diff --git a/omp/matrix/hybrid_kernels.cpp b/omp/matrix/hybrid_kernels.cpp deleted file mode 100644 index 083975cdc96..00000000000 --- a/omp/matrix/hybrid_kernels.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include "core/matrix/hybrid_kernels.hpp" - - -#include - - -#include -#include -#include -#include - - -#include "core/components/format_conversion_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/ell_kernels.hpp" - - -namespace gko { -namespace kernels { -namespace omp { -/** - * @brief The Hybrid matrix format namespace. 
- * - * @ingroup hybrid - */ -namespace hybrid { - - -template -void split_matrix_data( - std::shared_ptr exec, - const Array>& data, - const int64* row_ptrs, size_type ell_limit, size_type num_rows, - Array>& ell_data, - Array>& coo_data) -{ - auto data_ptr = data.get_const_data(); - size_type ell_nnz{}; - for (size_type row = 0; row < num_rows; row++) { - ell_nnz += - std::min(ell_limit, row_ptrs[row + 1] - row_ptrs[row]); - } - ell_data.resize_and_reset(ell_nnz); - coo_data.resize_and_reset(data.get_num_elems() - ell_nnz); - size_type ell_nz{}; - size_type coo_nz{}; - for (size_type row = 0; row < num_rows; row++) { - size_type local_ell_nnz{}; - for (auto i = row_ptrs[row]; i < row_ptrs[row + 1]; i++) { - if (local_ell_nnz < ell_limit) { - ell_data.get_data()[ell_nz] = data.get_const_data()[i]; - ell_nz++; - local_ell_nnz++; - } else { - coo_data.get_data()[coo_nz] = data.get_const_data()[i]; - coo_nz++; - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); - - -} // namespace hybrid -} // namespace omp -} // namespace kernels -} // namespace gko diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 7ce43de53ed..1d8667e14ac 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -683,8 +683,7 @@ void convert_to_hybrid(std::shared_ptr exec, auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; auto strategy = result->get_strategy(); - auto ell_lim = strategy->get_ell_num_stored_elements_per_row(); - auto coo_lim = strategy->get_coo_nnz(); + auto ell_lim = result->get_ell_num_stored_elements_per_row(); auto coo_val = result->get_coo_values(); auto coo_col = result->get_coo_col_idxs(); auto coo_row = result->get_coo_row_idxs(); diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp index 3b2f9badc54..d2f12cfc65a 100644 --- a/reference/matrix/hybrid_kernels.cpp +++ b/reference/matrix/hybrid_kernels.cpp @@ -83,40 +83,38 @@ void compute_row_nnz(std::shared_ptr exec, template -void split_matrix_data( - std::shared_ptr exec, - const Array>& data, - const int64* row_ptrs, size_type ell_limit, size_type num_rows, - Array>& ell_data, - Array>& coo_data) +void fill_in_matrix_data(std::shared_ptr exec, + const device_matrix_data& data, + const int64* row_ptrs, const int64*, + matrix::Hybrid* result) { - auto data_ptr = data.get_const_data(); - size_type ell_nnz{}; - for (size_type row = 0; row < num_rows; row++) { - ell_nnz += - std::min(ell_limit, row_ptrs[row + 1] - row_ptrs[row]); - } - ell_data.resize_and_reset(ell_nnz); - coo_data.resize_and_reset(data.get_num_elems() - ell_nnz); - size_type ell_nz{}; + const auto num_rows = result->get_size()[0]; + const auto ell_max_nnz = result->get_ell_num_stored_elements_per_row(); + const auto nonzeros = data.nonzeros.get_const_data(); size_type coo_nz{}; for (size_type row = 0; row < num_rows; row++) { - size_type local_ell_nnz{}; - for (auto i = row_ptrs[row]; i < row_ptrs[row + 1]; i++) { - if (local_ell_nnz < ell_limit) { - ell_data.get_data()[ell_nz] = data.get_const_data()[i]; + size_type ell_nz{}; + for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) { + if (ell_nz < ell_max_nnz) { + result->ell_col_at(row, ell_nz) = nonzeros[nz].column; + result->ell_val_at(row, ell_nz) = nonzeros[nz].value; ell_nz++; - local_ell_nnz++; } else { - coo_data.get_data()[coo_nz] = data.get_const_data()[i]; + result->get_coo_row_idxs()[coo_nz] = nonzeros[nz].row; + 
+                result->get_coo_col_idxs()[coo_nz] = nonzeros[nz].column;
+                result->get_coo_values()[coo_nz] = nonzeros[nz].value;
                 coo_nz++;
             }
         }
+        for (; ell_nz < ell_max_nnz; ell_nz++) {
+            result->ell_col_at(row, ell_nz) = 0;
+            result->ell_val_at(row, ell_nz) = zero<ValueType>();
+        }
     }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL);
+    GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
diff --git a/test/matrix/CMakeLists.txt b/test/matrix/CMakeLists.txt
index e118dcb71bf..2bed18bb9ed 100644
--- a/test/matrix/CMakeLists.txt
+++ b/test/matrix/CMakeLists.txt
@@ -1,2 +1,3 @@
 ginkgo_create_common_test(csr_kernels)
 ginkgo_create_common_test(dense_kernels)
+ginkgo_create_common_test(matrix)
\ No newline at end of file
diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp
new file mode 100644
index 00000000000..7d4b4a41fae
--- /dev/null
+++ b/test/matrix/matrix.cpp
@@ -0,0 +1,806 @@
+/*************************************************************
+Copyright (c) 2017-2022, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/ + +#include +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "core/test/utils.hpp" +#include "test/utils/executor.hpp" + + +#if GINKGO_DPCPP_SINGLE_MODE +using matrix_value_type = float; +#else +using matrix_value_type = double; +#endif // GINKGO_DPCPP_SINGLE_MODE + + +template +struct SimpleMatrixTest { + using matrix_type = MtxType; + + static bool preserves_zeros() { return true; } + + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec->get_master(), size); + } + + static void check_property(const std::unique_ptr&) {} +}; + +struct DenseWithDefaultStride + : SimpleMatrixTest> { + static bool preserves_zeros() { return false; } +}; + +struct DenseWithCustomStride : DenseWithDefaultStride { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, size[0] + 10); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_EQ(mtx->get_stride(), mtx->get_size()[0] + 10); + } +}; + +struct Coo : SimpleMatrixTest> {}; + +struct CsrWithDefaultStrategy + : SimpleMatrixTest> {}; + + +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \ + defined(GKO_COMPILING_DPCPP) + + +struct CsrWithClassicalStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, + std::make_shared()); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_TRUE(dynamic_cast( + mtx->get_strategy().get())); + } +}; + +struct CsrWithMergePathStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, + std::make_shared()); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_TRUE(dynamic_cast( + mtx->get_strategy().get())); + } +}; + +struct CsrWithSparselibStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, + std::make_shared()); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_TRUE(dynamic_cast( + mtx->get_strategy().get())); + } +}; + +struct CsrWithLoadBalanceStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, + std::make_shared( + gko::EXEC_TYPE::create(0, exec))); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_TRUE(dynamic_cast( + mtx->get_strategy().get())); + } +}; + +struct CsrWithAutomaticalStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, + std::make_shared( + gko::EXEC_TYPE::create(0, exec))); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_TRUE(dynamic_cast( + mtx->get_strategy().get())); + } +}; + + +#endif + + +struct Ell : SimpleMatrixTest> {}; + + +struct FbcsrBlocksize1 + : SimpleMatrixTest> { + static bool preserves_zeros() { return false; } + + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, 1); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_EQ(mtx->get_block_size(), 1); + 
} +}; + +struct FbcsrBlocksize2 + : SimpleMatrixTest> { + static bool preserves_zeros() { return false; } + + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, 2); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_EQ(mtx->get_block_size(), 2); + } +}; + + +struct SellpDefaultParameters + : SimpleMatrixTest> { + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_EQ(mtx->get_stride_factor(), 1); + ASSERT_EQ(mtx->get_slice_size(), 64); + } +}; + +struct Sellp32Factor2 + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 32, 2, 0); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_EQ(mtx->get_stride_factor(), 2); + ASSERT_EQ(mtx->get_slice_size(), 32); + } +}; + + +struct HybridDefaultStrategy + : SimpleMatrixTest> {}; + +struct HybridColumnLimitStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create( + exec, size, 0, std::make_shared(10)); + } + + static void check_property(const std::unique_ptr& mtx) + { + auto strategy = dynamic_cast( + mtx->get_strategy().get()); + ASSERT_TRUE(strategy); + ASSERT_EQ(strategy->get_num_columns(), 10); + } +}; + +struct HybridImbalanceLimitStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create( + exec, size, 0, std::make_shared(0.5)); + } + + static void check_property(const std::unique_ptr& mtx) + { + auto strategy = dynamic_cast( + mtx->get_strategy().get()); + ASSERT_TRUE(strategy); + ASSERT_EQ(strategy->get_percentage(), 0.5); + } +}; + +struct HybridImbalanceBoundedLimitStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create( + exec, size, 0, + std::make_shared(0.5, 0.01)); + } + + static void check_property(const std::unique_ptr& mtx) + { + auto strategy = + dynamic_cast( + mtx->get_strategy().get()); + ASSERT_TRUE(strategy); + ASSERT_EQ(strategy->get_percentage(), 0.5); + ASSERT_EQ(strategy->get_ratio(), 0.01); + } +}; + +struct HybridMinStorageStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create( + exec, size, 0, + std::make_shared()); + } + + static void check_property(const std::unique_ptr& mtx) + { + auto strategy = dynamic_cast( + mtx->get_strategy().get()); + ASSERT_TRUE(strategy); + } +}; + +struct HybridAutomaticStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, + std::make_shared()); + } + + static void check_property(const std::unique_ptr& mtx) + { + auto strategy = dynamic_cast( + mtx->get_strategy().get()); + ASSERT_TRUE(strategy); + } +}; + + +template +struct test_pair { + std::unique_ptr ref; + std::unique_ptr dev; + + test_pair(std::unique_ptr ref_obj, + std::shared_ptr exec) + : ref{std::move(ref_obj)}, dev{gko::clone(exec, ref)} + {} + + test_pair(std::unique_ptr ref_obj, + std::unique_ptr dev_obj) + : ref{std::move(ref_obj)}, dev{std::move(dev_obj)} + {} +}; + + +template +class Matrix : public ::testing::Test { +protected: + using Config = T; + using Mtx = typename T::matrix_type; + using index_type = typename Mtx::index_type; + using value_type = typename 
Mtx::value_type; + using mixed_value_type = gko::next_precision; + using Vec = gko::matrix::Dense; + using MixedVec = gko::matrix::Dense; + + Matrix() : rand_engine(15) {} + + void SetUp() + { + ref = gko::ReferenceExecutor::create(); + init_executor(ref, exec); + } + + void TearDown() + { + if (exec != nullptr) { + ASSERT_NO_THROW(exec->synchronize()); + } + } + + template + gko::matrix_data gen_mtx_data(int num_rows, + int num_cols, + DistType dist) + { + return gko::test::generate_random_matrix_data( + num_rows, num_cols, dist, std::normal_distribution<>(0.0, 1.0), + rand_engine); + } + + gko::matrix_data gen_mtx_data(int num_rows, + int num_cols, + int min_cols, + int max_cols) + { + return gen_mtx_data( + num_rows, num_cols, + std::uniform_int_distribution<>(min_cols, max_cols)); + } + + template + gko::matrix_data gen_dense_data(gko::dim<2> size) + { + return { + size, + std::normal_distribution>(0.0, 1.0), + rand_engine}; + } + + template + test_pair gen_in_vec(const test_pair& mtx, int nrhs, + int stride) + { + auto size = gko::dim<2>{mtx.ref->get_size()[1], + static_cast(nrhs)}; + auto result = VecType::create(ref, size, stride); + result->read(gen_dense_data(size)); + return {std::move(result), exec}; + } + + template + test_pair gen_scalar() + { + return {gko::initialize( + {gko::test::detail::get_rand_value< + typename VecType::value_type>( + std::normal_distribution< + gko::remove_complex>( + 0.0, 1.0), + rand_engine)}, + ref), + exec}; + } + + template + test_pair gen_out_vec(const test_pair& mtx, int nrhs, + int stride) + { + auto size = gko::dim<2>{mtx.ref->get_size()[0], + static_cast(nrhs)}; + auto result = VecType::create(ref, size, stride); + result->read(gen_dense_data(size)); + return {std::move(result), exec}; + } + + double tol() { return r::value; } + + double mixed_tol() { return r_mixed(); } + + template + void forall_matrix_data_scenarios(TestFunction fn) + { + auto guarded_fn = [&](auto mtx) { + try { + fn(std::move(mtx)); + } catch (std::exception& e) { + FAIL() << e.what(); + } + }; + { + SCOPED_TRACE("Zero matrix (0x0)"); + guarded_fn(gen_mtx_data(0, 0, 0, 0)); + } + { + SCOPED_TRACE("Zero matrix (0x2)"); + guarded_fn(gen_mtx_data(0, 2, 0, 0)); + } + { + SCOPED_TRACE("Zero matrix (2x0)"); + guarded_fn(gen_mtx_data(2, 0, 0, 0)); + } + { + SCOPED_TRACE("Zero matrix (200x100)"); + guarded_fn(gen_mtx_data(200, 100, 0, 0)); + } + { + SCOPED_TRACE("Sparse Matrix with some zeros rows (200x100)"); + guarded_fn(gen_mtx_data(200, 100, 0, 50)); + } + { + SCOPED_TRACE("Sparse Matrix with fixed row nnz (200x100)"); + guarded_fn(gen_mtx_data(200, 100, 50, 50)); + } + { + SCOPED_TRACE("Sparse Matrix with variable row nnz (200x100)"); + guarded_fn(gen_mtx_data(200, 100, 10, 50)); + } + { + SCOPED_TRACE( + "Sparse Matrix with heavily imbalanced row nnz (200x100)"); + guarded_fn( + gen_mtx_data(200, 100, std::poisson_distribution<>{1.5})); + } + { + SCOPED_TRACE("Dense matrix (200x100)"); + guarded_fn(gen_mtx_data(200, 100, 100, 100)); + } + } + + template + void forall_matrix_scenarios(TestFunction fn) + { + auto guarded_fn = [&](auto mtx) { + try { + T::check_property(mtx.ref); + T::check_property(mtx.dev); + fn(std::move(mtx)); + } catch (std::exception& e) { + FAIL() << e.what(); + } + }; + { + SCOPED_TRACE("Uninitialized matrix (0x0)"); + guarded_fn(test_pair{T::create(ref, gko::dim<2>{}), + T::create(exec, gko::dim<2>{})}); + } + { + SCOPED_TRACE("Uninitialized matrix (0x2)"); + guarded_fn(test_pair{T::create(ref, gko::dim<2>{0, 2}), + T::create(exec, gko::dim<2>{0, 
2})}); + } + { + SCOPED_TRACE("Uninitialized matrix (2x0)"); + guarded_fn(test_pair{T::create(ref, gko::dim<2>{2, 0}), + T::create(exec, gko::dim<2>{2, 0})}); + } + forall_matrix_data_scenarios([&](auto data) { + test_pair pair{T::create(ref, data.size), + T::create(exec, data.size)}; + pair.dev->read(data); + pair.ref->read(data); + guarded_fn(std::move(pair)); + }); + } + + template + void forall_vector_scenarios(const test_pair& mtx, TestFunction fn) + { + auto guarded_fn = [&](auto b, auto x) { + try { + fn(std::move(b), std::move(x)); + } catch (std::exception& e) { + FAIL() << e.what(); + } + }; + { + SCOPED_TRACE("Multivector with 0 columns"); + guarded_fn(gen_in_vec(mtx, 0, 0), + gen_out_vec(mtx, 0, 0)); + } + { + SCOPED_TRACE("Single vector"); + guarded_fn(gen_in_vec(mtx, 1, 1), + gen_out_vec(mtx, 1, 1)); + } + { + SCOPED_TRACE("Single strided vector"); + guarded_fn(gen_in_vec(mtx, 1, 2), + gen_out_vec(mtx, 1, 3)); + } + if (!gko::is_complex()) { + // check application of real matrix to complex vector + // viewed as interleaved real/imag vector + using complex_vec = gko::to_complex; + { + SCOPED_TRACE("Single strided complex vector"); + guarded_fn(gen_in_vec(mtx, 1, 2), + gen_out_vec(mtx, 1, 3)); + } + { + SCOPED_TRACE("Strided complex multivector with 2 columns"); + guarded_fn(gen_in_vec(mtx, 2, 3), + gen_out_vec(mtx, 2, 4)); + } + } + { + SCOPED_TRACE("Multivector with 2 columns"); + guarded_fn(gen_in_vec(mtx, 2, 2), + gen_out_vec(mtx, 2, 2)); + } + { + SCOPED_TRACE("Strided multivector with 2 columns"); + guarded_fn(gen_in_vec(mtx, 2, 3), + gen_out_vec(mtx, 2, 4)); + } + { + SCOPED_TRACE("Multivector with 40 columns"); + guarded_fn(gen_in_vec(mtx, 40, 40), + gen_out_vec(mtx, 40, 40)); + } + { + SCOPED_TRACE("Strided multivector with 40 columns"); + guarded_fn(gen_in_vec(mtx, 40, 43), + gen_out_vec(mtx, 40, 45)); + } + } + + std::shared_ptr ref; + std::shared_ptr exec; + + std::ranlux48 rand_engine; +}; + +using MatrixTypes = ::testing::Types< + DenseWithDefaultStride, DenseWithCustomStride, Coo, CsrWithDefaultStrategy, + // The strategies have issues with zero rows + /* + #if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \ + defined(GKO_COMPILING_DPCPP) + CsrWithClassicalStrategy, CsrWithMergePathStrategy, + CsrWithSparselibStrategy, CsrWithLoadBalanceStrategy, + CsrWithAutomaticalStrategy, + #endif + */ + Ell, + // Fbcsr is slightly broken + /*FbcsrBlocksize1, FbcsrBlocksize2,*/ + SellpDefaultParameters, Sellp32Factor2, HybridDefaultStrategy, + HybridColumnLimitStrategy, HybridImbalanceLimitStrategy, + HybridImbalanceBoundedLimitStrategy, HybridMinStorageStrategy, + HybridAutomaticStrategy>; + +TYPED_TEST_SUITE(Matrix, MatrixTypes, TypenameNameGenerator); + + +TYPED_TEST(Matrix, SpMVIsEquivalentToRef) +{ + this->forall_matrix_scenarios([&](auto mtx) { + this->forall_vector_scenarios(mtx, [&](auto b, auto x) { + mtx.ref->apply(b.ref.get(), x.ref.get()); + mtx.dev->apply(b.dev.get(), x.dev.get()); + + GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->tol()); + }); + }); +} + + +TYPED_TEST(Matrix, AdvancedSpMVIsEquivalentToRef) +{ + this->forall_matrix_scenarios([&](auto mtx) { + this->forall_vector_scenarios(mtx, [&](auto b, auto x) { + auto alpha = this->gen_scalar(); + auto beta = this->gen_scalar(); + + mtx.ref->apply(alpha.ref.get(), b.ref.get(), alpha.ref.get(), + x.ref.get()); + mtx.dev->apply(alpha.dev.get(), b.dev.get(), alpha.dev.get(), + x.dev.get()); + + GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->tol()); + }); + }); +} + + +#if !(GINKGO_DPCPP_SINGLE_MODE) 
+TYPED_TEST(Matrix, MixedSpMVIsEquivalentToRef) +{ + using MixedVec = typename TestFixture::MixedVec; + this->forall_matrix_scenarios([&](auto mtx) { + this->template forall_vector_scenarios( + mtx, [&](auto b, auto x) { + mtx.ref->apply(b.ref.get(), x.ref.get()); + mtx.dev->apply(b.dev.get(), x.dev.get()); + + GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->mixed_tol()); + }); + }); +} + + +TYPED_TEST(Matrix, MixedAdvancedSpMVIsEquivalentToRef) +{ + using MixedVec = typename TestFixture::MixedVec; + this->forall_matrix_scenarios([&](auto mtx) { + this->template forall_vector_scenarios( + mtx, [&](auto b, auto x) { + auto alpha = this->template gen_scalar(); + auto beta = this->template gen_scalar(); + + mtx.ref->apply(alpha.ref.get(), b.ref.get(), alpha.ref.get(), + x.ref.get()); + mtx.dev->apply(alpha.dev.get(), b.dev.get(), alpha.dev.get(), + x.dev.get()); + + GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->mixed_tol()); + }); + }); +} +#endif + + +TYPED_TEST(Matrix, ConvertToCsrIsEquivalentToRef) +{ + using Mtx = typename TestFixture::Mtx; + using Csr = + gko::matrix::Csr; + this->forall_matrix_scenarios([&](auto mtx) { + auto ref_result = Csr::create(this->ref); + auto dev_result = Csr::create(this->exec); + + mtx.ref->convert_to(ref_result.get()); + mtx.dev->convert_to(dev_result.get()); + + GKO_ASSERT_MTX_NEAR(ref_result, dev_result, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(ref_result, dev_result); + }); +} + + +TYPED_TEST(Matrix, ConvertFromCsrIsEquivalentToRef) +{ + using TestConfig = typename TestFixture::Config; + using Mtx = typename TestFixture::Mtx; + using Csr = + gko::matrix::Csr; + this->forall_matrix_data_scenarios([&](auto data) { + auto ref_src = Csr::create(this->ref); + auto dev_src = Csr::create(this->exec); + ref_src->read(data); + dev_src->read(data); + auto ref_result = TestConfig::create(this->ref, data.size); + auto dev_result = TestConfig::create(this->exec, data.size); + + ref_src->convert_to(ref_result.get()); + dev_src->convert_to(dev_result.get()); + + GKO_ASSERT_MTX_NEAR(ref_result, dev_result, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(ref_result, dev_result); + }); +} + + +TYPED_TEST(Matrix, ConvertToDenseIsEquivalentToRef) +{ + using Mtx = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + this->forall_matrix_scenarios([&](auto mtx) { + auto ref_result = Dense::create(this->ref); + auto dev_result = Dense::create(this->exec); + + mtx.ref->convert_to(ref_result.get()); + mtx.dev->convert_to(dev_result.get()); + + GKO_ASSERT_MTX_NEAR(ref_result, dev_result, 0.0); + }); +} + + +TYPED_TEST(Matrix, ConvertFromDenseIsEquivalentToRef) +{ + using TestConfig = typename TestFixture::Config; + using Mtx = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + this->forall_matrix_data_scenarios([&](auto data) { + auto ref_src = Dense::create(this->ref); + auto dev_src = Dense::create(this->exec); + ref_src->read(data); + dev_src->read(data); + auto ref_result = TestConfig::create(this->ref, data.size); + auto dev_result = TestConfig::create(this->exec, data.size); + + ref_src->convert_to(ref_result.get()); + dev_src->convert_to(dev_result.get()); + + GKO_ASSERT_MTX_NEAR(ref_result, dev_result, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(ref_result, dev_result); + }); +} + + +TYPED_TEST(Matrix, ReadWriteRoundtrip) +{ + using TestConfig = typename TestFixture::Config; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + this->forall_matrix_data_scenarios([&](auto data) { + auto new_mtx = TestConfig::create(this->exec, 
data.size);
+        gko::matrix_data<value_type, index_type> out_data;
+
+        new_mtx->read(data);
+        new_mtx->write(out_data);
+
+        if (!TestConfig::preserves_zeros()) {
+            data.remove_zeros();
+            out_data.remove_zeros();
+        }
+        ASSERT_EQ(data.size, out_data.size);
+        ASSERT_EQ(data.nonzeros, out_data.nonzeros);
+    });
+}
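
Note on the advanced apply used in the tests above: Ginkgo's apply(alpha, b, beta, x) computes x = alpha * A * b + beta * x. In AdvancedSpMVIsEquivalentToRef and MixedAdvancedSpMVIsEquivalentToRef, a `beta` scalar is generated but `alpha` is passed in both scaling positions, so the beta path is never exercised (the reference/device comparison still holds, since both sides issue the same call). Below is a minimal standalone sketch of the convention with a distinct beta; it is not part of the patch, and the executor choice, matrix values, and printing are purely illustrative.

// Standalone illustration (not part of the patch above): advanced apply
// computes x = alpha * A * b + beta * x. All values are made up.
#include <iostream>

#include <ginkgo/ginkgo.hpp>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    using Dense = gko::matrix::Dense<double>;
    // A = [[1, 2], [3, 4]], b = [1, 1], x = [10, 10]
    auto A = gko::initialize<Dense>({{1.0, 2.0}, {3.0, 4.0}}, exec);
    auto b = gko::initialize<Dense>({1.0, 1.0}, exec);
    auto x = gko::initialize<Dense>({10.0, 10.0}, exec);
    auto alpha = gko::initialize<Dense>({2.0}, exec);
    auto beta = gko::initialize<Dense>({-1.0}, exec);

    // x = 2 * [3, 7] + (-1) * [10, 10] = [-4, 4]
    A->apply(alpha.get(), b.get(), beta.get(), x.get());

    gko::write(std::cout, x.get());  // print the updated x for inspection
}

Applied to the tests above, the presumably intended calls would pass beta.ref.get() / beta.dev.get() as the third apply argument instead of alpha.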