diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
index 7c8cb7f7a5f..853a694ab9c 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -1046,3 +1046,60 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute_kernel(
         out_vals[out_begin + i] = in_vals[in_begin + i];
     }
 }
+
+
+namespace kernel {
+
+
+template <typename ValueType, typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals(
+        const size_type num_rows, const size_type num_cols,
+        const size_type num_nnz, const size_type row_offset,
+        const size_type col_offset,
+        const IndexType* __restrict__ source_row_ptrs,
+        const IndexType* __restrict__ source_col_idxs,
+        const ValueType* __restrict__ source_values,
+        const IndexType* __restrict__ result_row_ptrs,
+        IndexType* __restrict__ result_col_idxs,
+        ValueType* __restrict__ result_values)
+{
+    const auto res_row = thread::get_thread_id_flat();
+    if (res_row < num_rows) {
+        const auto src_row = res_row + row_offset;
+        auto res_nnz = result_row_ptrs[res_row];
+        for (auto nnz = source_row_ptrs[src_row];
+             nnz < source_row_ptrs[src_row + 1]; ++nnz) {
+            const auto res_col = source_col_idxs[nnz] - col_offset;
+            if (res_col < num_cols && res_col >= 0) {
+                result_col_idxs[res_nnz] = res_col;
+                result_values[res_nnz] = source_values[nnz];
+                res_nnz++;
+            }
+        }
+    }
+}
+
+
+template <typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void calculate_nnz_per_row_in_span(
+        const span row_span, const span col_span,
+        const IndexType* __restrict__ row_ptrs,
+        const IndexType* __restrict__ col_idxs,
+        IndexType* __restrict__ nnz_per_row)
+{
+    const auto src_row = thread::get_thread_id_flat() + row_span.begin;
+    if (src_row < row_span.end) {
+        IndexType nnz{};
+        for (auto i = row_ptrs[src_row]; i < row_ptrs[src_row + 1]; ++i) {
+            if (col_idxs[i] >= col_span.begin && col_idxs[i] < col_span.end) {
+                nnz++;
+            }
+        }
+        nnz_per_row[src_row - row_span.begin] = nnz;
+    }
+}
+
+
+}  // namespace kernel
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 5d55806c8ee..df8cfd05346 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -433,6 +433,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
 
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType)
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index 1c8ab720900..ae683e11813 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "core/components/absolute_array.hpp"
 #include "core/components/fill_array.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_kernels.hpp"
 
 
@@ -69,6 +70,9 @@ GKO_REGISTER_OPERATION(convert_to_sellp, csr::convert_to_sellp);
 GKO_REGISTER_OPERATION(calculate_total_cols, csr::calculate_total_cols);
 GKO_REGISTER_OPERATION(convert_to_ell, csr::convert_to_ell);
 GKO_REGISTER_OPERATION(convert_to_hybrid, csr::convert_to_hybrid);
+GKO_REGISTER_OPERATION(calculate_nonzeros_per_row_in_span,
+                       csr::calculate_nonzeros_per_row_in_span);
+GKO_REGISTER_OPERATION(compute_submatrix, csr::compute_submatrix);
 GKO_REGISTER_OPERATION(transpose, csr::transpose);
 GKO_REGISTER_OPERATION(conj_transpose, csr::conj_transpose);
 GKO_REGISTER_OPERATION(inv_symm_permute, csr::inv_symm_permute);
@@ -85,6 +89,7 @@ GKO_REGISTER_OPERATION(is_sorted_by_column_index,
                        csr::is_sorted_by_column_index);
 GKO_REGISTER_OPERATION(extract_diagonal, csr::extract_diagonal);
 GKO_REGISTER_OPERATION(fill_array, components::fill_array);
+GKO_REGISTER_OPERATION(prefix_sum, components::prefix_sum);
 GKO_REGISTER_OPERATION(inplace_absolute_array,
                        components::inplace_absolute_array);
 GKO_REGISTER_OPERATION(outplace_absolute_array,
@@ -536,6 +541,31 @@ bool Csr<ValueType, IndexType>::is_sorted_by_column_index() const
 }
 
 
+template <typename ValueType, typename IndexType>
+std::unique_ptr<Csr<ValueType, IndexType>>
+Csr<ValueType, IndexType>::create_submatrix(const gko::span& row_span,
+                                            const gko::span& column_span) const
+{
+    using Mat = Csr<ValueType, IndexType>;
+    auto exec = this->get_executor();
+    auto sub_mat_size = gko::dim<2>(row_span.length(), column_span.length());
+    Array<IndexType> row_ptrs(exec, row_span.length() + 1);
+    exec->run(csr::make_calculate_nonzeros_per_row_in_span(
+        this, row_span, column_span, &row_ptrs));
+    exec->run(csr::make_prefix_sum(row_ptrs.get_data(), row_span.length() + 1));
+    auto num_nnz =
+        exec->copy_val_to_host(row_ptrs.get_data() + sub_mat_size[0]);
+    auto sub_mat = Mat::create(exec, sub_mat_size,
+                               std::move(Array<ValueType>(exec, num_nnz)),
+                               std::move(Array<IndexType>(exec, num_nnz)),
+                               std::move(row_ptrs), this->get_strategy());
+    exec->run(csr::make_compute_submatrix(this, row_span, column_span,
+                                          sub_mat.get()));
+    sub_mat->make_srow();
+    return sub_mat;
+}
+
+
 template <typename ValueType, typename IndexType>
 std::unique_ptr<Diagonal<ValueType>>
 Csr<ValueType, IndexType>::extract_diagonal() const
diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp
index 0e6a7346b7c..13f238e58a7 100644
--- a/core/matrix/csr_kernels.hpp
+++ b/core/matrix/csr_kernels.hpp
@@ -174,6 +174,18 @@ namespace kernels {
         const matrix::Csr<ValueType, IndexType>* source,             \
         Array<size_type>* result)
 
+#define GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType)  \
+    void calculate_nonzeros_per_row_in_span(                                   \
+        std::shared_ptr<const DefaultExecutor> exec,                           \
+        const matrix::Csr<ValueType, IndexType>* source, const span& row_span, \
+        const span& col_span, Array<IndexType>* row_nnz)
+
+#define GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType)     \
+    void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,     \
+                           const matrix::Csr<ValueType, IndexType>* source, \
+                           gko::span row_span, gko::span col_span,          \
+                           matrix::Csr<ValueType, IndexType>* result)
+
 #define GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType)         \
     void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec, \
                               matrix::Csr<ValueType, IndexType>* to_sort)
@@ -240,6 +252,10 @@ namespace kernels {
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \
     template <typename ValueType, typename IndexType>                        \
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType);   \
+    template <typename ValueType, typename IndexType>                        \
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType);         \
+    template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType);              \
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX(ValueType, IndexType);         \
diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
index 00e6c15039d..f7c3d7c385c 100644
--- a/cuda/matrix/csr_kernels.cu
+++ b/cuda/matrix/csr_kernels.cu
@@ -1240,6 +1240,55 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_span(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
+    const span& col_span, Array<IndexType>* row_nnz)
+{
+    const auto num_rows = source->get_size()[0];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto col_idxs = source->get_const_col_idxs();
+    auto grid_dim = ceildiv(row_span.length(), default_block_size);
+
+    kernel::calculate_nnz_per_row_in_span<<<grid_dim, default_block_size>>>(
+        row_span, col_span, as_cuda_type(row_ptrs), as_cuda_type(col_idxs),
+        as_cuda_type(row_nnz->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType>* source,
+                       gko::span row_span, gko::span col_span,
+                       matrix::Csr<ValueType, IndexType>* result)
+{
+    auto row_offset = row_span.begin;
+    auto col_offset = col_span.begin;
+    auto num_rows = result->get_size()[0];
+    auto num_cols = result->get_size()[1];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+
+    auto num_nnz = source->get_num_stored_elements();
+    grid_dim = ceildiv(num_nnz, default_block_size);
+    kernel::compute_submatrix_idxs_and_vals<<<grid_dim, default_block_size>>>(
+        num_rows, num_cols, num_nnz, row_offset, col_offset,
+        as_cuda_type(source->get_const_row_ptrs()),
+        as_cuda_type(source->get_const_col_idxs()),
+        as_cuda_type(source->get_const_values()),
+        as_cuda_type(result->get_const_row_ptrs()),
+        as_cuda_type(result->get_col_idxs()),
+        as_cuda_type(result->get_values()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
                        const matrix::Csr<ValueType, IndexType>* source,
diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp
index 0847ac2235b..26ee15a0ae0 100644
--- a/cuda/test/matrix/csr_kernels.cpp
+++ b/cuda/test/matrix/csr_kernels.cpp
@@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "cuda/test/utils.hpp"
@@ -100,6 +101,14 @@ class Csr : public ::testing::Test {
             std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
     }
 
+    void set_up_apply_data()
+    {
+        mtx2 = Mtx::create(ref);
+        mtx2->copy_from(gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 5));
+        dmtx2 = Mtx::create(cuda);
+        dmtx2->copy_from(mtx2.get());
+    }
+
     void set_up_apply_data(std::shared_ptr<Mtx::strategy_type> strategy,
                            int num_vectors = 1)
     {
@@ -147,6 +156,7 @@ class Csr : public ::testing::Test {
         dmtx->copy_from(mtx.get());
     }
 
+
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
 
@@ -154,6 +164,7 @@ class Csr : public ::testing::Test {
     std::ranlux48 rand_engine;
 
     std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> mtx2;
     std::unique_ptr<ComplexMtx> complex_mtx;
     std::unique_ptr<Mtx> square_mtx;
     std::unique_ptr<Vec> expected;
@@ -162,6 +173,7 @@ class Csr : public ::testing::Test {
     std::unique_ptr<Vec> beta;
 
     std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Mtx> dmtx2;
     std::unique_ptr<ComplexMtx> complex_dmtx;
     std::unique_ptr<Mtx> square_dmtx;
     std::unique_ptr<Vec> dresult;
@@ -926,4 +938,74 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    auto drow_nnz = gko::Array<int>(this->cuda, row_nnz);
+
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::cuda::csr::calculate_nonzeros_per_row_in_span(
+        this->cuda, this->dmtx2.get(), rspan, cspan, &drow_nnz);
+
+    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
+}
+
+
+TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    using IndexType = int;
+    using ValueType = double;
+    set_up_apply_data();
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::reference::components::prefix_sum(
+        this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
+    auto num_nnz = row_nnz.get_data()[rspan.length()];
+    auto drow_nnz = gko::Array<int>(this->cuda, row_nnz);
+    auto smat1 =
+        Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->ref, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->ref, num_nnz)),
+                    std::move(row_nnz));
+    auto sdmat1 =
+        Mtx::create(this->cuda, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->cuda, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->cuda, num_nnz)),
+                    std::move(drow_nnz));
+
+
+    gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(),
+                                                    rspan, cspan, smat1.get());
+    gko::kernels::cuda::csr::compute_submatrix(this->cuda, this->dmtx2.get(),
+                                               rspan, cspan, sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
+}
+
+
+TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+    gko::span rspan{47, 81};
+    gko::span cspan{2, 31};
+
+    auto smat1 = this->mtx2->create_submatrix(rspan, cspan);
+    auto sdmat1 = this->dmtx2->create_submatrix(rspan, cspan);
+
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
+}
+
+
 }  // namespace
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 7435ff35221..e8df9d6c675 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -1625,6 +1625,113 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
 
+namespace kernel {
+
+
+template <typename IndexType>
+void calc_nnz_in_span(const span row_span, const span col_span,
+                      const IndexType* __restrict__ row_ptrs,
+                      const IndexType* __restrict__ col_idxs,
+                      IndexType* __restrict__ nnz_per_row,
+                      sycl::nd_item<3> item_ct1)
+{
+    const auto tidx = thread::get_thread_id_flat(item_ct1) + row_span.begin;
+    if (tidx < row_span.end) {
+        nnz_per_row[tidx - row_span.begin] = zero<IndexType>();
+        for (size_type col = row_ptrs[tidx]; col < row_ptrs[tidx + 1]; ++col) {
+            if (col_idxs[col] >= col_span.begin &&
+                col_idxs[col] < col_span.end) {
+                nnz_per_row[tidx - row_span.begin]++;
+            }
+        }
+    }
+}
+
+GKO_ENABLE_DEFAULT_HOST(calc_nnz_in_span, calc_nnz_in_span);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix_idxs_and_vals(size_type num_rows, size_type num_cols,
+                                     size_type num_nnz, size_type row_offset,
+                                     size_type col_offset,
+                                     const IndexType* __restrict__ src_row_ptrs,
+                                     const IndexType* __restrict__ src_col_idxs,
+                                     const ValueType* __restrict__ src_values,
+                                     const IndexType* __restrict__ res_row_ptrs,
+                                     IndexType* __restrict__ res_col_idxs,
+                                     ValueType* __restrict__ res_values,
+                                     sycl::nd_item<3> item_ct1)
+{
+    const auto tidx = thread::get_thread_id_flat(item_ct1);
+    if (tidx < num_rows) {
+        size_type res_nnz = res_row_ptrs[tidx];
+        for (size_type nnz = src_row_ptrs[row_offset + tidx];
+             nnz < src_row_ptrs[row_offset + tidx + 1]; ++nnz) {
+            if ((src_col_idxs[nnz] < (col_offset + num_cols) &&
+                 src_col_idxs[nnz] >= col_offset)) {
+                res_col_idxs[res_nnz] = src_col_idxs[nnz] - col_offset;
+                res_values[res_nnz] = src_values[nnz];
+                res_nnz++;
+            }
+        }
+    }
+}
+
+GKO_ENABLE_DEFAULT_HOST(compute_submatrix_idxs_and_vals,
+                        compute_submatrix_idxs_and_vals);
+
+
+}  // namespace kernel
+
+
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_span(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
+    const span& col_span, Array<IndexType>* row_nnz)
+{
+    const auto num_rows = source->get_size()[0];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto col_idxs = source->get_const_col_idxs();
+    auto grid_dim = ceildiv(row_span.length(), default_block_size);
+    auto block_dim = default_block_size;
+
+    kernel::calc_nnz_in_span(grid_dim, block_dim, 0, exec->get_queue(),
+                             row_span, col_span, row_ptrs, col_idxs,
+                             row_nnz->get_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType>* source,
+                       gko::span row_span, gko::span col_span,
+                       matrix::Csr<ValueType, IndexType>* result)
+{
+    const auto row_offset = row_span.begin;
+    const auto col_offset = col_span.begin;
+    const auto num_rows = result->get_size()[0];
+    const auto num_cols = result->get_size()[1];
+    const auto row_ptrs = source->get_const_row_ptrs();
+
+    const auto num_nnz = source->get_num_stored_elements();
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+    auto block_dim = default_block_size;
+    kernel::compute_submatrix_idxs_and_vals(
+        grid_dim, block_dim, 0, exec->get_queue(), num_rows, num_cols, num_nnz,
+        row_offset, col_offset, source->get_const_row_ptrs(),
+        source->get_const_col_idxs(), source->get_const_values(),
+        result->get_const_row_ptrs(), result->get_col_idxs(),
+        result->get_values());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+
+
 namespace {
 
 
diff --git a/dpcpp/test/matrix/csr_kernels.cpp b/dpcpp/test/matrix/csr_kernels.cpp
index b746d752db6..0ddcf4d84ca 100644
--- a/dpcpp/test/matrix/csr_kernels.cpp
+++ b/dpcpp/test/matrix/csr_kernels.cpp
@@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
@@ -102,6 +103,14 @@ class Csr : public ::testing::Test {
             std::normal_distribution<vtype>(-1.0, 1.0), rand_engine, ref);
     }
 
+    void set_up_apply_data()
+    {
+        mtx2 = Mtx::create(ref);
+        mtx2->copy_from(gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 5));
+        dmtx2 = Mtx::create(dpcpp);
+        dmtx2->copy_from(mtx2.get());
+    }
+
     void set_up_apply_data(std::shared_ptr<Mtx::strategy_type> strategy,
                            int num_vectors = 1)
     {
@@ -156,6 +165,7 @@ class Csr : public ::testing::Test {
     std::ranlux48 rand_engine;
 
     std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> mtx2;
     std::unique_ptr<ComplexMtx> complex_mtx;
     std::unique_ptr<Mtx> square_mtx;
     std::unique_ptr<Vec> expected;
@@ -164,6 +174,7 @@ class Csr : public ::testing::Test {
     std::unique_ptr<Vec> beta;
 
     std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Mtx> dmtx2;
     std::unique_ptr<ComplexMtx> complex_dmtx;
     std::unique_ptr<Mtx> square_dmtx;
     std::unique_ptr<Vec> dresult;
@@ -930,4 +941,75 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<vtype, int>;
+    set_up_apply_data();
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    auto drow_nnz = gko::Array<int>(this->dpcpp, row_nnz);
+
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::dpcpp::csr::calculate_nonzeros_per_row_in_span(
+        this->dpcpp, this->dmtx2.get(), rspan, cspan, &drow_nnz);
+
+    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
+}
+
+
+TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<vtype, int>;
+    using IndexType = int;
+    using ValueType = vtype;
+    set_up_apply_data();
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    row_nnz.fill(gko::zero<int>());
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::reference::components::prefix_sum(
+        this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
+    auto num_nnz = row_nnz.get_data()[rspan.length()];
+    auto drow_nnz = gko::Array<int>(this->dpcpp, row_nnz);
+    auto smat1 =
+        Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->ref, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->ref, num_nnz)),
+                    std::move(row_nnz));
+    auto sdmat1 =
+        Mtx::create(this->dpcpp, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->dpcpp, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->dpcpp, num_nnz)),
+                    std::move(drow_nnz));
+
+
+    gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(),
+                                                    rspan, cspan, smat1.get());
+    gko::kernels::dpcpp::csr::compute_submatrix(this->dpcpp, this->dmtx2.get(),
+                                                rspan, cspan, sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
+}
+
+
+TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<vtype>;
+    set_up_apply_data();
+    gko::span rspan{47, 81};
+    gko::span cspan{2, 31};
+
+    auto smat1 = this->mtx2->create_submatrix(rspan, cspan);
+    auto sdmat1 = this->dmtx2->create_submatrix(rspan, cspan);
+
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
+}
+
+
 }  // namespace
diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp
index be28c3a471b..2fab9bf4e74 100644
--- a/hip/matrix/csr_kernels.hip.cpp
+++ b/hip/matrix/csr_kernels.hip.cpp
@@ -1082,6 +1082,56 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_span(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
+    const span& col_span, Array<IndexType>* row_nnz)
+{
+    const auto num_rows = source->get_size()[0];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto col_idxs = source->get_const_col_idxs();
+    auto grid_dim = ceildiv(row_span.length(), default_block_size);
+
+    hipLaunchKernelGGL(kernel::calculate_nnz_per_row_in_span, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, row_span, col_span,
+                       as_hip_type(row_ptrs), as_hip_type(col_idxs),
+                       as_hip_type(row_nnz->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType>* source,
+                       gko::span row_span, gko::span col_span,
+                       matrix::Csr<ValueType, IndexType>* result)
+{
+    auto row_offset = row_span.begin;
+    auto col_offset = col_span.begin;
+    auto num_rows = result->get_size()[0];
+    auto num_cols = result->get_size()[1];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+
+    auto num_nnz = source->get_num_stored_elements();
+    grid_dim = ceildiv(num_nnz, default_block_size);
+    hipLaunchKernelGGL(
+        kernel::compute_submatrix_idxs_and_vals, dim3(grid_dim),
+        dim3(default_block_size), 0, 0, num_rows, num_cols, num_nnz, row_offset,
+        col_offset, as_hip_type(source->get_const_row_ptrs()),
+        as_hip_type(source->get_const_col_idxs()),
+        as_hip_type(source->get_const_values()),
+        as_hip_type(result->get_const_row_ptrs()),
+        as_hip_type(result->get_col_idxs()), as_hip_type(result->get_values()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const HipExecutor> exec,
                        const matrix::Csr<ValueType, IndexType>* source,
diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp
index 6948d2b7320..311aa01cafd 100644
--- a/hip/test/matrix/csr_kernels.hip.cpp
+++ b/hip/test/matrix/csr_kernels.hip.cpp
@@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "hip/test/utils.hip.hpp"
@@ -100,6 +101,14 @@ class Csr : public ::testing::Test {
             std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
     }
 
+    void set_up_apply_data()
+    {
+        mtx2 = Mtx::create(ref);
+        mtx2->copy_from(gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 5));
+        dmtx2 = Mtx::create(hip);
+        dmtx2->copy_from(mtx2.get());
+    }
+
     void set_up_apply_data(std::shared_ptr<Mtx::strategy_type> strategy,
                            int num_vectors = 1)
     {
@@ -154,6 +163,7 @@ class Csr : public ::testing::Test {
     std::ranlux48 rand_engine;
 
     std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> mtx2;
     std::unique_ptr<ComplexMtx> complex_mtx;
     std::unique_ptr<Mtx> square_mtx;
     std::unique_ptr<Vec> expected;
@@ -162,6 +172,7 @@ class Csr : public ::testing::Test {
     std::unique_ptr<Vec> beta;
 
     std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Mtx> dmtx2;
     std::unique_ptr<ComplexMtx> complex_dmtx;
     std::unique_ptr<Mtx> square_dmtx;
     std::unique_ptr<Vec> dresult;
@@ -937,4 +948,74 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    auto drow_nnz = gko::Array<int>(this->hip, row_nnz);
+
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::hip::csr::calculate_nonzeros_per_row_in_span(
+        this->hip, this->dmtx2.get(), rspan, cspan, &drow_nnz);
+
+    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
+}
+
+
+TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    using IndexType = int;
+    using ValueType = double;
+    set_up_apply_data();
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::reference::components::prefix_sum(
+        this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
+    auto num_nnz = row_nnz.get_data()[rspan.length()];
+    auto drow_nnz = gko::Array<int>(this->hip, row_nnz);
+    auto smat1 =
+        Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->ref, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->ref, num_nnz)),
+                    std::move(row_nnz));
+    auto sdmat1 =
+        Mtx::create(this->hip, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->hip, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->hip, num_nnz)),
+                    std::move(drow_nnz));
+
+
+    gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(),
+                                                    rspan, cspan, smat1.get());
+    gko::kernels::hip::csr::compute_submatrix(this->hip, this->dmtx2.get(),
+                                              rspan, cspan, sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
+}
+
+
+TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+    gko::span rspan{47, 81};
+    gko::span cspan{2, 31};
+
+    auto smat1 = this->mtx2->create_submatrix(rspan, cspan);
+    auto sdmat1 = this->dmtx2->create_submatrix(rspan, cspan);
+
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
+}
+
+
 }  // namespace
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index a9c9cc53255..02969dcc94d 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -763,6 +763,9 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
 
     std::unique_ptr<Diagonal<ValueType>> extract_diagonal() const override;
 
+    std::unique_ptr<Csr<ValueType, IndexType>> create_submatrix(
+        const gko::span& row_span, const gko::span& column_span) const;
+
     std::unique_ptr<absolute_type> compute_absolute() const override;
 
     void compute_absolute_inplace() override;
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 785a934c7c6..19f16c8b543 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -713,6 +713,63 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_span(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
+    const span& col_span, Array<IndexType>* row_nnz)
+{
+    const auto row_ptrs = source->get_const_row_ptrs();
+    const auto col_idxs = source->get_const_col_idxs();
+#pragma omp parallel for
+    for (size_type row = row_span.begin; row < row_span.end; ++row) {
+        row_nnz->get_data()[row - row_span.begin] = zero<IndexType>();
+        for (size_type nnz = row_ptrs[row]; nnz < row_ptrs[row + 1]; ++nnz) {
+            if (col_idxs[nnz] >= col_span.begin &&
+                col_idxs[nnz] < col_span.end) {
+                row_nnz->get_data()[row - row_span.begin]++;
+            }
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType>* source,
+                       gko::span row_span, gko::span col_span,
+                       matrix::Csr<ValueType, IndexType>* result)
+{
+    auto row_offset = row_span.begin;
+    auto col_offset = col_span.begin;
+    auto num_rows = result->get_size()[0];
+    auto num_cols = result->get_size()[1];
+    const auto row_ptrs = source->get_const_row_ptrs();
+    const auto col_idxs = source->get_const_col_idxs();
+    const auto values = source->get_const_values();
+    auto res_row_ptrs = result->get_row_ptrs();
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        size_type res_nnz = res_row_ptrs[row];
+        for (size_type nnz = row_ptrs[row_offset + row];
+             nnz < row_ptrs[row_offset + row + 1]; ++nnz) {
+            const auto local_col = col_idxs[nnz] - col_offset;
+            if (local_col >= 0 && local_col < num_cols) {
+                result->get_col_idxs()[res_nnz] = local_col;
+                result->get_values()[res_nnz] = values[nnz];
+                res_nnz++;
+            }
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const OmpExecutor> exec,
                        const matrix::Csr<ValueType, IndexType>* source,
diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp
index cea56e5e91f..d6712cfa7d6 100644
--- a/omp/test/matrix/csr_kernels.cpp
+++ b/omp/test/matrix/csr_kernels.cpp
@@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
@@ -106,6 +107,14 @@ class Csr : public ::testing::Test {
         return gen_mtx<MtxType>(num_rows, num_cols, min_nnz_row, num_cols);
     }
 
+    void set_up_mat_data()
+    {
+        mtx2 = Mtx::create(ref);
+        mtx2->copy_from(gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 5));
+        dmtx2 = Mtx::create(omp);
+        dmtx2->copy_from(mtx2.get());
+    }
+
     void set_up_apply_data(int num_vectors = 1)
     {
         mtx = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 1);
@@ -158,6 +167,7 @@ class Csr : public ::testing::Test {
     std::ranlux48 rand_engine;
 
     std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> mtx2;
     std::unique_ptr<ComplexMtx> complex_mtx;
     std::unique_ptr<Mtx> square_mtx;
     std::unique_ptr<Vec> expected;
@@ -166,6 +176,7 @@ class Csr : public ::testing::Test {
     std::unique_ptr<Vec> beta;
 
     std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Mtx> dmtx2;
     std::unique_ptr<ComplexMtx> complex_dmtx;
     std::unique_ptr<Mtx> square_dmtx;
     std::unique_ptr<Vec> dresult;
@@ -700,4 +711,75 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_mat_data();
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    row_nnz.fill(gko::zero<int>());
+    auto drow_nnz = gko::Array<int>(this->omp, row_nnz);
+
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::omp::csr::calculate_nonzeros_per_row_in_span(
+        this->omp, this->dmtx2.get(), rspan, cspan, &drow_nnz);
+
+    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
+}
+
+
+TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    using IndexType = int;
+    using ValueType = double;
+    set_up_mat_data();
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    row_nnz.fill(gko::zero<int>());
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::reference::components::prefix_sum(
+        this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
+    auto num_nnz = row_nnz.get_data()[rspan.length()];
+    auto drow_nnz = gko::Array<int>(this->omp, row_nnz);
+    auto smat1 =
+        Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->ref, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->ref, num_nnz)),
+                    std::move(row_nnz));
+    auto sdmat1 =
+        Mtx::create(this->omp, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->omp, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->omp, num_nnz)),
+                    std::move(drow_nnz));
+
+
+    gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(),
+                                                    rspan, cspan, smat1.get());
+    gko::kernels::omp::csr::compute_submatrix(this->omp, this->dmtx2.get(),
+                                              rspan, cspan, sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
+}
+
+
+TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
+{
+    set_up_mat_data();
+
+    gko::span rspan{36, 98};
+    gko::span cspan{26, 104};
+    auto smat1 = this->mtx2->create_submatrix(rspan, cspan);
+    auto sdmat1 = this->dmtx2->create_submatrix(rspan, cspan);
+
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
+}
+
+
 }  // namespace
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index 9538cdae4af..a8dbefa3c11 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -633,6 +633,64 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_span(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
+    const span& col_span, Array<IndexType>* row_nnz)
+{
+    size_type res_row = 0;
+    for (size_type row = row_span.begin; row < row_span.end; ++row) {
+        row_nnz->get_data()[res_row] = zero<IndexType>();
+        for (size_type nnz = source->get_const_row_ptrs()[row];
+             nnz < source->get_const_row_ptrs()[row + 1]; ++nnz) {
+            if (source->get_const_col_idxs()[nnz] < col_span.end &&
+                source->get_const_col_idxs()[nnz] >= col_span.begin) {
+                row_nnz->get_data()[res_row]++;
+            }
+        }
+        res_row++;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType>* source,
+                       gko::span row_span, gko::span col_span,
+                       matrix::Csr<ValueType, IndexType>* result)
+{
+    auto row_offset = row_span.begin;
+    auto col_offset = col_span.begin;
+    auto num_rows = result->get_size()[0];
+    auto num_cols = result->get_size()[1];
+    auto res_row_ptrs = result->get_row_ptrs();
+    auto res_col_idxs = result->get_col_idxs();
+    auto res_values = result->get_values();
+    const auto src_row_ptrs = source->get_const_row_ptrs();
+    const auto src_col_idxs = source->get_const_col_idxs();
+    const auto src_values = source->get_const_values();
+
+    size_type res_nnz = 0;
+    for (size_type nnz = 0; nnz < source->get_num_stored_elements(); ++nnz) {
+        if (nnz >= src_row_ptrs[row_offset] &&
+            nnz < src_row_ptrs[row_offset + num_rows] &&
+            (src_col_idxs[nnz] < (col_offset + num_cols) &&
+             src_col_idxs[nnz] >= col_offset)) {
+            res_col_idxs[res_nnz] = src_col_idxs[nnz] - col_offset;
+            res_values[res_nnz] = src_values[nnz];
+            res_nnz++;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const ReferenceExecutor> exec,
                        const matrix::Csr<ValueType, IndexType>* source,
diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp
index 1d4a3957084..216a4048e64 100644
--- a/reference/test/matrix/csr_kernels.cpp
+++ b/reference/test/matrix/csr_kernels.cpp
@@ -1624,4 +1624,101 @@ TYPED_TEST(CsrComplex, OutplaceAbsolute)
 }
 
 
+TYPED_TEST(Csr, CanGetSubmatrix)
+{
+    using Vec = typename TestFixture::Vec;
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    /* this->mtx
+     * 1   3   2
+     * 0   5   0
+     */
+    auto sub_mat =
+        this->mtx->create_submatrix(gko::span(0, 2), gko::span(0, 2));
+    auto ref =
+        gko::initialize<Mtx>({I<T>{1.0, 3.0}, I<T>{0.0, 5.0}}, this->exec);
+
+    GKO_ASSERT_MTX_NEAR(sub_mat.get(), ref.get(), 0.0);
+}
+
+
+TYPED_TEST(Csr, CanGetSubmatrix2)
+{
+    using Vec = typename TestFixture::Vec;
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto mat = gko::initialize<Mtx>(
+        {
+            I<T>{1.0, 3.0, 4.5, 0.0, 2.0},   // 0
+            I<T>{1.0, 0.0, 4.5, 7.5, 3.0},   // 1
+            I<T>{0.0, 3.0, 4.5, 0.0, 2.0},   // 2
+            I<T>{0.0, -1.0, 2.5, 0.0, 2.0},  // 3
+            I<T>{1.0, 0.0, -1.0, 3.5, 1.0},  // 4
+            I<T>{0.0, 1.0, 0.0, 0.0, 2.0},   // 5
+            I<T>{0.0, 3.0, 0.0, 7.5, 1.0}    // 6
+        },
+        this->exec);
+    ASSERT_EQ(mat->get_num_stored_elements(), 23);
+    {
+        auto sub_mat1 = mat->create_submatrix(gko::span(0, 2), gko::span(0, 2));
+        auto ref1 =
+            gko::initialize<Mtx>({I<T>{1.0, 3.0}, I<T>{1.0, 0.0}}, this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat1.get(), ref1.get(), 0.0);
+    }
+    {
+        auto sub_mat2 = mat->create_submatrix(gko::span(2, 4), gko::span(0, 2));
+        auto ref2 =
+            gko::initialize<Mtx>({I<T>{0.0, 3.0}, I<T>{0.0, -1.0}}, this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat2.get(), ref2.get(), 0.0);
+    }
+    {
+        auto sub_mat3 = mat->create_submatrix(gko::span(0, 2), gko::span(3, 5));
+        auto ref3 =
+            gko::initialize<Mtx>({I<T>{0.0, 2.0}, I<T>{7.5, 3.0}}, this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat3.get(), ref3.get(), 0.0);
+    }
+    {
+        auto sub_mat4 = mat->create_submatrix(gko::span(1, 6), gko::span(2, 4));
+        /*
+           4.5, 7.5
+           4.5, 0.0
+           2.5, 0.0
+          -1.0, 3.5
+           0.0, 0.0
+        */
+        auto ref4 = gko::initialize<Mtx>(
+            {I<T>{4.5, 7.5}, I<T>{4.5, 0.0}, I<T>{2.5, 0.0}, I<T>{-1.0, 3.5},
+             I<T>{0.0, 0.0}},
+            this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat4.get(), ref4.get(), 0.0);
+    }
+    {
+        auto sub_mat5 = mat->create_submatrix(gko::span(0, 7), gko::span(0, 5));
+        auto ref5 = gko::initialize<Mtx>(
+            {
+                I<T>{1.0, 3.0, 4.5, 0.0, 2.0},   // 0
+                I<T>{1.0, 0.0, 4.5, 7.5, 3.0},   // 1
+                I<T>{0.0, 3.0, 4.5, 0.0, 2.0},   // 2
+                I<T>{0.0, -1.0, 2.5, 0.0, 2.0},  // 3
+                I<T>{1.0, 0.0, -1.0, 3.5, 1.0},  // 4
+                I<T>{0.0, 1.0, 0.0, 0.0, 2.0},   // 5
+                I<T>{0.0, 3.0, 0.0, 7.5, 1.0}    // 6
+            },
+            this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat5.get(), ref5.get(), 0.0);
+    }
+    {
+        auto sub_mat7 = mat->create_submatrix(gko::span(0, 1), gko::span(0, 1));
+        auto ref7 = gko::initialize<Mtx>({I<T>{1.0}}, this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat7.get(), ref7.get(), 0.0);
+    }
+}
+
+
 }  // namespace