From 77ec16f77d54caa20db5f696e8a6b5aa7bc4092f Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 13 Sep 2021 12:12:56 +0200
Subject: [PATCH 01/14] Add a SubMatrixExtractable interface.

---
 include/ginkgo/core/base/lin_op.hpp | 14 ++++++++++++++
 include/ginkgo/core/matrix/csr.hpp  |  4 ++++
 2 files changed, 18 insertions(+)
diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp
index 540fd6ebad3..a43580c8d81 100644
--- a/include/ginkgo/core/base/lin_op.hpp
+++ b/include/ginkgo/core/base/lin_op.hpp
@@ -654,6 +654,20 @@ class Preconditionable {
 };
 
 
+/**
+ * A submatrix of the LinOp implementing this interface can be extracted.
+ * The row and column spans must be in range of the size of the matrix.
+ *
+ * @ingroup LinOp
+ */
+template <typename ConcreteType>
+class SubMatrixExtractable {
+public:
+    virtual std::unique_ptr<ConcreteType> create_submatrix(
+        const gko::span& row_span, const gko::span& column_span) const = 0;
+};
+
+
 /**
  * The diagonal of a LinOp can be extracted. It will be implemented by
  * DiagonalExtractable<ValueType>, so the class does not need to implement it.
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index a9c9cc53255..d0ea4a9bfa7 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -127,6 +127,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
             public ConvertibleTo<Sellp<ValueType, IndexType>>,
             public ConvertibleTo<SparsityCsr<ValueType, IndexType>>,
             public DiagonalExtractable<ValueType>,
+            public SubMatrixExtractable<Csr<ValueType, IndexType>>,
             public ReadableFromMatrixData<ValueType, IndexType>,
             public WritableToMatrixData<ValueType, IndexType>,
             public Transposable,
@@ -763,6 +764,9 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
 
     std::unique_ptr<Diagonal<ValueType>> extract_diagonal() const override;
 
+    std::unique_ptr<Csr<ValueType, IndexType>> create_submatrix(
+        const gko::span& row_span, const gko::span& column_span) const override;
+
     std::unique_ptr<absolute_type> compute_absolute() const override;
 
     void compute_absolute_inplace() override;

From 30bde4547a1a1b35026d7be7749b2d6003501949 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 13 Sep 2021 12:13:47 +0200
Subject: [PATCH 02/14] Add core code and declare kernels

---
 core/device_hooks/common_kernels.inc.cpp |  2 ++
 core/matrix/csr.cpp                      | 27 ++++++++++++++++++++++++
 core/matrix/csr_kernels.hpp              | 17 +++++++++++++++
 dpcpp/matrix/csr_kernels.dp.cpp          | 21 ++++++++++++++++++
 hip/matrix/csr_kernels.hip.cpp           | 21 ++++++++++++++++++
 5 files changed, 88 insertions(+)

diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 5d55806c8ee..df8cfd05346 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -433,6 +433,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
 
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType)
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index 1c8ab720900..4e1f712805f 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "core/components/absolute_array.hpp"
 #include "core/components/fill_array.hpp"
+#include "core/components/reduce_array.hpp"
 #include "core/matrix/csr_kernels.hpp"
 
 
@@ -69,6 +70,9 @@ GKO_REGISTER_OPERATION(convert_to_sellp, csr::convert_to_sellp);
 GKO_REGISTER_OPERATION(calculate_total_cols, csr::calculate_total_cols);
 GKO_REGISTER_OPERATION(convert_to_ell, csr::convert_to_ell);
 GKO_REGISTER_OPERATION(convert_to_hybrid, csr::convert_to_hybrid);
+GKO_REGISTER_OPERATION(calculate_nonzeros_per_row_in_span,
+                       csr::calculate_nonzeros_per_row_in_span);
+GKO_REGISTER_OPERATION(compute_submatrix, csr::compute_submatrix);
 GKO_REGISTER_OPERATION(transpose, csr::transpose);
 GKO_REGISTER_OPERATION(conj_transpose, csr::conj_transpose);
 GKO_REGISTER_OPERATION(inv_symm_permute, csr::inv_symm_permute);
@@ -85,6 +89,7 @@ GKO_REGISTER_OPERATION(is_sorted_by_column_index,
                        csr::is_sorted_by_column_index);
 GKO_REGISTER_OPERATION(extract_diagonal, csr::extract_diagonal);
 GKO_REGISTER_OPERATION(fill_array, components::fill_array);
+GKO_REGISTER_OPERATION(reduce_array, components::reduce_array);
 GKO_REGISTER_OPERATION(inplace_absolute_array,
                        components::inplace_absolute_array);
 GKO_REGISTER_OPERATION(outplace_absolute_array,
@@ -536,6 +541,28 @@ bool Csr<ValueType, IndexType>::is_sorted_by_column_index() const
 }
 
 
+template <typename ValueType, typename IndexType>
+std::unique_ptr<Csr<ValueType, IndexType>>
+Csr<ValueType, IndexType>::create_submatrix(const gko::span& row_span,
+                                            const gko::span& column_span) const
+{
+    using Mat = Csr<ValueType, IndexType>;
+    auto exec = this->get_executor();
+    auto sub_mat_size = gko::dim<2>(row_span.length(), column_span.length());
+    Array<size_type> row_nnz(exec, row_span.length());
+    exec->run(csr::make_fill_array(row_nnz.get_data(), row_nnz.get_num_elems(),
+                                   zero<size_type>()));
+    exec->run(csr::make_calculate_nonzeros_per_row_in_span(
+        this, row_span, column_span, &row_nnz));
+    auto sub_mat_nnz = reduce(row_nnz);
+    auto sub_mat =
+        Mat::create(exec, sub_mat_size, sub_mat_nnz, this->get_strategy());
+    exec->run(csr::make_compute_submatrix(this, &row_nnz, row_span, column_span,
+                                          sub_mat.get()));
+    return sub_mat;
+}
+
+
 template <typename ValueType, typename IndexType>
 std::unique_ptr<Diagonal<ValueType>>
 Csr<ValueType, IndexType>::extract_diagonal() const
diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp
index 0e6a7346b7c..c3428c4bbf3 100644
--- a/core/matrix/csr_kernels.hpp
+++ b/core/matrix/csr_kernels.hpp
@@ -174,6 +174,19 @@ namespace kernels {
         const matrix::Csr<ValueType, IndexType>* source,             \
         Array<size_type>* result)
 
+#define GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType)  \
+    void calculate_nonzeros_per_row_in_span(                                   \
+        std::shared_ptr<const DefaultExecutor> exec,                           \
+        const matrix::Csr<ValueType, IndexType>* source, const span& row_span, \
+        const span& col_span, Array<size_type>* row_nnz)
+
+#define GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType)     \
+    void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,     \
+                           const matrix::Csr<ValueType, IndexType>* source, \
+                           const Array<size_type>* row_nnz,                 \
+                           gko::span row_span, gko::span col_span,          \
+                           matrix::Csr<ValueType, IndexType>* result)
+
 #define GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType)         \
     void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec, \
                               matrix::Csr<ValueType, IndexType>* to_sort)
@@ -240,6 +253,10 @@ namespace kernels {
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \
     template <typename ValueType, typename IndexType>                        \
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType);   \
+    template <typename ValueType, typename IndexType>                        \
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType);         \
+    template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType);              \
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX(ValueType, IndexType);         \
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 7435ff35221..bb4d554387f 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -795,6 +795,27 @@ void calculate_nnz_per_row(size_type num_rows,
 GKO_ENABLE_DEFAULT_HOST(calculate_nnz_per_row, calculate_nnz_per_row);
 
 
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_span(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
+    const span& col_span, Array<size_type>* row_nnz) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source,
+    const Array<size_type>* row_nnz, gko::span row_span, gko::span col_span,
+    matrix::Csr<ValueType, IndexType>* result) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+
+
 void calculate_slice_lengths(size_type num_rows, size_type slice_size,
                              size_type stride_factor,
                              const size_type* __restrict__ nnz_per_row,
diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp
index be28c3a471b..3646a40b2bc 100644
--- a/hip/matrix/csr_kernels.hip.cpp
+++ b/hip/matrix/csr_kernels.hip.cpp
@@ -1082,6 +1082,27 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_span(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
+    const span& col_span, Array<size_type>* row_nnz) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source,
+    const Array<size_type>* row_nnz, gko::span row_span, gko::span col_span,
+    matrix::Csr<ValueType, IndexType>* result) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const HipExecutor> exec,
                        const matrix::Csr<ValueType, IndexType>* source,

From 9cc20303f96a9e97382829bb31bb822f9573087b Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 13 Sep 2021 12:15:53 +0200
Subject: [PATCH 03/14] Add reference, omp kernels, tests

---
 omp/matrix/csr_kernels.cpp             |  62 ++++++++
 omp/test/matrix/CMakeLists.txt         |   1 +
 omp/test/matrix/csr_kernels2.cpp       | 140 ++++++++++++++++++
 reference/matrix/csr_kernels.cpp       |  56 +++++++
 reference/test/matrix/CMakeLists.txt   |   1 +
 reference/test/matrix/csr_kernels2.cpp | 195 +++++++++++++++++++++++++
 6 files changed, 455 insertions(+)
 create mode 100644 omp/test/matrix/csr_kernels2.cpp
 create mode 100644 reference/test/matrix/csr_kernels2.cpp

diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 785a934c7c6..5eccbaeb803 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -713,6 +713,68 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_span(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
+    const span& col_span, Array<size_type>* row_nnz)
+{
+    const auto row_ptrs = source->get_const_row_ptrs();
+    const auto col_idxs = source->get_const_col_idxs();
+#pragma omp parallel for
+    for (size_type row = row_span.begin; row < row_span.end; ++row) {
+        for (size_type col = row_ptrs[row]; col < row_ptrs[row + 1]; ++col) {
+            if (col_idxs[col] >= col_span.begin &&
+                col_idxs[col] < col_span.end) {
+                row_nnz->get_data()[row - row_span.begin]++;
+            }
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType>* source,
+                       const Array<size_type>* row_nnz, gko::span row_span,
+                       gko::span col_span,
+                       matrix::Csr<ValueType, IndexType>* result)
+{
+    auto row_offset = row_span.begin;
+    auto col_offset = col_span.begin;
+    auto num_rows = result->get_size()[0];
+    auto num_cols = result->get_size()[1];
+    const auto row_ptrs = source->get_const_row_ptrs();
+    const auto col_idxs = source->get_const_col_idxs();
+    const auto values = source->get_const_values();
+    auto res_row_ptrs = result->get_row_ptrs();
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        res_row_ptrs[row] = row_nnz->get_const_data()[row];
+    }
+    components::prefix_sum(exec, res_row_ptrs, num_rows + 1);
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        size_type res_nnz = res_row_ptrs[row];
+        for (size_type nnz = row_ptrs[row_offset + row];
+             nnz < row_ptrs[row_offset + row + 1]; ++nnz) {
+            if ((col_idxs[nnz] < (col_offset + num_cols) &&
+                 col_idxs[nnz] >= col_offset)) {
+                result->get_col_idxs()[res_nnz] = col_idxs[nnz] - col_offset;
+                result->get_values()[res_nnz] = values[nnz];
+                res_nnz++;
+            }
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const OmpExecutor> exec,
                        const matrix::Csr<ValueType, IndexType>* source,
diff --git a/omp/test/matrix/CMakeLists.txt b/omp/test/matrix/CMakeLists.txt
index d2843b7e459..7198ddf87ea 100644
--- a/omp/test/matrix/CMakeLists.txt
+++ b/omp/test/matrix/CMakeLists.txt
@@ -1,5 +1,6 @@
 ginkgo_create_test(coo_kernels)
 ginkgo_create_test(csr_kernels)
+ginkgo_create_test(csr_kernels2)
 ginkgo_create_test(dense_kernels)
 ginkgo_create_test(diagonal_kernels)
 ginkgo_create_test(ell_kernels)
diff --git a/omp/test/matrix/csr_kernels2.cpp b/omp/test/matrix/csr_kernels2.cpp
new file mode 100644
index 00000000000..e6b8bb999ce
--- /dev/null
+++ b/omp/test/matrix/csr_kernels2.cpp
@@ -0,0 +1,140 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include <algorithm>
+#include <numeric>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+
+#include "core/matrix/csr_kernels.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/unsort_matrix.hpp"
+
+
+namespace {
+
+
+class Csr : public ::testing::Test {
+protected:
+    using Arr = gko::Array<int>;
+    using Mtx = gko::matrix::Csr<>;
+    using Vec = gko::matrix::Dense<>;
+    using ComplexVec = gko::matrix::Dense<std::complex<double>>;
+    using ComplexMtx = gko::matrix::Csr<std::complex<double>>;
+
+    Csr()
+#ifdef GINKGO_FAST_TESTS
+        : mtx_size(152, 185),
+#else
+        : mtx_size(532, 231),
+#endif
+          rand_engine(42)
+    {}
+
+    void SetUp()
+    {
+        ref = gko::ReferenceExecutor::create();
+        omp = gko::OmpExecutor::create();
+    }
+
+    void TearDown()
+    {
+        if (omp != nullptr) {
+            ASSERT_NO_THROW(omp->synchronize());
+        }
+    }
+
+    template <typename MtxType>
+    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
+                                     int min_nnz_row, int max_nnz_row)
+    {
+        return gko::test::generate_random_matrix<MtxType>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(min_nnz_row, max_nnz_row),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    template <typename MtxType>
+    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
+                                     int min_nnz_row)
+    {
+        return gen_mtx<MtxType>(num_rows, num_cols, min_nnz_row, num_cols);
+    }
+
+    void set_up_apply_data(int num_vectors = 1)
+    {
+        mtx = Mtx::create(ref);
+        mtx->copy_from(gen_mtx<Vec>(mtx_size[0], mtx_size[1], 1));
+        dmtx = Mtx::create(omp);
+        dmtx->copy_from(mtx.get());
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::OmpExecutor> omp;
+
+    const gko::dim<2> mtx_size;
+    std::ranlux48 rand_engine;
+    std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> dmtx;
+};
+
+
+TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    gko::span rspan{36, 98};
+    gko::span cspan{26, 104};
+    auto smat1 = this->mtx->create_submatrix(rspan, cspan);
+    auto sdmat1 = this->dmtx->create_submatrix(rspan, cspan);
+
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 1e-14);
+}
+
+
+}  // namespace
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index 9538cdae4af..3fe3148ff5d 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -633,6 +633,62 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_span(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
+    const span& col_span, Array<size_type>* row_nnz)
+{
+    size_type res_row = 0;
+    for (size_type row = row_span.begin; row < row_span.end; ++row) {
+        for (size_type col = source->get_const_row_ptrs()[row];
+             col < source->get_const_row_ptrs()[row + 1]; ++col) {
+            if (source->get_const_col_idxs()[col] >= col_span.begin &&
+                source->get_const_col_idxs()[col] < col_span.end) {
+                row_nnz->get_data()[res_row]++;
+            }
+        }
+        res_row++;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType>* source,
+                       const Array<size_type>* row_nnz, gko::span row_span,
+                       gko::span col_span,
+                       matrix::Csr<ValueType, IndexType>* result)
+{
+    auto row_offset = row_span.begin;
+    auto col_offset = col_span.begin;
+    auto num_rows = result->get_size()[0];
+    auto num_cols = result->get_size()[1];
+    for (size_type row = 0; row < num_rows; ++row) {
+        result->get_row_ptrs()[row] = row_nnz->get_const_data()[row];
+    }
+    components::prefix_sum(exec, result->get_row_ptrs(), num_rows + 1);
+    size_type res_nnz = 0;
+    for (size_type nnz = 0; nnz < source->get_num_stored_elements(); ++nnz) {
+        if (nnz >= source->get_const_row_ptrs()[row_offset] &&
+            nnz < source->get_const_row_ptrs()[row_offset + num_rows] &&
+            (source->get_const_col_idxs()[nnz] < (col_offset + num_cols) &&
+             source->get_const_col_idxs()[nnz] >= col_offset)) {
+            result->get_col_idxs()[res_nnz] =
+                source->get_const_col_idxs()[nnz] - col_offset;
+            result->get_values()[res_nnz] = source->get_const_values()[nnz];
+            res_nnz++;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const ReferenceExecutor> exec,
                        const matrix::Csr<ValueType, IndexType>* source,
diff --git a/reference/test/matrix/CMakeLists.txt b/reference/test/matrix/CMakeLists.txt
index 9670a5df80c..977146062a7 100644
--- a/reference/test/matrix/CMakeLists.txt
+++ b/reference/test/matrix/CMakeLists.txt
@@ -1,5 +1,6 @@
 ginkgo_create_test(coo_kernels)
 ginkgo_create_test(csr_kernels)
+ginkgo_create_test(csr_kernels2)
 ginkgo_create_test(dense_kernels)
 ginkgo_create_test(diagonal_kernels)
 ginkgo_create_test(ell_kernels)
diff --git a/reference/test/matrix/csr_kernels2.cpp b/reference/test/matrix/csr_kernels2.cpp
new file mode 100644
index 00000000000..3206ffcde4c
--- /dev/null
+++ b/reference/test/matrix/csr_kernels2.cpp
@@ -0,0 +1,195 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include <algorithm>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
+#include <ginkgo/core/matrix/sellp.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+
+#include "core/matrix/csr_kernels.hpp"
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename ValueIndexType>
+class Csr : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Coo = gko::matrix::Coo<value_type, index_type>;
+    using Mtx = gko::matrix::Csr<value_type, index_type>;
+    using Sellp = gko::matrix::Sellp<value_type, index_type>;
+    using SparsityCsr = gko::matrix::SparsityCsr<value_type, index_type>;
+    using Ell = gko::matrix::Ell<value_type, index_type>;
+    using Hybrid = gko::matrix::Hybrid<value_type, index_type>;
+    using Vec = gko::matrix::Dense<value_type>;
+    using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
+
+    Csr()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(Mtx::create(exec, gko::dim<2>{2, 3}, 4,
+                          std::make_shared<typename Mtx::load_balance>(2)))
+    {
+        this->create_mtx(mtx.get());
+    }
+
+    void create_mtx(Mtx* m)
+    {
+        value_type* v = m->get_values();
+        index_type* c = m->get_col_idxs();
+        index_type* r = m->get_row_ptrs();
+        auto* s = m->get_srow();
+        /*
+         * 1   3   2
+         * 0   5   0
+         */
+        r[0] = 0;
+        r[1] = 3;
+        r[2] = 4;
+        c[0] = 0;
+        c[1] = 1;
+        c[2] = 2;
+        c[3] = 1;
+        v[0] = 1.0;
+        v[1] = 3.0;
+        v[2] = 2.0;
+        v[3] = 5.0;
+        s[0] = 0;
+    }
+
+    std::shared_ptr<const gko::ReferenceExecutor> exec;
+    std::unique_ptr<Mtx> mtx;
+};
+
+TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypes);
+
+
+TYPED_TEST(Csr, CanGetSubmatrix)
+{
+    using Vec = typename TestFixture::Vec;
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    /* this->mtx
+     * 1   3   2
+     * 0   5   0
+     */
+    auto sub_mat =
+        this->mtx->create_submatrix(gko::span(0, 2), gko::span(0, 2));
+    auto ref =
+        gko::initialize<Mtx>({I<T>{1.0, 3.0}, I<T>{0.0, 5.0}}, this->exec);
+
+    GKO_ASSERT_MTX_NEAR(sub_mat.get(), ref.get(), 0.0);
+}
+
+
+TYPED_TEST(Csr, CanGetSubmatrix2)
+{
+    using Vec = typename TestFixture::Vec;
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto mat = gko::initialize<Mtx>(
+        {
+            // clang-format off
+            I<T>{1.0, 3.0, 4.5, 0.0, 2.0}, // 0
+            I<T>{1.0, 0.0, 4.5, 7.5, 3.0}, // 1
+            I<T>{0.0, 3.0, 4.5, 0.0, 2.0}, // 2
+            I<T>{0.0,-1.0, 2.5, 0.0, 2.0}, // 3
+            I<T>{1.0, 0.0,-1.0, 3.5, 1.0}, // 4
+            I<T>{0.0, 1.0, 0.0, 0.0, 2.0}, // 5
+            I<T>{0.0, 3.0, 0.0, 7.5, 1.0}  // 6
+                                           // clang-format on
+        },
+        this->exec);
+    ASSERT_EQ(mat->get_num_stored_elements(), 23);
+    {
+        auto sub_mat1 = mat->create_submatrix(gko::span(0, 2), gko::span(0, 2));
+        auto ref1 =
+            gko::initialize<Mtx>({I<T>{1.0, 3.0}, I<T>{1.0, 0.0}}, this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat1.get(), ref1.get(), 0.0);
+    }
+    {
+        auto sub_mat2 = mat->create_submatrix(gko::span(2, 4), gko::span(0, 2));
+        auto ref2 =
+            gko::initialize<Mtx>({I<T>{0.0, 3.0}, I<T>{0.0, -1.0}}, this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat2.get(), ref2.get(), 0.0);
+    }
+    {
+        auto sub_mat3 = mat->create_submatrix(gko::span(0, 2), gko::span(3, 5));
+        auto ref3 =
+            gko::initialize<Mtx>({I<T>{0.0, 2.0}, I<T>{7.5, 3.0}}, this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat3.get(), ref3.get(), 0.0);
+    }
+    {
+        auto sub_mat4 = mat->create_submatrix(gko::span(1, 6), gko::span(2, 4));
+        /*
+           4.5, 7.5
+           4.5, 0.0
+           2.5, 0.0
+           1.0, 3.5
+           0.0, 0.0
+        */
+        auto ref4 = gko::initialize<Mtx>(
+            {I<T>{4.5, 7.5}, I<T>{4.5, 0.0}, I<T>{2.5, 0.0}, I<T>{-1.0, 3.5},
+             I<T>{0.0, 0.0}},
+            this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat4.get(), ref4.get(), 0.0);
+    }
+}
+
+
+}  // namespace

From be0a777007bb8af07f45f823e41e192047bf93a8 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 13 Sep 2021 12:18:36 +0200
Subject: [PATCH 04/14] Add CUDA kernels, tests

---
 common/cuda_hip/matrix/csr_kernels.hpp.inc |  67 ++++++++++
 cuda/matrix/csr_kernels.cu                 |  54 ++++++++
 cuda/test/matrix/CMakeLists.txt            |   1 +
 cuda/test/matrix/csr_kernels2.cpp          | 139 +++++++++++++++++++++
 dpcpp/matrix/csr_kernels.dp.cpp            |  42 +++----
 omp/test/matrix/csr_kernels2.cpp           |   4 +-
 reference/test/matrix/csr_kernels2.cpp     |   4 +-
 7 files changed, 284 insertions(+), 27 deletions(-)
 create mode 100644 cuda/test/matrix/csr_kernels2.cpp

diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
index 7c8cb7f7a5f..35a18d36aa7 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -1046,3 +1046,70 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute_kernel(
         out_vals[out_begin + i] = in_vals[in_begin + i];
     }
 }
+
+
+namespace kernel {
+
+
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void get_row_nnz_data(
+    const size_type num_rows, const size_type* __restrict__ nnz_per_row,
+    IndexType* __restrict__ row_ptrs)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    if (tidx < num_rows) {
+        row_ptrs[tidx] = nnz_per_row[tidx];
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals(
+        const size_type num_rows, const size_type num_cols,
+        const size_type num_nnz, const size_type row_offset,
+        const size_type col_offset,
+        const IndexType* __restrict__ source_row_ptrs,
+        const IndexType* __restrict__ source_col_idxs,
+        const ValueType* __restrict__ source_values,
+        const IndexType* __restrict__ result_row_ptrs,
+        IndexType* __restrict__ result_col_idxs,
+        ValueType* __restrict__ result_values)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    if (tidx < num_rows) {
+        size_type res_nnz = result_row_ptrs[tidx];
+        for (size_type nnz = source_row_ptrs[row_offset + tidx];
+             nnz < source_row_ptrs[row_offset + tidx + 1]; ++nnz) {
+            if ((source_col_idxs[nnz] < (col_offset + num_cols) &&
+                 source_col_idxs[nnz] >= col_offset)) {
+                result_col_idxs[res_nnz] = source_col_idxs[nnz] - col_offset;
+                result_values[res_nnz] = source_values[nnz];
+                res_nnz++;
+            }
+        }
+    }
+}
+
+
+template <typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void calculate_nnz_per_row_in_span(
+        const span row_span, const span col_span,
+        const IndexType* __restrict__ row_ptrs,
+        const IndexType* __restrict__ col_idxs,
+        size_type* __restrict__ nnz_per_row)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    if (tidx >= row_span.begin && tidx < row_span.end) {
+        for (size_type col = row_ptrs[tidx]; col < row_ptrs[tidx + 1]; ++col) {
+            if (col_idxs[col] >= col_span.begin &&
+                col_idxs[col] < col_span.end) {
+                nnz_per_row[tidx - row_span.begin]++;
+            }
+        }
+    }
+}
+
+
+}  // namespace kernel
diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
index 00e6c15039d..42807b50926 100644
--- a/cuda/matrix/csr_kernels.cu
+++ b/cuda/matrix/csr_kernels.cu
@@ -1240,6 +1240,60 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_span(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
+    const span& col_span, Array<size_type>* row_nnz)
+{
+    const auto num_rows = source->get_size()[0];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto col_idxs = source->get_const_col_idxs();
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+
+    kernel::calculate_nnz_per_row_in_span<<<grid_dim, default_block_size>>>(
+        row_span, col_span, as_cuda_type(row_ptrs), as_cuda_type(col_idxs),
+        as_cuda_type(row_nnz->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType>* source,
+                       const Array<size_type>* row_nnz, gko::span row_span,
+                       gko::span col_span,
+                       matrix::Csr<ValueType, IndexType>* result)
+{
+    auto row_offset = row_span.begin;
+    auto col_offset = col_span.begin;
+    auto num_rows = result->get_size()[0];
+    auto num_cols = result->get_size()[1];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+    kernel::get_row_nnz_data<<<grid_dim, default_block_size>>>(
+        num_rows, as_cuda_type(row_nnz->get_const_data()),
+        as_cuda_type(result->get_row_ptrs()));
+    components::prefix_sum(exec, result->get_row_ptrs(), num_rows + 1);
+
+    auto num_nnz = source->get_num_stored_elements();
+    grid_dim = ceildiv(num_nnz, default_block_size);
+    kernel::compute_submatrix_idxs_and_vals<<<grid_dim, default_block_size>>>(
+        num_rows, num_cols, num_nnz, row_offset, col_offset,
+        as_cuda_type(source->get_const_row_ptrs()),
+        as_cuda_type(source->get_const_col_idxs()),
+        as_cuda_type(source->get_const_values()),
+        as_cuda_type(result->get_const_row_ptrs()),
+        as_cuda_type(result->get_col_idxs()),
+        as_cuda_type(result->get_values()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
                        const matrix::Csr<ValueType, IndexType>* source,
diff --git a/cuda/test/matrix/CMakeLists.txt b/cuda/test/matrix/CMakeLists.txt
index be2ebd7ef03..fa0e20c1b74 100644
--- a/cuda/test/matrix/CMakeLists.txt
+++ b/cuda/test/matrix/CMakeLists.txt
@@ -1,5 +1,6 @@
 ginkgo_create_test(coo_kernels)
 ginkgo_create_test(csr_kernels)
+ginkgo_create_test(csr_kernels2)
 ginkgo_create_test(fbcsr_kernels)
 ginkgo_create_test(dense_kernels)
 ginkgo_create_test(diagonal_kernels)
diff --git a/cuda/test/matrix/csr_kernels2.cpp b/cuda/test/matrix/csr_kernels2.cpp
new file mode 100644
index 00000000000..dde23eaaa2d
--- /dev/null
+++ b/cuda/test/matrix/csr_kernels2.cpp
@@ -0,0 +1,139 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <algorithm>
+#include <numeric>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+
+#include "core/matrix/csr_kernels.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/unsort_matrix.hpp"
+
+
+namespace {
+
+
+class Csr : public ::testing::Test {
+protected:
+    using Arr = gko::Array<int>;
+    using Mtx = gko::matrix::Csr<>;
+    using Vec = gko::matrix::Dense<>;
+
+    Csr()
+#ifdef GINKGO_FAST_TESTS
+        : mtx_size(152, 185),
+#else
+        : mtx_size(532, 231),
+#endif
+          rand_engine(42)
+    {}
+
+    void SetUp()
+    {
+        ref = gko::ReferenceExecutor::create();
+        cuda = gko::CudaExecutor::create(0, ref);
+    }
+
+    void TearDown()
+    {
+        if (cuda != nullptr) {
+            ASSERT_NO_THROW(cuda->synchronize());
+        }
+    }
+
+    template <typename MtxType>
+    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
+                                     int min_nnz_row, int max_nnz_row)
+    {
+        return gko::test::generate_random_matrix<MtxType>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(min_nnz_row, max_nnz_row),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    template <typename MtxType>
+    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
+                                     int min_nnz_row)
+    {
+        return gen_mtx<MtxType>(num_rows, num_cols, min_nnz_row, num_cols);
+    }
+
+    void set_up_apply_data(int num_vectors = 1)
+    {
+        mtx = Mtx::create(ref);
+        mtx->copy_from(gen_mtx<Vec>(mtx_size[0], mtx_size[1], 1));
+        dmtx = Mtx::create(cuda);
+        dmtx->copy_from(mtx.get());
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::CudaExecutor> cuda;
+
+    const gko::dim<2> mtx_size;
+    std::ranlux48 rand_engine;
+    std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> dmtx;
+};
+
+
+TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+
+    gko::span rspan{36, 98};
+    gko::span cspan{26, 104};
+    auto smat1 = this->mtx->create_submatrix(rspan, cspan);
+    auto sdmat1 = this->dmtx->create_submatrix(rspan, cspan);
+    auto hmat = Mtx::create(this->ref);
+    hmat->copy_from(sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(hmat, smat1, 1e-14);
+}
+
+
+}  // namespace
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index bb4d554387f..83dda90a9c4 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -795,27 +795,6 @@ void calculate_nnz_per_row(size_type num_rows,
 GKO_ENABLE_DEFAULT_HOST(calculate_nnz_per_row, calculate_nnz_per_row);
 
 
-template <typename ValueType, typename IndexType>
-void calculate_nonzeros_per_row_in_span(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
-    const span& col_span, Array<size_type>* row_nnz) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void compute_submatrix(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* source,
-    const Array<size_type>* row_nnz, gko::span row_span, gko::span col_span,
-    matrix::Csr<ValueType, IndexType>* result) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
-
-
 void calculate_slice_lengths(size_type num_rows, size_type slice_size,
                              size_type stride_factor,
                              const size_type* __restrict__ nnz_per_row,
@@ -1646,6 +1625,27 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_span(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
+    const span& col_span, Array<size_type>* row_nnz) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source,
+    const Array<size_type>* row_nnz, gko::span row_span, gko::span col_span,
+    matrix::Csr<ValueType, IndexType>* result) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+
+
 namespace {
 
 
diff --git a/omp/test/matrix/csr_kernels2.cpp b/omp/test/matrix/csr_kernels2.cpp
index e6b8bb999ce..d3dfa65067a 100644
--- a/omp/test/matrix/csr_kernels2.cpp
+++ b/omp/test/matrix/csr_kernels2.cpp
@@ -30,9 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include <ginkgo/core/matrix/csr.hpp>
-
-
 #include <algorithm>
 #include <numeric>
 #include <random>
@@ -45,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
diff --git a/reference/test/matrix/csr_kernels2.cpp b/reference/test/matrix/csr_kernels2.cpp
index 3206ffcde4c..84d1a5ca307 100644
--- a/reference/test/matrix/csr_kernels2.cpp
+++ b/reference/test/matrix/csr_kernels2.cpp
@@ -30,9 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include <ginkgo/core/matrix/csr.hpp>
-
-
 #include <algorithm>
 
 
@@ -44,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/ell.hpp>

From f7ae6d06a2f536d0060e1fbd911a5bc77ce3bdaf Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 4 Oct 2021 17:44:13 +0200
Subject: [PATCH 05/14] Review udpate

Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
---
 common/cuda_hip/matrix/csr_kernels.hpp.inc |  4 +--
 core/matrix/csr.cpp                        | 21 +++++++++-------
 core/matrix/csr_kernels.hpp                |  3 +--
 cuda/matrix/csr_kernels.cu                 | 11 +++-----
 cuda/test/matrix/csr_kernels2.cpp          |  2 +-
 dpcpp/matrix/csr_kernels.dp.cpp            | 12 ++++-----
 omp/matrix/csr_kernels.cpp                 | 16 ++++--------
 omp/test/matrix/csr_kernels2.cpp           |  2 +-
 reference/matrix/csr_kernels.cpp           | 29 +++++++++++-----------
 9 files changed, 46 insertions(+), 54 deletions(-)

diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
index 35a18d36aa7..6f9501f01a7 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -1100,8 +1100,8 @@ __global__
         const IndexType* __restrict__ col_idxs,
         size_type* __restrict__ nnz_per_row)
 {
-    const auto tidx = thread::get_thread_id_flat();
-    if (tidx >= row_span.begin && tidx < row_span.end) {
+    const auto tidx = thread::get_thread_id_flat() + row_span.begin;
+    if (tidx < row_span.end) {
         for (size_type col = row_ptrs[tidx]; col < row_ptrs[tidx + 1]; ++col) {
             if (col_idxs[col] >= col_span.begin &&
                 col_idxs[col] < col_span.end) {
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index 4e1f712805f..22d918d9833 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "core/components/absolute_array.hpp"
 #include "core/components/fill_array.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "core/components/reduce_array.hpp"
 #include "core/matrix/csr_kernels.hpp"
 
@@ -89,7 +90,8 @@ GKO_REGISTER_OPERATION(is_sorted_by_column_index,
                        csr::is_sorted_by_column_index);
 GKO_REGISTER_OPERATION(extract_diagonal, csr::extract_diagonal);
 GKO_REGISTER_OPERATION(fill_array, components::fill_array);
-GKO_REGISTER_OPERATION(reduce_array, components::reduce_array);
+GKO_REGISTER_OPERATION(reduce_add_array, components::reduce_add_array);
+GKO_REGISTER_OPERATION(prefix_sum, components::prefix_sum);
 GKO_REGISTER_OPERATION(inplace_absolute_array,
                        components::inplace_absolute_array);
 GKO_REGISTER_OPERATION(outplace_absolute_array,
@@ -549,16 +551,17 @@ Csr<ValueType, IndexType>::create_submatrix(const gko::span& row_span,
     using Mat = Csr<ValueType, IndexType>;
     auto exec = this->get_executor();
     auto sub_mat_size = gko::dim<2>(row_span.length(), column_span.length());
-    Array<size_type> row_nnz(exec, row_span.length());
-    exec->run(csr::make_fill_array(row_nnz.get_data(), row_nnz.get_num_elems(),
-                                   zero<size_type>()));
+    Array<IndexType> row_ptrs(exec, row_span.length() + 1);
+    exec->run(csr::make_fill_array(
+        row_ptrs.get_data(), row_ptrs.get_num_elems() + 1, zero<IndexType>()));
     exec->run(csr::make_calculate_nonzeros_per_row_in_span(
-        this, row_span, column_span, &row_nnz));
-    auto sub_mat_nnz = reduce(row_nnz);
-    auto sub_mat =
-        Mat::create(exec, sub_mat_size, sub_mat_nnz, this->get_strategy());
-    exec->run(csr::make_compute_submatrix(this, &row_nnz, row_span, column_span,
+        this, row_span, column_span, &row_ptrs));
+    exec->run(csr::make_prefix_sum(row_ptrs.get_data(), row_span.length() + 1));
+    auto sub_mat = Mat::create(exec, sub_mat_size, std::move(row_ptrs),
+                               this->get_strategy());
+    exec->run(csr::make_compute_submatrix(this, row_span, column_span,
                                           sub_mat.get()));
+    sub_mat->make_srow();
     return sub_mat;
 }
 
diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp
index c3428c4bbf3..13f238e58a7 100644
--- a/core/matrix/csr_kernels.hpp
+++ b/core/matrix/csr_kernels.hpp
@@ -178,12 +178,11 @@ namespace kernels {
     void calculate_nonzeros_per_row_in_span(                                   \
         std::shared_ptr<const DefaultExecutor> exec,                           \
         const matrix::Csr<ValueType, IndexType>* source, const span& row_span, \
-        const span& col_span, Array<size_type>* row_nnz)
+        const span& col_span, Array<IndexType>* row_nnz)
 
 #define GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType)     \
     void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,     \
                            const matrix::Csr<ValueType, IndexType>* source, \
-                           const Array<size_type>* row_nnz,                 \
                            gko::span row_span, gko::span col_span,          \
                            matrix::Csr<ValueType, IndexType>* result)
 
diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
index 42807b50926..f7c3d7c385c 100644
--- a/cuda/matrix/csr_kernels.cu
+++ b/cuda/matrix/csr_kernels.cu
@@ -1244,12 +1244,12 @@ template <typename ValueType, typename IndexType>
 void calculate_nonzeros_per_row_in_span(
     std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
-    const span& col_span, Array<size_type>* row_nnz)
+    const span& col_span, Array<IndexType>* row_nnz)
 {
     const auto num_rows = source->get_size()[0];
     auto row_ptrs = source->get_const_row_ptrs();
     auto col_idxs = source->get_const_col_idxs();
-    auto grid_dim = ceildiv(num_rows, default_block_size);
+    auto grid_dim = ceildiv(row_span.length(), default_block_size);
 
     kernel::calculate_nnz_per_row_in_span<<<grid_dim, default_block_size>>>(
         row_span, col_span, as_cuda_type(row_ptrs), as_cuda_type(col_idxs),
@@ -1263,8 +1263,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
                        const matrix::Csr<ValueType, IndexType>* source,
-                       const Array<size_type>* row_nnz, gko::span row_span,
-                       gko::span col_span,
+                       gko::span row_span, gko::span col_span,
                        matrix::Csr<ValueType, IndexType>* result)
 {
     auto row_offset = row_span.begin;
@@ -1273,10 +1272,6 @@ void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
     auto num_cols = result->get_size()[1];
     auto row_ptrs = source->get_const_row_ptrs();
     auto grid_dim = ceildiv(num_rows, default_block_size);
-    kernel::get_row_nnz_data<<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(row_nnz->get_const_data()),
-        as_cuda_type(result->get_row_ptrs()));
-    components::prefix_sum(exec, result->get_row_ptrs(), num_rows + 1);
 
     auto num_nnz = source->get_num_stored_elements();
     grid_dim = ceildiv(num_nnz, default_block_size);
diff --git a/cuda/test/matrix/csr_kernels2.cpp b/cuda/test/matrix/csr_kernels2.cpp
index dde23eaaa2d..6e9673ea79c 100644
--- a/cuda/test/matrix/csr_kernels2.cpp
+++ b/cuda/test/matrix/csr_kernels2.cpp
@@ -132,7 +132,7 @@ TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
     auto hmat = Mtx::create(this->ref);
     hmat->copy_from(sdmat1.get());
 
-    GKO_ASSERT_MTX_NEAR(hmat, smat1, 1e-14);
+    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
 }
 
 
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 83dda90a9c4..237545e2110 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -1629,18 +1629,18 @@ template <typename ValueType, typename IndexType>
 void calculate_nonzeros_per_row_in_span(
     std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
-    const span& col_span, Array<size_type>* row_nnz) GKO_NOT_IMPLEMENTED;
+    const span& col_span, Array<IndexType>* row_nnz) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
-void compute_submatrix(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* source,
-    const Array<size_type>* row_nnz, gko::span row_span, gko::span col_span,
-    matrix::Csr<ValueType, IndexType>* result) GKO_NOT_IMPLEMENTED;
+void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType>* source,
+                       gko::span row_span, gko::span col_span,
+                       matrix::Csr<ValueType, IndexType>* result)
+    GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 5eccbaeb803..64f10a63f04 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -717,7 +717,7 @@ template <typename ValueType, typename IndexType>
 void calculate_nonzeros_per_row_in_span(
     std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
-    const span& col_span, Array<size_type>* row_nnz)
+    const span& col_span, Array<IndexType>* row_nnz)
 {
     const auto row_ptrs = source->get_const_row_ptrs();
     const auto col_idxs = source->get_const_col_idxs();
@@ -739,8 +739,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
                        const matrix::Csr<ValueType, IndexType>* source,
-                       const Array<size_type>* row_nnz, gko::span row_span,
-                       gko::span col_span,
+                       gko::span row_span, gko::span col_span,
                        matrix::Csr<ValueType, IndexType>* result)
 {
     auto row_offset = row_span.begin;
@@ -751,19 +750,14 @@ void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
     const auto col_idxs = source->get_const_col_idxs();
     const auto values = source->get_const_values();
     auto res_row_ptrs = result->get_row_ptrs();
-#pragma omp parallel for
-    for (size_type row = 0; row < num_rows; ++row) {
-        res_row_ptrs[row] = row_nnz->get_const_data()[row];
-    }
-    components::prefix_sum(exec, res_row_ptrs, num_rows + 1);
 #pragma omp parallel for
     for (size_type row = 0; row < num_rows; ++row) {
         size_type res_nnz = res_row_ptrs[row];
         for (size_type nnz = row_ptrs[row_offset + row];
              nnz < row_ptrs[row_offset + row + 1]; ++nnz) {
-            if ((col_idxs[nnz] < (col_offset + num_cols) &&
-                 col_idxs[nnz] >= col_offset)) {
-                result->get_col_idxs()[res_nnz] = col_idxs[nnz] - col_offset;
+            const auto local_col = col_idxs[nnz] - col_offset;
+            if (local_col >= 0 && local_col < num_cols) {
+                result->get_col_idxs()[res_nnz] = local_col;
                 result->get_values()[res_nnz] = values[nnz];
                 res_nnz++;
             }
diff --git a/omp/test/matrix/csr_kernels2.cpp b/omp/test/matrix/csr_kernels2.cpp
index d3dfa65067a..c3dd39f2fb9 100644
--- a/omp/test/matrix/csr_kernels2.cpp
+++ b/omp/test/matrix/csr_kernels2.cpp
@@ -131,7 +131,7 @@ TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
     auto smat1 = this->mtx->create_submatrix(rspan, cspan);
     auto sdmat1 = this->dmtx->create_submatrix(rspan, cspan);
 
-    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 1e-14);
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
 }
 
 
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index 3fe3148ff5d..72420a5fa18 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -637,7 +637,7 @@ template <typename ValueType, typename IndexType>
 void calculate_nonzeros_per_row_in_span(
     std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
-    const span& col_span, Array<size_type>* row_nnz)
+    const span& col_span, Array<IndexType>* row_nnz)
 {
     size_type res_row = 0;
     for (size_type row = row_span.begin; row < row_span.end; ++row) {
@@ -659,27 +659,28 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
                        const matrix::Csr<ValueType, IndexType>* source,
-                       const Array<size_type>* row_nnz, gko::span row_span,
-                       gko::span col_span,
+                       gko::span row_span, gko::span col_span,
                        matrix::Csr<ValueType, IndexType>* result)
 {
     auto row_offset = row_span.begin;
     auto col_offset = col_span.begin;
     auto num_rows = result->get_size()[0];
     auto num_cols = result->get_size()[1];
-    for (size_type row = 0; row < num_rows; ++row) {
-        result->get_row_ptrs()[row] = row_nnz->get_const_data()[row];
-    }
-    components::prefix_sum(exec, result->get_row_ptrs(), num_rows + 1);
+    auto res_row_ptrs = result->get_row_ptrs();
+    auto res_col_idxs = result->get_col_idxs();
+    auto res_values = result->get_values();
+    const auto src_row_ptrs = source->get_const_row_ptrs();
+    const auto src_col_idxs = source->get_const_col_idxs();
+    const auto src_values = source->get_const_values();
+
     size_type res_nnz = 0;
     for (size_type nnz = 0; nnz < source->get_num_stored_elements(); ++nnz) {
-        if (nnz >= source->get_const_row_ptrs()[row_offset] &&
-            nnz < source->get_const_row_ptrs()[row_offset + num_rows] &&
-            (source->get_const_col_idxs()[nnz] < (col_offset + num_cols) &&
-             source->get_const_col_idxs()[nnz] >= col_offset)) {
-            result->get_col_idxs()[res_nnz] =
-                source->get_const_col_idxs()[nnz] - col_offset;
-            result->get_values()[res_nnz] = source->get_const_values()[nnz];
+        if (nnz >= src_row_ptrs[row_offset] &&
+            nnz < src_row_ptrs[row_offset + num_rows] &&
+            (src_col_idxs[nnz] < (col_offset + num_cols) &&
+             src_col_idxs[nnz] >= col_offset)) {
+            res_col_idxs[res_nnz] = src_col_idxs[nnz] - col_offset;
+            res_values[res_nnz] = src_values[nnz];
             res_nnz++;
         }
     }

From 18958b1e62fcfdae125b583994ec7b1f25bcea73 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 19 Oct 2021 16:42:25 +0200
Subject: [PATCH 06/14] Add a CSR constructor with only row_ptrs

---
 include/ginkgo/core/matrix/csr.hpp | 31 ++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index d0ea4a9bfa7..72882f18469 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -985,6 +985,37 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
           strategy_(strategy->copy())
     {}
 
+    /**
+     * Creates a CSR matrix from already allocated (and initialized) row
+     * pointer array.
+     *
+     * @tparam RowPtrsArray  type of `row_ptrs` array
+     *
+     * @param exec  Executor associated to the matrix
+     * @param size  size of the matrix
+     * @param row_ptrs  array of row pointers
+     *
+     * @note If `row_ptrs`, is not an rvalue, not
+     *       an array of IndexType, IndexType and ValueType, respectively, or
+     *       is on the wrong executor, an internal copy of that array will be
+     *       created, and the original array data will not be used in the
+     *       matrix.
+     */
+    explicit Csr(
+        std::shared_ptr<const Executor> exec, const dim<2>& size,
+        Array<IndexType>&& row_ptrs,
+        std::shared_ptr<strategy_type> strategy = std::make_shared<sparselib>())
+        : EnableLinOp<Csr>(exec, size),
+          row_ptrs_{exec, std::forward<Array<IndexType>>(row_ptrs)},
+          srow_(exec),
+          strategy_(strategy->copy())
+    {
+        auto num_nnz = exec->copy_val_to_host(row_ptrs_.get_data() + size[0]);
+        values_ = Array<ValueType>(exec, num_nnz);
+        col_idxs_ = Array<IndexType>(exec, num_nnz);
+        this->make_srow();
+    }
+
     /**
      * Creates a CSR matrix from already allocated (and initialized) row
      * pointer, column index and value arrays.

From b2ab9936d4108e3578c7fa2dfde43761db3eef18 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 19 Oct 2021 16:42:56 +0200
Subject: [PATCH 07/14] Update CUDA/HIP kernels and add hip tests

---
 common/cuda_hip/matrix/csr_kernels.hpp.inc |   4 +-
 core/matrix/csr.cpp                        |   2 +-
 hip/matrix/csr_kernels.hip.cpp             |  41 +++++-
 hip/test/matrix/CMakeLists.txt             |   1 +
 hip/test/matrix/csr_kernels2.hip.cpp       | 139 +++++++++++++++++++++
 include/ginkgo/core/matrix/csr.hpp         |  10 +-
 reference/test/matrix/csr_kernels2.cpp     |   2 +-
 7 files changed, 181 insertions(+), 18 deletions(-)
 create mode 100644 hip/test/matrix/csr_kernels2.hip.cpp

diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
index 6f9501f01a7..d01c9cc4f09 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -1053,7 +1053,7 @@ namespace kernel {
 
 template <typename IndexType>
 __global__ __launch_bounds__(default_block_size) void get_row_nnz_data(
-    const size_type num_rows, const size_type* __restrict__ nnz_per_row,
+    const size_type num_rows, const IndexType* __restrict__ nnz_per_row,
     IndexType* __restrict__ row_ptrs)
 {
     const auto tidx = thread::get_thread_id_flat();
@@ -1098,7 +1098,7 @@ __global__
         const span row_span, const span col_span,
         const IndexType* __restrict__ row_ptrs,
         const IndexType* __restrict__ col_idxs,
-        size_type* __restrict__ nnz_per_row)
+        IndexType* __restrict__ nnz_per_row)
 {
     const auto tidx = thread::get_thread_id_flat() + row_span.begin;
     if (tidx < row_span.end) {
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index 22d918d9833..0e015baabcb 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -553,7 +553,7 @@ Csr<ValueType, IndexType>::create_submatrix(const gko::span& row_span,
     auto sub_mat_size = gko::dim<2>(row_span.length(), column_span.length());
     Array<IndexType> row_ptrs(exec, row_span.length() + 1);
     exec->run(csr::make_fill_array(
-        row_ptrs.get_data(), row_ptrs.get_num_elems() + 1, zero<IndexType>()));
+        row_ptrs.get_data(), row_ptrs.get_num_elems(), zero<IndexType>()));
     exec->run(csr::make_calculate_nonzeros_per_row_in_span(
         this, row_span, column_span, &row_ptrs));
     exec->run(csr::make_prefix_sum(row_ptrs.get_data(), row_span.length() + 1));
diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp
index 3646a40b2bc..2fab9bf4e74 100644
--- a/hip/matrix/csr_kernels.hip.cpp
+++ b/hip/matrix/csr_kernels.hip.cpp
@@ -1086,18 +1086,47 @@ template <typename ValueType, typename IndexType>
 void calculate_nonzeros_per_row_in_span(
     std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
-    const span& col_span, Array<size_type>* row_nnz) GKO_NOT_IMPLEMENTED;
+    const span& col_span, Array<IndexType>* row_nnz)
+{
+    const auto num_rows = source->get_size()[0];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto col_idxs = source->get_const_col_idxs();
+    auto grid_dim = ceildiv(row_span.length(), default_block_size);
+
+    hipLaunchKernelGGL(kernel::calculate_nnz_per_row_in_span, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, row_span, col_span,
+                       as_hip_type(row_ptrs), as_hip_type(col_idxs),
+                       as_hip_type(row_nnz->get_data()));
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
-void compute_submatrix(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* source,
-    const Array<size_type>* row_nnz, gko::span row_span, gko::span col_span,
-    matrix::Csr<ValueType, IndexType>* result) GKO_NOT_IMPLEMENTED;
+void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType>* source,
+                       gko::span row_span, gko::span col_span,
+                       matrix::Csr<ValueType, IndexType>* result)
+{
+    auto row_offset = row_span.begin;
+    auto col_offset = col_span.begin;
+    auto num_rows = result->get_size()[0];
+    auto num_cols = result->get_size()[1];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+
+    auto num_nnz = source->get_num_stored_elements();
+    grid_dim = ceildiv(num_nnz, default_block_size);
+    hipLaunchKernelGGL(
+        kernel::compute_submatrix_idxs_and_vals, dim3(grid_dim),
+        dim3(default_block_size), 0, 0, num_rows, num_cols, num_nnz, row_offset,
+        col_offset, as_hip_type(source->get_const_row_ptrs()),
+        as_hip_type(source->get_const_col_idxs()),
+        as_hip_type(source->get_const_values()),
+        as_hip_type(result->get_const_row_ptrs()),
+        as_hip_type(result->get_col_idxs()), as_hip_type(result->get_values()));
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
diff --git a/hip/test/matrix/CMakeLists.txt b/hip/test/matrix/CMakeLists.txt
index 1759cd1ed28..ecc6df25b17 100644
--- a/hip/test/matrix/CMakeLists.txt
+++ b/hip/test/matrix/CMakeLists.txt
@@ -1,5 +1,6 @@
 ginkgo_create_hip_test(coo_kernels)
 ginkgo_create_hip_test(csr_kernels)
+ginkgo_create_hip_test(csr_kernels2)
 ginkgo_create_hip_test(dense_kernels)
 ginkgo_create_hip_test(diagonal_kernels)
 ginkgo_create_hip_test(ell_kernels)
diff --git a/hip/test/matrix/csr_kernels2.hip.cpp b/hip/test/matrix/csr_kernels2.hip.cpp
new file mode 100644
index 00000000000..d12759936a4
--- /dev/null
+++ b/hip/test/matrix/csr_kernels2.hip.cpp
@@ -0,0 +1,139 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <algorithm>
+#include <numeric>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+
+#include "core/matrix/csr_kernels.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/unsort_matrix.hpp"
+
+
+namespace {
+
+
+class Csr : public ::testing::Test {
+protected:
+    using Arr = gko::Array<int>;
+    using Mtx = gko::matrix::Csr<>;
+    using Vec = gko::matrix::Dense<>;
+
+    Csr()
+#ifdef GINKGO_FAST_TESTS
+        : mtx_size(152, 185),
+#else
+        : mtx_size(532, 231),
+#endif
+          rand_engine(42)
+    {}
+
+    void SetUp()
+    {
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+
+    void TearDown()
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    template <typename MtxType>
+    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
+                                     int min_nnz_row, int max_nnz_row)
+    {
+        return gko::test::generate_random_matrix<MtxType>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(min_nnz_row, max_nnz_row),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    template <typename MtxType>
+    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
+                                     int min_nnz_row)
+    {
+        return gen_mtx<MtxType>(num_rows, num_cols, min_nnz_row, num_cols);
+    }
+
+    void set_up_apply_data(int num_vectors = 1)
+    {
+        mtx = Mtx::create(ref);
+        mtx->copy_from(gen_mtx<Vec>(mtx_size[0], mtx_size[1], 1));
+        dmtx = Mtx::create(hip);
+        dmtx->copy_from(mtx.get());
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    const gko::dim<2> mtx_size;
+    std::ranlux48 rand_engine;
+    std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> dmtx;
+};
+
+
+TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+
+    gko::span rspan{36, 98};
+    gko::span cspan{26, 104};
+    auto smat1 = this->mtx->create_submatrix(rspan, cspan);
+    auto sdmat1 = this->dmtx->create_submatrix(rspan, cspan);
+    auto hmat = Mtx::create(this->ref);
+    hmat->copy_from(sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
+}
+
+
+}  // namespace
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index 72882f18469..0b8a023b8ea 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -989,27 +989,21 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
      * Creates a CSR matrix from already allocated (and initialized) row
      * pointer array.
      *
-     * @tparam RowPtrsArray  type of `row_ptrs` array
      *
      * @param exec  Executor associated to the matrix
      * @param size  size of the matrix
      * @param row_ptrs  array of row pointers
-     *
-     * @note If `row_ptrs`, is not an rvalue, not
-     *       an array of IndexType, IndexType and ValueType, respectively, or
-     *       is on the wrong executor, an internal copy of that array will be
-     *       created, and the original array data will not be used in the
-     *       matrix.
      */
     explicit Csr(
         std::shared_ptr<const Executor> exec, const dim<2>& size,
         Array<IndexType>&& row_ptrs,
         std::shared_ptr<strategy_type> strategy = std::make_shared<sparselib>())
         : EnableLinOp<Csr>(exec, size),
-          row_ptrs_{exec, std::forward<Array<IndexType>>(row_ptrs)},
+          row_ptrs_{exec, std::move(row_ptrs)},
           srow_(exec),
           strategy_(strategy->copy())
     {
+        GKO_ASSERT(row_ptrs_.get_num_elems() == size[0] + 1);
         auto num_nnz = exec->copy_val_to_host(row_ptrs_.get_data() + size[0]);
         values_ = Array<ValueType>(exec, num_nnz);
         col_idxs_ = Array<IndexType>(exec, num_nnz);
diff --git a/reference/test/matrix/csr_kernels2.cpp b/reference/test/matrix/csr_kernels2.cpp
index 84d1a5ca307..1682287d857 100644
--- a/reference/test/matrix/csr_kernels2.cpp
+++ b/reference/test/matrix/csr_kernels2.cpp
@@ -177,7 +177,7 @@ TYPED_TEST(Csr, CanGetSubmatrix2)
            4.5, 7.5
            4.5, 0.0
            2.5, 0.0
-           1.0, 3.5
+          -1.0, 3.5
            0.0, 0.0
         */
         auto ref4 = gko::initialize<Mtx>(

From 1d8f3a69bffb6cef7a800f9b0417aa860a192e9e Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 2 Nov 2021 13:41:02 +0100
Subject: [PATCH 08/14] Review updates.

Co-authored-by: Aditya Kashi <aditya.kashi@kit.edu>
---
 common/cuda_hip/matrix/csr_kernels.hpp.inc | 13 +----
 cuda/test/matrix/csr_kernels2.cpp          | 66 ++++++++++++++++++++--
 include/ginkgo/core/matrix/csr.hpp         |  1 -
 reference/matrix/csr_kernels.cpp           |  4 +-
 4 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
index d01c9cc4f09..41257c40de9 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -1051,18 +1051,6 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute_kernel(
 namespace kernel {
 
 
-template <typename IndexType>
-__global__ __launch_bounds__(default_block_size) void get_row_nnz_data(
-    const size_type num_rows, const IndexType* __restrict__ nnz_per_row,
-    IndexType* __restrict__ row_ptrs)
-{
-    const auto tidx = thread::get_thread_id_flat();
-    if (tidx < num_rows) {
-        row_ptrs[tidx] = nnz_per_row[tidx];
-    }
-}
-
-
 template <typename ValueType, typename IndexType>
 __global__
     __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals(
@@ -1077,6 +1065,7 @@ __global__
         ValueType* __restrict__ result_values)
 {
     const auto tidx = thread::get_thread_id_flat();
+    // for (int tidx = threadIdx.x; tidx < num_rows; tidx += blockDim.x) {
     if (tidx < num_rows) {
         size_type res_nnz = result_row_ptrs[tidx];
         for (size_type nnz = source_row_ptrs[row_offset + tidx];
diff --git a/cuda/test/matrix/csr_kernels2.cpp b/cuda/test/matrix/csr_kernels2.cpp
index 6e9673ea79c..0497ca8892a 100644
--- a/cuda/test/matrix/csr_kernels2.cpp
+++ b/cuda/test/matrix/csr_kernels2.cpp
@@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
@@ -65,7 +66,7 @@ class Csr : public ::testing::Test {
 
     Csr()
 #ifdef GINKGO_FAST_TESTS
-        : mtx_size(152, 185),
+        : mtx_size(131, 155),
 #else
         : mtx_size(532, 231),
 #endif
@@ -102,10 +103,10 @@ class Csr : public ::testing::Test {
         return gen_mtx<MtxType>(num_rows, num_cols, min_nnz_row, num_cols);
     }
 
-    void set_up_apply_data(int num_vectors = 1)
+    void set_up_apply_data()
     {
         mtx = Mtx::create(ref);
-        mtx->copy_from(gen_mtx<Vec>(mtx_size[0], mtx_size[1], 1));
+        mtx->copy_from(gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 5));
         dmtx = Mtx::create(cuda);
         dmtx->copy_from(mtx.get());
     }
@@ -120,13 +121,68 @@ class Csr : public ::testing::Test {
 };
 
 
+TEST_F(Csr, CalculateNnzPerRowIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    row_nnz.fill(gko::zero<int>());
+    auto drow_nnz = gko::Array<int>(this->cuda, row_nnz);
+
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx.get(), rspan, cspan, &row_nnz);
+    gko::kernels::cuda::csr::calculate_nonzeros_per_row_in_span(
+        this->cuda, this->dmtx.get(), rspan, cspan, &drow_nnz);
+
+    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
+}
+
+
+TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    row_nnz.fill(gko::zero<int>());
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx.get(), rspan, cspan, &row_nnz);
+    gko::kernels::reference::components::prefix_sum(
+        this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
+    auto drow_nnz = gko::Array<int>(this->cuda, row_nnz);
+    auto smat1 =
+        Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(row_nnz));
+    auto sdmat1 =
+        Mtx::create(this->cuda, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(drow_nnz));
+
+
+    gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx.get(),
+                                                    rspan, cspan, smat1.get());
+    gko::kernels::cuda::csr::compute_submatrix(this->cuda, this->dmtx.get(),
+                                               rspan, cspan, sdmat1.get());
+    auto hmat = Mtx::create(this->ref);
+    hmat->copy_from(sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
+}
+
+
 TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
 {
     using Mtx = gko::matrix::Csr<>;
     set_up_apply_data();
 
-    gko::span rspan{36, 98};
-    gko::span cspan{26, 104};
+    gko::span rspan{47, 81};
+    gko::span cspan{2, 31};
     auto smat1 = this->mtx->create_submatrix(rspan, cspan);
     auto sdmat1 = this->dmtx->create_submatrix(rspan, cspan);
     auto hmat = Mtx::create(this->ref);
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index 0b8a023b8ea..cfd8ab8dbe2 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -989,7 +989,6 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
      * Creates a CSR matrix from already allocated (and initialized) row
      * pointer array.
      *
-     *
      * @param exec  Executor associated to the matrix
      * @param size  size of the matrix
      * @param row_ptrs  array of row pointers
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index 72420a5fa18..dd5a269bc3a 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -643,8 +643,8 @@ void calculate_nonzeros_per_row_in_span(
     for (size_type row = row_span.begin; row < row_span.end; ++row) {
         for (size_type col = source->get_const_row_ptrs()[row];
              col < source->get_const_row_ptrs()[row + 1]; ++col) {
-            if (source->get_const_col_idxs()[col] >= col_span.begin &&
-                source->get_const_col_idxs()[col] < col_span.end) {
+            if (source->get_const_col_idxs()[col] < col_span.end &&
+                source->get_const_col_idxs()[col] >= col_span.begin) {
                 row_nnz->get_data()[res_row]++;
             }
         }

From 476f74fd8756a58dd39e70014eedd3e95b7deb2c Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 2 Nov 2021 14:22:19 +0100
Subject: [PATCH 09/14] Move tests back to csr_kernels

---
 cuda/test/matrix/CMakeLists.txt        |   1 -
 cuda/test/matrix/csr_kernels.cpp       |  83 +++++++++++
 cuda/test/matrix/csr_kernels2.cpp      | 195 -------------------------
 hip/test/matrix/CMakeLists.txt         |   1 -
 hip/test/matrix/csr_kernels.hip.cpp    |  82 +++++++++++
 hip/test/matrix/csr_kernels2.hip.cpp   | 139 ------------------
 omp/test/matrix/CMakeLists.txt         |   1 -
 omp/test/matrix/csr_kernels.cpp        |  79 ++++++++++
 omp/test/matrix/csr_kernels2.cpp       | 138 -----------------
 reference/test/matrix/CMakeLists.txt   |   1 -
 reference/test/matrix/csr_kernels.cpp  |  77 ++++++++++
 reference/test/matrix/csr_kernels2.cpp | 193 ------------------------
 12 files changed, 321 insertions(+), 669 deletions(-)
 delete mode 100644 cuda/test/matrix/csr_kernels2.cpp
 delete mode 100644 hip/test/matrix/csr_kernels2.hip.cpp
 delete mode 100644 omp/test/matrix/csr_kernels2.cpp
 delete mode 100644 reference/test/matrix/csr_kernels2.cpp

diff --git a/cuda/test/matrix/CMakeLists.txt b/cuda/test/matrix/CMakeLists.txt
index fa0e20c1b74..be2ebd7ef03 100644
--- a/cuda/test/matrix/CMakeLists.txt
+++ b/cuda/test/matrix/CMakeLists.txt
@@ -1,6 +1,5 @@
 ginkgo_create_test(coo_kernels)
 ginkgo_create_test(csr_kernels)
-ginkgo_create_test(csr_kernels2)
 ginkgo_create_test(fbcsr_kernels)
 ginkgo_create_test(dense_kernels)
 ginkgo_create_test(diagonal_kernels)
diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp
index 0847ac2235b..a6f6843b71f 100644
--- a/cuda/test/matrix/csr_kernels.cpp
+++ b/cuda/test/matrix/csr_kernels.cpp
@@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "cuda/test/utils.hpp"
@@ -100,6 +101,14 @@ class Csr : public ::testing::Test {
             std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
     }
 
+    void set_up_apply_data()
+    {
+        mtx2 = Mtx::create(ref);
+        mtx2->copy_from(gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 5));
+        dmtx2 = Mtx::create(cuda);
+        dmtx2->copy_from(mtx2.get());
+    }
+
     void set_up_apply_data(std::shared_ptr<Mtx::strategy_type> strategy,
                            int num_vectors = 1)
     {
@@ -147,6 +156,7 @@ class Csr : public ::testing::Test {
         dmtx->copy_from(mtx.get());
     }
 
+
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
 
@@ -154,6 +164,7 @@ class Csr : public ::testing::Test {
     std::ranlux48 rand_engine;
 
     std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> mtx2;
     std::unique_ptr<ComplexMtx> complex_mtx;
     std::unique_ptr<Mtx> square_mtx;
     std::unique_ptr<Vec> expected;
@@ -162,6 +173,7 @@ class Csr : public ::testing::Test {
     std::unique_ptr<Vec> beta;
 
     std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Mtx> dmtx2;
     std::unique_ptr<ComplexMtx> complex_dmtx;
     std::unique_ptr<Mtx> square_dmtx;
     std::unique_ptr<Vec> dresult;
@@ -926,4 +938,75 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, CalculateNnzPerRowIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    row_nnz.fill(gko::zero<int>());
+    auto drow_nnz = gko::Array<int>(this->cuda, row_nnz);
+
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::cuda::csr::calculate_nonzeros_per_row_in_span(
+        this->cuda, this->dmtx2.get(), rspan, cspan, &drow_nnz);
+
+    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
+}
+
+
+TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    row_nnz.fill(gko::zero<int>());
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::reference::components::prefix_sum(
+        this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
+    auto drow_nnz = gko::Array<int>(this->cuda, row_nnz);
+    auto smat1 =
+        Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(row_nnz));
+    auto sdmat1 =
+        Mtx::create(this->cuda, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(drow_nnz));
+
+
+    gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(),
+                                                    rspan, cspan, smat1.get());
+    gko::kernels::cuda::csr::compute_submatrix(this->cuda, this->dmtx2.get(),
+                                               rspan, cspan, sdmat1.get());
+    auto hmat = Mtx::create(this->ref);
+    hmat->copy_from(sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
+}
+
+
+TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+
+    gko::span rspan{47, 81};
+    gko::span cspan{2, 31};
+    auto smat1 = this->mtx2->create_submatrix(rspan, cspan);
+    auto sdmat1 = this->dmtx2->create_submatrix(rspan, cspan);
+    auto hmat = Mtx::create(this->ref);
+    hmat->copy_from(sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
+}
+
+
 }  // namespace
diff --git a/cuda/test/matrix/csr_kernels2.cpp b/cuda/test/matrix/csr_kernels2.cpp
deleted file mode 100644
index 0497ca8892a..00000000000
--- a/cuda/test/matrix/csr_kernels2.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2021, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <algorithm>
-#include <numeric>
-#include <random>
-#include <vector>
-
-
-#include <gtest/gtest.h>
-
-
-#include <ginkgo/core/base/exception.hpp>
-#include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/diagonal.hpp>
-#include <ginkgo/core/matrix/identity.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-
-#include "core/components/prefix_sum.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/test/utils.hpp"
-#include "core/test/utils/unsort_matrix.hpp"
-
-
-namespace {
-
-
-class Csr : public ::testing::Test {
-protected:
-    using Arr = gko::Array<int>;
-    using Mtx = gko::matrix::Csr<>;
-    using Vec = gko::matrix::Dense<>;
-
-    Csr()
-#ifdef GINKGO_FAST_TESTS
-        : mtx_size(131, 155),
-#else
-        : mtx_size(532, 231),
-#endif
-          rand_engine(42)
-    {}
-
-    void SetUp()
-    {
-        ref = gko::ReferenceExecutor::create();
-        cuda = gko::CudaExecutor::create(0, ref);
-    }
-
-    void TearDown()
-    {
-        if (cuda != nullptr) {
-            ASSERT_NO_THROW(cuda->synchronize());
-        }
-    }
-
-    template <typename MtxType>
-    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
-                                     int min_nnz_row, int max_nnz_row)
-    {
-        return gko::test::generate_random_matrix<MtxType>(
-            num_rows, num_cols,
-            std::uniform_int_distribution<>(min_nnz_row, max_nnz_row),
-            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
-    }
-
-    template <typename MtxType>
-    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
-                                     int min_nnz_row)
-    {
-        return gen_mtx<MtxType>(num_rows, num_cols, min_nnz_row, num_cols);
-    }
-
-    void set_up_apply_data()
-    {
-        mtx = Mtx::create(ref);
-        mtx->copy_from(gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 5));
-        dmtx = Mtx::create(cuda);
-        dmtx->copy_from(mtx.get());
-    }
-
-    std::shared_ptr<gko::ReferenceExecutor> ref;
-    std::shared_ptr<const gko::CudaExecutor> cuda;
-
-    const gko::dim<2> mtx_size;
-    std::ranlux48 rand_engine;
-    std::unique_ptr<Mtx> mtx;
-    std::unique_ptr<Mtx> dmtx;
-};
-
-
-TEST_F(Csr, CalculateNnzPerRowIsEquivalentToRef)
-{
-    using Mtx = gko::matrix::Csr<>;
-    set_up_apply_data();
-
-    gko::span rspan{7, 51};
-    gko::span cspan{22, 88};
-    auto size = this->mtx->get_size();
-    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
-    row_nnz.fill(gko::zero<int>());
-    auto drow_nnz = gko::Array<int>(this->cuda, row_nnz);
-
-    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
-        this->ref, this->mtx.get(), rspan, cspan, &row_nnz);
-    gko::kernels::cuda::csr::calculate_nonzeros_per_row_in_span(
-        this->cuda, this->dmtx.get(), rspan, cspan, &drow_nnz);
-
-    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
-}
-
-
-TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
-{
-    using Mtx = gko::matrix::Csr<>;
-    set_up_apply_data();
-
-    gko::span rspan{7, 51};
-    gko::span cspan{22, 88};
-    auto size = this->mtx->get_size();
-    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
-    row_nnz.fill(gko::zero<int>());
-    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
-        this->ref, this->mtx.get(), rspan, cspan, &row_nnz);
-    gko::kernels::reference::components::prefix_sum(
-        this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
-    auto drow_nnz = gko::Array<int>(this->cuda, row_nnz);
-    auto smat1 =
-        Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
-                    std::move(row_nnz));
-    auto sdmat1 =
-        Mtx::create(this->cuda, gko::dim<2>(rspan.length(), cspan.length()),
-                    std::move(drow_nnz));
-
-
-    gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx.get(),
-                                                    rspan, cspan, smat1.get());
-    gko::kernels::cuda::csr::compute_submatrix(this->cuda, this->dmtx.get(),
-                                               rspan, cspan, sdmat1.get());
-    auto hmat = Mtx::create(this->ref);
-    hmat->copy_from(sdmat1.get());
-
-    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
-}
-
-
-TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
-{
-    using Mtx = gko::matrix::Csr<>;
-    set_up_apply_data();
-
-    gko::span rspan{47, 81};
-    gko::span cspan{2, 31};
-    auto smat1 = this->mtx->create_submatrix(rspan, cspan);
-    auto sdmat1 = this->dmtx->create_submatrix(rspan, cspan);
-    auto hmat = Mtx::create(this->ref);
-    hmat->copy_from(sdmat1.get());
-
-    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
-}
-
-
-}  // namespace
diff --git a/hip/test/matrix/CMakeLists.txt b/hip/test/matrix/CMakeLists.txt
index ecc6df25b17..1759cd1ed28 100644
--- a/hip/test/matrix/CMakeLists.txt
+++ b/hip/test/matrix/CMakeLists.txt
@@ -1,6 +1,5 @@
 ginkgo_create_hip_test(coo_kernels)
 ginkgo_create_hip_test(csr_kernels)
-ginkgo_create_hip_test(csr_kernels2)
 ginkgo_create_hip_test(dense_kernels)
 ginkgo_create_hip_test(diagonal_kernels)
 ginkgo_create_hip_test(ell_kernels)
diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp
index 6948d2b7320..332f87e3837 100644
--- a/hip/test/matrix/csr_kernels.hip.cpp
+++ b/hip/test/matrix/csr_kernels.hip.cpp
@@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "hip/test/utils.hip.hpp"
@@ -100,6 +101,14 @@ class Csr : public ::testing::Test {
             std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
     }
 
+    void set_up_apply_data()
+    {
+        mtx2 = Mtx::create(ref);
+        mtx2->copy_from(gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 5));
+        dmtx2 = Mtx::create(hip);
+        dmtx2->copy_from(mtx2.get());
+    }
+
     void set_up_apply_data(std::shared_ptr<Mtx::strategy_type> strategy,
                            int num_vectors = 1)
     {
@@ -154,6 +163,7 @@ class Csr : public ::testing::Test {
     std::ranlux48 rand_engine;
 
     std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> mtx2;
     std::unique_ptr<ComplexMtx> complex_mtx;
     std::unique_ptr<Mtx> square_mtx;
     std::unique_ptr<Vec> expected;
@@ -162,6 +172,7 @@ class Csr : public ::testing::Test {
     std::unique_ptr<Vec> beta;
 
     std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Mtx> dmtx2;
     std::unique_ptr<ComplexMtx> complex_dmtx;
     std::unique_ptr<Mtx> square_dmtx;
     std::unique_ptr<Vec> dresult;
@@ -937,4 +948,75 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, CalculateNnzPerRowIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    row_nnz.fill(gko::zero<int>());
+    auto drow_nnz = gko::Array<int>(this->hip, row_nnz);
+
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::hip::csr::calculate_nonzeros_per_row_in_span(
+        this->hip, this->dmtx2.get(), rspan, cspan, &drow_nnz);
+
+    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
+}
+
+
+TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    row_nnz.fill(gko::zero<int>());
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::reference::components::prefix_sum(
+        this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
+    auto drow_nnz = gko::Array<int>(this->hip, row_nnz);
+    auto smat1 =
+        Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(row_nnz));
+    auto sdmat1 =
+        Mtx::create(this->hip, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(drow_nnz));
+
+
+    gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(),
+                                                    rspan, cspan, smat1.get());
+    gko::kernels::hip::csr::compute_submatrix(this->hip, this->dmtx2.get(),
+                                              rspan, cspan, sdmat1.get());
+    auto hmat = Mtx::create(this->ref);
+    hmat->copy_from(sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
+}
+
+
+TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_apply_data();
+
+    gko::span rspan{47, 81};
+    gko::span cspan{2, 31};
+    auto smat1 = this->mtx2->create_submatrix(rspan, cspan);
+    auto sdmat1 = this->dmtx2->create_submatrix(rspan, cspan);
+    auto hmat = Mtx::create(this->ref);
+    hmat->copy_from(sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
+}
+
+
 }  // namespace
diff --git a/hip/test/matrix/csr_kernels2.hip.cpp b/hip/test/matrix/csr_kernels2.hip.cpp
deleted file mode 100644
index d12759936a4..00000000000
--- a/hip/test/matrix/csr_kernels2.hip.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2021, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <algorithm>
-#include <numeric>
-#include <random>
-#include <vector>
-
-
-#include <gtest/gtest.h>
-
-
-#include <ginkgo/core/base/exception.hpp>
-#include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/diagonal.hpp>
-#include <ginkgo/core/matrix/identity.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-
-#include "core/matrix/csr_kernels.hpp"
-#include "core/test/utils.hpp"
-#include "core/test/utils/unsort_matrix.hpp"
-
-
-namespace {
-
-
-class Csr : public ::testing::Test {
-protected:
-    using Arr = gko::Array<int>;
-    using Mtx = gko::matrix::Csr<>;
-    using Vec = gko::matrix::Dense<>;
-
-    Csr()
-#ifdef GINKGO_FAST_TESTS
-        : mtx_size(152, 185),
-#else
-        : mtx_size(532, 231),
-#endif
-          rand_engine(42)
-    {}
-
-    void SetUp()
-    {
-        ref = gko::ReferenceExecutor::create();
-        hip = gko::HipExecutor::create(0, ref);
-    }
-
-    void TearDown()
-    {
-        if (hip != nullptr) {
-            ASSERT_NO_THROW(hip->synchronize());
-        }
-    }
-
-    template <typename MtxType>
-    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
-                                     int min_nnz_row, int max_nnz_row)
-    {
-        return gko::test::generate_random_matrix<MtxType>(
-            num_rows, num_cols,
-            std::uniform_int_distribution<>(min_nnz_row, max_nnz_row),
-            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
-    }
-
-    template <typename MtxType>
-    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
-                                     int min_nnz_row)
-    {
-        return gen_mtx<MtxType>(num_rows, num_cols, min_nnz_row, num_cols);
-    }
-
-    void set_up_apply_data(int num_vectors = 1)
-    {
-        mtx = Mtx::create(ref);
-        mtx->copy_from(gen_mtx<Vec>(mtx_size[0], mtx_size[1], 1));
-        dmtx = Mtx::create(hip);
-        dmtx->copy_from(mtx.get());
-    }
-
-    std::shared_ptr<gko::ReferenceExecutor> ref;
-    std::shared_ptr<const gko::HipExecutor> hip;
-
-    const gko::dim<2> mtx_size;
-    std::ranlux48 rand_engine;
-    std::unique_ptr<Mtx> mtx;
-    std::unique_ptr<Mtx> dmtx;
-};
-
-
-TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
-{
-    using Mtx = gko::matrix::Csr<>;
-    set_up_apply_data();
-
-    gko::span rspan{36, 98};
-    gko::span cspan{26, 104};
-    auto smat1 = this->mtx->create_submatrix(rspan, cspan);
-    auto sdmat1 = this->dmtx->create_submatrix(rspan, cspan);
-    auto hmat = Mtx::create(this->ref);
-    hmat->copy_from(sdmat1.get());
-
-    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
-}
-
-
-}  // namespace
diff --git a/omp/test/matrix/CMakeLists.txt b/omp/test/matrix/CMakeLists.txt
index 7198ddf87ea..d2843b7e459 100644
--- a/omp/test/matrix/CMakeLists.txt
+++ b/omp/test/matrix/CMakeLists.txt
@@ -1,6 +1,5 @@
 ginkgo_create_test(coo_kernels)
 ginkgo_create_test(csr_kernels)
-ginkgo_create_test(csr_kernels2)
 ginkgo_create_test(dense_kernels)
 ginkgo_create_test(diagonal_kernels)
 ginkgo_create_test(ell_kernels)
diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp
index cea56e5e91f..11bfdf1dced 100644
--- a/omp/test/matrix/csr_kernels.cpp
+++ b/omp/test/matrix/csr_kernels.cpp
@@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
@@ -106,6 +107,14 @@ class Csr : public ::testing::Test {
         return gen_mtx<MtxType>(num_rows, num_cols, min_nnz_row, num_cols);
     }
 
+    void set_up_mat_data()
+    {
+        mtx2 = Mtx::create(ref);
+        mtx2->copy_from(gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 5));
+        dmtx2 = Mtx::create(omp);
+        dmtx2->copy_from(mtx2.get());
+    }
+
     void set_up_apply_data(int num_vectors = 1)
     {
         mtx = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 1);
@@ -158,6 +167,7 @@ class Csr : public ::testing::Test {
     std::ranlux48 rand_engine;
 
     std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> mtx2;
     std::unique_ptr<ComplexMtx> complex_mtx;
     std::unique_ptr<Mtx> square_mtx;
     std::unique_ptr<Vec> expected;
@@ -166,6 +176,7 @@ class Csr : public ::testing::Test {
     std::unique_ptr<Vec> beta;
 
     std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Mtx> dmtx2;
     std::unique_ptr<ComplexMtx> complex_dmtx;
     std::unique_ptr<Mtx> square_dmtx;
     std::unique_ptr<Vec> dresult;
@@ -700,4 +711,72 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, CalculateNnzPerRowIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_mat_data();
+
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    row_nnz.fill(gko::zero<int>());
+    auto drow_nnz = gko::Array<int>(this->omp, row_nnz);
+
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::omp::csr::calculate_nonzeros_per_row_in_span(
+        this->omp, this->dmtx2.get(), rspan, cspan, &drow_nnz);
+
+    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
+}
+
+
+TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<>;
+    set_up_mat_data();
+
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    row_nnz.fill(gko::zero<int>());
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::reference::components::prefix_sum(
+        this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
+    auto drow_nnz = gko::Array<int>(this->omp, row_nnz);
+    auto smat1 =
+        Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(row_nnz));
+    auto sdmat1 =
+        Mtx::create(this->omp, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(drow_nnz));
+
+
+    gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(),
+                                                    rspan, cspan, smat1.get());
+    gko::kernels::omp::csr::compute_submatrix(this->omp, this->dmtx2.get(),
+                                              rspan, cspan, sdmat1.get());
+    auto hmat = Mtx::create(this->ref);
+    hmat->copy_from(sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
+}
+
+
+TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
+{
+    set_up_mat_data();
+
+    gko::span rspan{36, 98};
+    gko::span cspan{26, 104};
+    auto smat1 = this->mtx2->create_submatrix(rspan, cspan);
+    auto sdmat1 = this->dmtx2->create_submatrix(rspan, cspan);
+
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
+}
+
+
 }  // namespace
diff --git a/omp/test/matrix/csr_kernels2.cpp b/omp/test/matrix/csr_kernels2.cpp
deleted file mode 100644
index c3dd39f2fb9..00000000000
--- a/omp/test/matrix/csr_kernels2.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2021, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <algorithm>
-#include <numeric>
-#include <random>
-#include <vector>
-
-
-#include <gtest/gtest.h>
-
-
-#include <ginkgo/core/base/exception.hpp>
-#include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/diagonal.hpp>
-#include <ginkgo/core/matrix/identity.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-
-#include "core/matrix/csr_kernels.hpp"
-#include "core/test/utils.hpp"
-#include "core/test/utils/unsort_matrix.hpp"
-
-
-namespace {
-
-
-class Csr : public ::testing::Test {
-protected:
-    using Arr = gko::Array<int>;
-    using Mtx = gko::matrix::Csr<>;
-    using Vec = gko::matrix::Dense<>;
-    using ComplexVec = gko::matrix::Dense<std::complex<double>>;
-    using ComplexMtx = gko::matrix::Csr<std::complex<double>>;
-
-    Csr()
-#ifdef GINKGO_FAST_TESTS
-        : mtx_size(152, 185),
-#else
-        : mtx_size(532, 231),
-#endif
-          rand_engine(42)
-    {}
-
-    void SetUp()
-    {
-        ref = gko::ReferenceExecutor::create();
-        omp = gko::OmpExecutor::create();
-    }
-
-    void TearDown()
-    {
-        if (omp != nullptr) {
-            ASSERT_NO_THROW(omp->synchronize());
-        }
-    }
-
-    template <typename MtxType>
-    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
-                                     int min_nnz_row, int max_nnz_row)
-    {
-        return gko::test::generate_random_matrix<MtxType>(
-            num_rows, num_cols,
-            std::uniform_int_distribution<>(min_nnz_row, max_nnz_row),
-            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
-    }
-
-    template <typename MtxType>
-    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
-                                     int min_nnz_row)
-    {
-        return gen_mtx<MtxType>(num_rows, num_cols, min_nnz_row, num_cols);
-    }
-
-    void set_up_apply_data(int num_vectors = 1)
-    {
-        mtx = Mtx::create(ref);
-        mtx->copy_from(gen_mtx<Vec>(mtx_size[0], mtx_size[1], 1));
-        dmtx = Mtx::create(omp);
-        dmtx->copy_from(mtx.get());
-    }
-
-    std::shared_ptr<gko::ReferenceExecutor> ref;
-    std::shared_ptr<const gko::OmpExecutor> omp;
-
-    const gko::dim<2> mtx_size;
-    std::ranlux48 rand_engine;
-    std::unique_ptr<Mtx> mtx;
-    std::unique_ptr<Mtx> dmtx;
-};
-
-
-TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    gko::span rspan{36, 98};
-    gko::span cspan{26, 104};
-    auto smat1 = this->mtx->create_submatrix(rspan, cspan);
-    auto sdmat1 = this->dmtx->create_submatrix(rspan, cspan);
-
-    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
-}
-
-
-}  // namespace
diff --git a/reference/test/matrix/CMakeLists.txt b/reference/test/matrix/CMakeLists.txt
index 977146062a7..9670a5df80c 100644
--- a/reference/test/matrix/CMakeLists.txt
+++ b/reference/test/matrix/CMakeLists.txt
@@ -1,6 +1,5 @@
 ginkgo_create_test(coo_kernels)
 ginkgo_create_test(csr_kernels)
-ginkgo_create_test(csr_kernels2)
 ginkgo_create_test(dense_kernels)
 ginkgo_create_test(diagonal_kernels)
 ginkgo_create_test(ell_kernels)
diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp
index 1d4a3957084..cce523eb10f 100644
--- a/reference/test/matrix/csr_kernels.cpp
+++ b/reference/test/matrix/csr_kernels.cpp
@@ -1624,4 +1624,81 @@ TYPED_TEST(CsrComplex, OutplaceAbsolute)
 }
 
 
+TYPED_TEST(Csr, CanGetSubmatrix)
+{
+    using Vec = typename TestFixture::Vec;
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    /* this->mtx
+     * 1   3   2
+     * 0   5   0
+     */
+    auto sub_mat =
+        this->mtx->create_submatrix(gko::span(0, 2), gko::span(0, 2));
+    auto ref =
+        gko::initialize<Mtx>({I<T>{1.0, 3.0}, I<T>{0.0, 5.0}}, this->exec);
+
+    GKO_ASSERT_MTX_NEAR(sub_mat.get(), ref.get(), 0.0);
+}
+
+
+TYPED_TEST(Csr, CanGetSubmatrix2)
+{
+    using Vec = typename TestFixture::Vec;
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto mat = gko::initialize<Mtx>(
+        {
+            // clang-format off
+            I<T>{1.0, 3.0, 4.5, 0.0, 2.0}, // 0
+            I<T>{1.0, 0.0, 4.5, 7.5, 3.0}, // 1
+            I<T>{0.0, 3.0, 4.5, 0.0, 2.0}, // 2
+            I<T>{0.0,-1.0, 2.5, 0.0, 2.0}, // 3
+            I<T>{1.0, 0.0,-1.0, 3.5, 1.0}, // 4
+            I<T>{0.0, 1.0, 0.0, 0.0, 2.0}, // 5
+            I<T>{0.0, 3.0, 0.0, 7.5, 1.0}  // 6
+                                           // clang-format on
+        },
+        this->exec);
+    ASSERT_EQ(mat->get_num_stored_elements(), 23);
+    {
+        auto sub_mat1 = mat->create_submatrix(gko::span(0, 2), gko::span(0, 2));
+        auto ref1 =
+            gko::initialize<Mtx>({I<T>{1.0, 3.0}, I<T>{1.0, 0.0}}, this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat1.get(), ref1.get(), 0.0);
+    }
+    {
+        auto sub_mat2 = mat->create_submatrix(gko::span(2, 4), gko::span(0, 2));
+        auto ref2 =
+            gko::initialize<Mtx>({I<T>{0.0, 3.0}, I<T>{0.0, -1.0}}, this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat2.get(), ref2.get(), 0.0);
+    }
+    {
+        auto sub_mat3 = mat->create_submatrix(gko::span(0, 2), gko::span(3, 5));
+        auto ref3 =
+            gko::initialize<Mtx>({I<T>{0.0, 2.0}, I<T>{7.5, 3.0}}, this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat3.get(), ref3.get(), 0.0);
+    }
+    {
+        auto sub_mat4 = mat->create_submatrix(gko::span(1, 6), gko::span(2, 4));
+        /*
+           4.5, 7.5
+           4.5, 0.0
+           2.5, 0.0
+          -1.0, 3.5
+           0.0, 0.0
+        */
+        auto ref4 = gko::initialize<Mtx>(
+            {I<T>{4.5, 7.5}, I<T>{4.5, 0.0}, I<T>{2.5, 0.0}, I<T>{-1.0, 3.5},
+             I<T>{0.0, 0.0}},
+            this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat4.get(), ref4.get(), 0.0);
+    }
+}
+
+
 }  // namespace
diff --git a/reference/test/matrix/csr_kernels2.cpp b/reference/test/matrix/csr_kernels2.cpp
deleted file mode 100644
index 1682287d857..00000000000
--- a/reference/test/matrix/csr_kernels2.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2021, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <algorithm>
-
-
-#include <gtest/gtest.h>
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception.hpp>
-#include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/matrix/diagonal.hpp>
-#include <ginkgo/core/matrix/ell.hpp>
-#include <ginkgo/core/matrix/hybrid.hpp>
-#include <ginkgo/core/matrix/identity.hpp>
-#include <ginkgo/core/matrix/sellp.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-
-#include "core/matrix/csr_kernels.hpp"
-#include "core/test/utils.hpp"
-
-
-namespace {
-
-
-template <typename ValueIndexType>
-class Csr : public ::testing::Test {
-protected:
-    using value_type =
-        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
-    using index_type =
-        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
-    using Coo = gko::matrix::Coo<value_type, index_type>;
-    using Mtx = gko::matrix::Csr<value_type, index_type>;
-    using Sellp = gko::matrix::Sellp<value_type, index_type>;
-    using SparsityCsr = gko::matrix::SparsityCsr<value_type, index_type>;
-    using Ell = gko::matrix::Ell<value_type, index_type>;
-    using Hybrid = gko::matrix::Hybrid<value_type, index_type>;
-    using Vec = gko::matrix::Dense<value_type>;
-    using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
-
-    Csr()
-        : exec(gko::ReferenceExecutor::create()),
-          mtx(Mtx::create(exec, gko::dim<2>{2, 3}, 4,
-                          std::make_shared<typename Mtx::load_balance>(2)))
-    {
-        this->create_mtx(mtx.get());
-    }
-
-    void create_mtx(Mtx* m)
-    {
-        value_type* v = m->get_values();
-        index_type* c = m->get_col_idxs();
-        index_type* r = m->get_row_ptrs();
-        auto* s = m->get_srow();
-        /*
-         * 1   3   2
-         * 0   5   0
-         */
-        r[0] = 0;
-        r[1] = 3;
-        r[2] = 4;
-        c[0] = 0;
-        c[1] = 1;
-        c[2] = 2;
-        c[3] = 1;
-        v[0] = 1.0;
-        v[1] = 3.0;
-        v[2] = 2.0;
-        v[3] = 5.0;
-        s[0] = 0;
-    }
-
-    std::shared_ptr<const gko::ReferenceExecutor> exec;
-    std::unique_ptr<Mtx> mtx;
-};
-
-TYPED_TEST_SUITE(Csr, gko::test::ValueIndexTypes);
-
-
-TYPED_TEST(Csr, CanGetSubmatrix)
-{
-    using Vec = typename TestFixture::Vec;
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    /* this->mtx
-     * 1   3   2
-     * 0   5   0
-     */
-    auto sub_mat =
-        this->mtx->create_submatrix(gko::span(0, 2), gko::span(0, 2));
-    auto ref =
-        gko::initialize<Mtx>({I<T>{1.0, 3.0}, I<T>{0.0, 5.0}}, this->exec);
-
-    GKO_ASSERT_MTX_NEAR(sub_mat.get(), ref.get(), 0.0);
-}
-
-
-TYPED_TEST(Csr, CanGetSubmatrix2)
-{
-    using Vec = typename TestFixture::Vec;
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto mat = gko::initialize<Mtx>(
-        {
-            // clang-format off
-            I<T>{1.0, 3.0, 4.5, 0.0, 2.0}, // 0
-            I<T>{1.0, 0.0, 4.5, 7.5, 3.0}, // 1
-            I<T>{0.0, 3.0, 4.5, 0.0, 2.0}, // 2
-            I<T>{0.0,-1.0, 2.5, 0.0, 2.0}, // 3
-            I<T>{1.0, 0.0,-1.0, 3.5, 1.0}, // 4
-            I<T>{0.0, 1.0, 0.0, 0.0, 2.0}, // 5
-            I<T>{0.0, 3.0, 0.0, 7.5, 1.0}  // 6
-                                           // clang-format on
-        },
-        this->exec);
-    ASSERT_EQ(mat->get_num_stored_elements(), 23);
-    {
-        auto sub_mat1 = mat->create_submatrix(gko::span(0, 2), gko::span(0, 2));
-        auto ref1 =
-            gko::initialize<Mtx>({I<T>{1.0, 3.0}, I<T>{1.0, 0.0}}, this->exec);
-
-        GKO_EXPECT_MTX_NEAR(sub_mat1.get(), ref1.get(), 0.0);
-    }
-    {
-        auto sub_mat2 = mat->create_submatrix(gko::span(2, 4), gko::span(0, 2));
-        auto ref2 =
-            gko::initialize<Mtx>({I<T>{0.0, 3.0}, I<T>{0.0, -1.0}}, this->exec);
-
-        GKO_EXPECT_MTX_NEAR(sub_mat2.get(), ref2.get(), 0.0);
-    }
-    {
-        auto sub_mat3 = mat->create_submatrix(gko::span(0, 2), gko::span(3, 5));
-        auto ref3 =
-            gko::initialize<Mtx>({I<T>{0.0, 2.0}, I<T>{7.5, 3.0}}, this->exec);
-
-        GKO_EXPECT_MTX_NEAR(sub_mat3.get(), ref3.get(), 0.0);
-    }
-    {
-        auto sub_mat4 = mat->create_submatrix(gko::span(1, 6), gko::span(2, 4));
-        /*
-           4.5, 7.5
-           4.5, 0.0
-           2.5, 0.0
-          -1.0, 3.5
-           0.0, 0.0
-        */
-        auto ref4 = gko::initialize<Mtx>(
-            {I<T>{4.5, 7.5}, I<T>{4.5, 0.0}, I<T>{2.5, 0.0}, I<T>{-1.0, 3.5},
-             I<T>{0.0, 0.0}},
-            this->exec);
-
-        GKO_EXPECT_MTX_NEAR(sub_mat4.get(), ref4.get(), 0.0);
-    }
-}
-
-
-}  // namespace

From fbfcee51847f91e672e86ab3cb09209a45f4d6b4 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Wed, 3 Nov 2021 11:19:35 +0100
Subject: [PATCH 10/14] Review update.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Fritz Göbel <goebel.fritz@gmail.com>
---
 common/cuda_hip/matrix/csr_kernels.hpp.inc | 1 -
 omp/matrix/csr_kernels.cpp                 | 6 +++---
 reference/matrix/csr_kernels.cpp           | 8 ++++----
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
index 41257c40de9..6007820e840 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -1065,7 +1065,6 @@ __global__
         ValueType* __restrict__ result_values)
 {
     const auto tidx = thread::get_thread_id_flat();
-    // for (int tidx = threadIdx.x; tidx < num_rows; tidx += blockDim.x) {
     if (tidx < num_rows) {
         size_type res_nnz = result_row_ptrs[tidx];
         for (size_type nnz = source_row_ptrs[row_offset + tidx];
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 64f10a63f04..332a37e3bd7 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -723,9 +723,9 @@ void calculate_nonzeros_per_row_in_span(
     const auto col_idxs = source->get_const_col_idxs();
 #pragma omp parallel for
     for (size_type row = row_span.begin; row < row_span.end; ++row) {
-        for (size_type col = row_ptrs[row]; col < row_ptrs[row + 1]; ++col) {
-            if (col_idxs[col] >= col_span.begin &&
-                col_idxs[col] < col_span.end) {
+        for (size_type nnz = row_ptrs[row]; nnz < row_ptrs[row + 1]; ++nnz) {
+            if (col_idxs[nnz] >= col_span.begin &&
+                col_idxs[nnz] < col_span.end) {
                 row_nnz->get_data()[row - row_span.begin]++;
             }
         }
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index dd5a269bc3a..f65242126c1 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -641,10 +641,10 @@ void calculate_nonzeros_per_row_in_span(
 {
     size_type res_row = 0;
     for (size_type row = row_span.begin; row < row_span.end; ++row) {
-        for (size_type col = source->get_const_row_ptrs()[row];
-             col < source->get_const_row_ptrs()[row + 1]; ++col) {
-            if (source->get_const_col_idxs()[col] < col_span.end &&
-                source->get_const_col_idxs()[col] >= col_span.begin) {
+        for (size_type nnz = source->get_const_row_ptrs()[row];
+             nnz < source->get_const_row_ptrs()[row + 1]; ++nnz) {
+            if (source->get_const_col_idxs()[nnz] < col_span.end &&
+                source->get_const_col_idxs()[nnz] >= col_span.begin) {
                 row_nnz->get_data()[res_row]++;
             }
         }

From 16a516592b0551ea44ef69a3d9971e79877c1e5c Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 8 Nov 2021 12:11:58 +0100
Subject: [PATCH 11/14] Review update.

Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
---
 common/cuda_hip/matrix/csr_kernels.hpp.inc |  1 +
 core/matrix/csr.cpp                        | 12 ++++-----
 cuda/test/matrix/csr_kernels.cpp           | 23 ++++++++---------
 hip/test/matrix/csr_kernels.hip.cpp        | 23 ++++++++---------
 include/ginkgo/core/base/lin_op.hpp        | 14 -----------
 include/ginkgo/core/matrix/csr.hpp         | 29 ++--------------------
 omp/matrix/csr_kernels.cpp                 |  1 +
 omp/test/matrix/csr_kernels.cpp            | 15 ++++++-----
 reference/matrix/csr_kernels.cpp           |  1 +
 9 files changed, 42 insertions(+), 77 deletions(-)

diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
index 6007820e840..fb63bf59e03 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -1090,6 +1090,7 @@ __global__
 {
     const auto tidx = thread::get_thread_id_flat() + row_span.begin;
     if (tidx < row_span.end) {
+        nnz_per_row[tidx - row_span.begin] = zero<IndexType>();
         for (size_type col = row_ptrs[tidx]; col < row_ptrs[tidx + 1]; ++col) {
             if (col_idxs[col] >= col_span.begin &&
                 col_idxs[col] < col_span.end) {
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index 0e015baabcb..ae683e11813 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -50,7 +50,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/components/absolute_array.hpp"
 #include "core/components/fill_array.hpp"
 #include "core/components/prefix_sum.hpp"
-#include "core/components/reduce_array.hpp"
 #include "core/matrix/csr_kernels.hpp"
 
 
@@ -90,7 +89,6 @@ GKO_REGISTER_OPERATION(is_sorted_by_column_index,
                        csr::is_sorted_by_column_index);
 GKO_REGISTER_OPERATION(extract_diagonal, csr::extract_diagonal);
 GKO_REGISTER_OPERATION(fill_array, components::fill_array);
-GKO_REGISTER_OPERATION(reduce_add_array, components::reduce_add_array);
 GKO_REGISTER_OPERATION(prefix_sum, components::prefix_sum);
 GKO_REGISTER_OPERATION(inplace_absolute_array,
                        components::inplace_absolute_array);
@@ -552,13 +550,15 @@ Csr<ValueType, IndexType>::create_submatrix(const gko::span& row_span,
     auto exec = this->get_executor();
     auto sub_mat_size = gko::dim<2>(row_span.length(), column_span.length());
     Array<IndexType> row_ptrs(exec, row_span.length() + 1);
-    exec->run(csr::make_fill_array(
-        row_ptrs.get_data(), row_ptrs.get_num_elems(), zero<IndexType>()));
     exec->run(csr::make_calculate_nonzeros_per_row_in_span(
         this, row_span, column_span, &row_ptrs));
     exec->run(csr::make_prefix_sum(row_ptrs.get_data(), row_span.length() + 1));
-    auto sub_mat = Mat::create(exec, sub_mat_size, std::move(row_ptrs),
-                               this->get_strategy());
+    auto num_nnz =
+        exec->copy_val_to_host(row_ptrs.get_data() + sub_mat_size[0]);
+    auto sub_mat = Mat::create(exec, sub_mat_size,
+                               std::move(Array<ValueType>(exec, num_nnz)),
+                               std::move(Array<IndexType>(exec, num_nnz)),
+                               std::move(row_ptrs), this->get_strategy());
     exec->run(csr::make_compute_submatrix(this, row_span, column_span,
                                           sub_mat.get()));
     sub_mat->make_srow();
diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp
index a6f6843b71f..26ee15a0ae0 100644
--- a/cuda/test/matrix/csr_kernels.cpp
+++ b/cuda/test/matrix/csr_kernels.cpp
@@ -938,16 +938,14 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
 }
 
 
-TEST_F(Csr, CalculateNnzPerRowIsEquivalentToRef)
+TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef)
 {
     using Mtx = gko::matrix::Csr<>;
     set_up_apply_data();
-
     gko::span rspan{7, 51};
     gko::span cspan{22, 88};
     auto size = this->mtx2->get_size();
     auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
-    row_nnz.fill(gko::zero<int>());
     auto drow_nnz = gko::Array<int>(this->cuda, row_nnz);
 
     gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
@@ -962,23 +960,28 @@ TEST_F(Csr, CalculateNnzPerRowIsEquivalentToRef)
 TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
 {
     using Mtx = gko::matrix::Csr<>;
+    using IndexType = int;
+    using ValueType = double;
     set_up_apply_data();
-
     gko::span rspan{7, 51};
     gko::span cspan{22, 88};
     auto size = this->mtx2->get_size();
     auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
-    row_nnz.fill(gko::zero<int>());
     gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
         this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
     gko::kernels::reference::components::prefix_sum(
         this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
+    auto num_nnz = row_nnz.get_data()[rspan.length()];
     auto drow_nnz = gko::Array<int>(this->cuda, row_nnz);
     auto smat1 =
         Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->ref, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->ref, num_nnz)),
                     std::move(row_nnz));
     auto sdmat1 =
         Mtx::create(this->cuda, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->cuda, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->cuda, num_nnz)),
                     std::move(drow_nnz));
 
 
@@ -986,10 +989,8 @@ TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
                                                     rspan, cspan, smat1.get());
     gko::kernels::cuda::csr::compute_submatrix(this->cuda, this->dmtx2.get(),
                                                rspan, cspan, sdmat1.get());
-    auto hmat = Mtx::create(this->ref);
-    hmat->copy_from(sdmat1.get());
 
-    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
 }
 
 
@@ -997,15 +998,13 @@ TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
 {
     using Mtx = gko::matrix::Csr<>;
     set_up_apply_data();
-
     gko::span rspan{47, 81};
     gko::span cspan{2, 31};
+
     auto smat1 = this->mtx2->create_submatrix(rspan, cspan);
     auto sdmat1 = this->dmtx2->create_submatrix(rspan, cspan);
-    auto hmat = Mtx::create(this->ref);
-    hmat->copy_from(sdmat1.get());
 
-    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
 }
 
 
diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp
index 332f87e3837..311aa01cafd 100644
--- a/hip/test/matrix/csr_kernels.hip.cpp
+++ b/hip/test/matrix/csr_kernels.hip.cpp
@@ -948,16 +948,14 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
 }
 
 
-TEST_F(Csr, CalculateNnzPerRowIsEquivalentToRef)
+TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef)
 {
     using Mtx = gko::matrix::Csr<>;
     set_up_apply_data();
-
     gko::span rspan{7, 51};
     gko::span cspan{22, 88};
     auto size = this->mtx2->get_size();
     auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
-    row_nnz.fill(gko::zero<int>());
     auto drow_nnz = gko::Array<int>(this->hip, row_nnz);
 
     gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
@@ -972,23 +970,28 @@ TEST_F(Csr, CalculateNnzPerRowIsEquivalentToRef)
 TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
 {
     using Mtx = gko::matrix::Csr<>;
+    using IndexType = int;
+    using ValueType = double;
     set_up_apply_data();
-
     gko::span rspan{7, 51};
     gko::span cspan{22, 88};
     auto size = this->mtx2->get_size();
     auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
-    row_nnz.fill(gko::zero<int>());
     gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
         this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
     gko::kernels::reference::components::prefix_sum(
         this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
+    auto num_nnz = row_nnz.get_data()[rspan.length()];
     auto drow_nnz = gko::Array<int>(this->hip, row_nnz);
     auto smat1 =
         Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->ref, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->ref, num_nnz)),
                     std::move(row_nnz));
     auto sdmat1 =
         Mtx::create(this->hip, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->hip, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->hip, num_nnz)),
                     std::move(drow_nnz));
 
 
@@ -996,10 +999,8 @@ TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
                                                     rspan, cspan, smat1.get());
     gko::kernels::hip::csr::compute_submatrix(this->hip, this->dmtx2.get(),
                                               rspan, cspan, sdmat1.get());
-    auto hmat = Mtx::create(this->ref);
-    hmat->copy_from(sdmat1.get());
 
-    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
 }
 
 
@@ -1007,15 +1008,13 @@ TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
 {
     using Mtx = gko::matrix::Csr<>;
     set_up_apply_data();
-
     gko::span rspan{47, 81};
     gko::span cspan{2, 31};
+
     auto smat1 = this->mtx2->create_submatrix(rspan, cspan);
     auto sdmat1 = this->dmtx2->create_submatrix(rspan, cspan);
-    auto hmat = Mtx::create(this->ref);
-    hmat->copy_from(sdmat1.get());
 
-    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
 }
 
 
diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp
index a43580c8d81..540fd6ebad3 100644
--- a/include/ginkgo/core/base/lin_op.hpp
+++ b/include/ginkgo/core/base/lin_op.hpp
@@ -654,20 +654,6 @@ class Preconditionable {
 };
 
 
-/**
- * A submatrix of the LinOp implementing this interface can be extracted.
- * The row and column spans must be in range of the size of the matrix.
- *
- * @ingroup LinOp
- */
-template <typename ConcreteType>
-class SubMatrixExtractable {
-public:
-    virtual std::unique_ptr<ConcreteType> create_submatrix(
-        const gko::span& row_span, const gko::span& column_span) const = 0;
-};
-
-
 /**
  * The diagonal of a LinOp can be extracted. It will be implemented by
  * DiagonalExtractable<ValueType>, so the class does not need to implement it.
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index cfd8ab8dbe2..94f7384438a 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -127,7 +127,6 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
             public ConvertibleTo<Sellp<ValueType, IndexType>>,
             public ConvertibleTo<SparsityCsr<ValueType, IndexType>>,
             public DiagonalExtractable<ValueType>,
-            public SubMatrixExtractable<Csr<ValueType, IndexType>>,
             public ReadableFromMatrixData<ValueType, IndexType>,
             public WritableToMatrixData<ValueType, IndexType>,
             public Transposable,
@@ -765,9 +764,9 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
     std::unique_ptr<Diagonal<ValueType>> extract_diagonal() const override;
 
     std::unique_ptr<Csr<ValueType, IndexType>> create_submatrix(
-        const gko::span& row_span, const gko::span& column_span) const override;
+        const gko::span& row_span, const gko::span& column_span) const;
 
-    std::unique_ptr<absolute_type> compute_absolute() const override;
+    std::unique_ptr<absolute_type> compute_absolute() const;
 
     void compute_absolute_inplace() override;
 
@@ -985,30 +984,6 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
           strategy_(strategy->copy())
     {}
 
-    /**
-     * Creates a CSR matrix from already allocated (and initialized) row
-     * pointer array.
-     *
-     * @param exec  Executor associated to the matrix
-     * @param size  size of the matrix
-     * @param row_ptrs  array of row pointers
-     */
-    explicit Csr(
-        std::shared_ptr<const Executor> exec, const dim<2>& size,
-        Array<IndexType>&& row_ptrs,
-        std::shared_ptr<strategy_type> strategy = std::make_shared<sparselib>())
-        : EnableLinOp<Csr>(exec, size),
-          row_ptrs_{exec, std::move(row_ptrs)},
-          srow_(exec),
-          strategy_(strategy->copy())
-    {
-        GKO_ASSERT(row_ptrs_.get_num_elems() == size[0] + 1);
-        auto num_nnz = exec->copy_val_to_host(row_ptrs_.get_data() + size[0]);
-        values_ = Array<ValueType>(exec, num_nnz);
-        col_idxs_ = Array<IndexType>(exec, num_nnz);
-        this->make_srow();
-    }
-
     /**
      * Creates a CSR matrix from already allocated (and initialized) row
      * pointer, column index and value arrays.
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 332a37e3bd7..19f16c8b543 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -723,6 +723,7 @@ void calculate_nonzeros_per_row_in_span(
     const auto col_idxs = source->get_const_col_idxs();
 #pragma omp parallel for
     for (size_type row = row_span.begin; row < row_span.end; ++row) {
+        row_nnz->get_data()[row - row_span.begin] = zero<IndexType>();
         for (size_type nnz = row_ptrs[row]; nnz < row_ptrs[row + 1]; ++nnz) {
             if (col_idxs[nnz] >= col_span.begin &&
                 col_idxs[nnz] < col_span.end) {
diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp
index 11bfdf1dced..d6712cfa7d6 100644
--- a/omp/test/matrix/csr_kernels.cpp
+++ b/omp/test/matrix/csr_kernels.cpp
@@ -711,11 +711,10 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
 }
 
 
-TEST_F(Csr, CalculateNnzPerRowIsEquivalentToRef)
+TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef)
 {
     using Mtx = gko::matrix::Csr<>;
     set_up_mat_data();
-
     gko::span rspan{7, 51};
     gko::span cspan{22, 88};
     auto size = this->mtx2->get_size();
@@ -735,8 +734,9 @@ TEST_F(Csr, CalculateNnzPerRowIsEquivalentToRef)
 TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
 {
     using Mtx = gko::matrix::Csr<>;
+    using IndexType = int;
+    using ValueType = double;
     set_up_mat_data();
-
     gko::span rspan{7, 51};
     gko::span cspan{22, 88};
     auto size = this->mtx2->get_size();
@@ -746,12 +746,17 @@ TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
         this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
     gko::kernels::reference::components::prefix_sum(
         this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
+    auto num_nnz = row_nnz.get_data()[rspan.length()];
     auto drow_nnz = gko::Array<int>(this->omp, row_nnz);
     auto smat1 =
         Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->ref, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->ref, num_nnz)),
                     std::move(row_nnz));
     auto sdmat1 =
         Mtx::create(this->omp, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->omp, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->omp, num_nnz)),
                     std::move(drow_nnz));
 
 
@@ -759,10 +764,8 @@ TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
                                                     rspan, cspan, smat1.get());
     gko::kernels::omp::csr::compute_submatrix(this->omp, this->dmtx2.get(),
                                               rspan, cspan, sdmat1.get());
-    auto hmat = Mtx::create(this->ref);
-    hmat->copy_from(sdmat1.get());
 
-    GKO_ASSERT_MTX_NEAR(hmat, smat1, 0.0);
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
 }
 
 
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index f65242126c1..a8dbefa3c11 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -641,6 +641,7 @@ void calculate_nonzeros_per_row_in_span(
 {
     size_type res_row = 0;
     for (size_type row = row_span.begin; row < row_span.end; ++row) {
+        row_nnz->get_data()[res_row] = zero<IndexType>();
         for (size_type nnz = source->get_const_row_ptrs()[row];
              nnz < source->get_const_row_ptrs()[row + 1]; ++nnz) {
             if (source->get_const_col_idxs()[nnz] < col_span.end &&

From 77087ad2a43a7f1e1a36ac741a198be76cca66b8 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 9 Nov 2021 18:43:58 +0100
Subject: [PATCH 12/14] Add dpcpp kernels + tests.

---
 dpcpp/matrix/csr_kernels.dp.cpp   | 88 ++++++++++++++++++++++++++++++-
 dpcpp/test/matrix/csr_kernels.cpp | 82 ++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+), 2 deletions(-)

diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 237545e2110..15fa13c488c 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -1625,11 +1625,79 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
 
 
+namespace kernel {
+
+
+template <typename IndexType>
+void calc_nnz_in_span(const span row_span, const span col_span,
+                      const IndexType* __restrict__ row_ptrs,
+                      const IndexType* __restrict__ col_idxs,
+                      IndexType* __restrict__ nnz_per_row,
+                      sycl::nd_item<3> item_ct1)
+{
+    const auto tidx = thread::get_thread_id_flat(item_ct1) + row_span.begin;
+    if (tidx < row_span.end) {
+        nnz_per_row[tidx - row_span.begin] = zero<IndexType>();
+        for (size_type col = row_ptrs[tidx]; col < row_ptrs[tidx + 1]; ++col) {
+            if (col_idxs[col] >= col_span.begin &&
+                col_idxs[col] < col_span.end) {
+                nnz_per_row[tidx - row_span.begin]++;
+            }
+        }
+    }
+}
+
+GKO_ENABLE_DEFAULT_HOST(calc_nnz_in_span, calc_nnz_in_span);
+
+
+template <typename ValueType, typename IndexType>
+void compute_submat(size_type num_rows, size_type num_cols, size_type num_nnz,
+                    size_type row_offset, size_type col_offset,
+                    const IndexType* __restrict__ src_row_ptrs,
+                    const IndexType* __restrict__ src_col_idxs,
+                    const ValueType* __restrict__ src_values,
+                    const IndexType* __restrict__ res_row_ptrs,
+                    IndexType* __restrict__ res_col_idxs,
+                    ValueType* __restrict__ res_values,
+                    sycl::nd_item<3> item_ct1)
+{
+    const auto tidx = thread::get_thread_id_flat(item_ct1);
+    if (tidx < num_rows) {
+        size_type res_nnz = res_row_ptrs[tidx];
+        for (size_type nnz = src_row_ptrs[row_offset + tidx];
+             nnz < src_row_ptrs[row_offset + tidx + 1]; ++nnz) {
+            if ((src_col_idxs[nnz] < (col_offset + num_cols) &&
+                 src_col_idxs[nnz] >= col_offset)) {
+                res_col_idxs[res_nnz] = src_col_idxs[nnz] - col_offset;
+                res_values[res_nnz] = src_values[nnz];
+                res_nnz++;
+            }
+        }
+    }
+}
+
+GKO_ENABLE_DEFAULT_HOST(compute_submat, compute_submat);
+
+
+}  // namespace kernel
+
+
 template <typename ValueType, typename IndexType>
 void calculate_nonzeros_per_row_in_span(
     std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
-    const span& col_span, Array<IndexType>* row_nnz) GKO_NOT_IMPLEMENTED;
+    const span& col_span, Array<IndexType>* row_nnz)
+{
+    const auto num_rows = source->get_size()[0];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto col_idxs = source->get_const_col_idxs();
+    auto grid_dim = ceildiv(row_span.length(), default_block_size);
+    auto block_dim = default_block_size;
+
+    kernel::calc_nnz_in_span(grid_dim, block_dim, 0, exec->get_queue(),
+                             row_span, col_span, row_ptrs, col_idxs,
+                             row_nnz->get_data());
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
@@ -1640,7 +1708,23 @@ void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
                        const matrix::Csr<ValueType, IndexType>* source,
                        gko::span row_span, gko::span col_span,
                        matrix::Csr<ValueType, IndexType>* result)
-    GKO_NOT_IMPLEMENTED;
+{
+    const auto row_offset = row_span.begin;
+    const auto col_offset = col_span.begin;
+    const auto num_rows = result->get_size()[0];
+    const auto num_cols = result->get_size()[1];
+    const auto row_ptrs = source->get_const_row_ptrs();
+
+    const auto num_nnz = source->get_num_stored_elements();
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+    auto block_dim = default_block_size;
+    kernel::compute_submat(
+        grid_dim, block_dim, 0, exec->get_queue(), num_rows, num_cols, num_nnz,
+        row_offset, col_offset, source->get_const_row_ptrs(),
+        source->get_const_col_idxs(), source->get_const_values(),
+        result->get_const_row_ptrs(), result->get_col_idxs(),
+        result->get_values());
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
diff --git a/dpcpp/test/matrix/csr_kernels.cpp b/dpcpp/test/matrix/csr_kernels.cpp
index b746d752db6..0ddcf4d84ca 100644
--- a/dpcpp/test/matrix/csr_kernels.cpp
+++ b/dpcpp/test/matrix/csr_kernels.cpp
@@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
@@ -102,6 +103,14 @@ class Csr : public ::testing::Test {
             std::normal_distribution<vtype>(-1.0, 1.0), rand_engine, ref);
     }
 
+    void set_up_apply_data()
+    {
+        mtx2 = Mtx::create(ref);
+        mtx2->copy_from(gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 5));
+        dmtx2 = Mtx::create(dpcpp);
+        dmtx2->copy_from(mtx2.get());
+    }
+
     void set_up_apply_data(std::shared_ptr<Mtx::strategy_type> strategy,
                            int num_vectors = 1)
     {
@@ -156,6 +165,7 @@ class Csr : public ::testing::Test {
     std::ranlux48 rand_engine;
 
     std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> mtx2;
     std::unique_ptr<ComplexMtx> complex_mtx;
     std::unique_ptr<Mtx> square_mtx;
     std::unique_ptr<Vec> expected;
@@ -164,6 +174,7 @@ class Csr : public ::testing::Test {
     std::unique_ptr<Vec> beta;
 
     std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Mtx> dmtx2;
     std::unique_ptr<ComplexMtx> complex_dmtx;
     std::unique_ptr<Mtx> square_dmtx;
     std::unique_ptr<Vec> dresult;
@@ -930,4 +941,75 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<vtype, int>;
+    set_up_apply_data();
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    auto drow_nnz = gko::Array<int>(this->dpcpp, row_nnz);
+
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::dpcpp::csr::calculate_nonzeros_per_row_in_span(
+        this->dpcpp, this->dmtx2.get(), rspan, cspan, &drow_nnz);
+
+    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
+}
+
+
+TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<vtype, int>;
+    using IndexType = int;
+    using ValueType = vtype;
+    set_up_apply_data();
+    gko::span rspan{7, 51};
+    gko::span cspan{22, 88};
+    auto size = this->mtx2->get_size();
+    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
+    row_nnz.fill(gko::zero<int>());
+    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
+        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
+    gko::kernels::reference::components::prefix_sum(
+        this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
+    auto num_nnz = row_nnz.get_data()[rspan.length()];
+    auto drow_nnz = gko::Array<int>(this->dpcpp, row_nnz);
+    auto smat1 =
+        Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->ref, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->ref, num_nnz)),
+                    std::move(row_nnz));
+    auto sdmat1 =
+        Mtx::create(this->dpcpp, gko::dim<2>(rspan.length(), cspan.length()),
+                    std::move(gko::Array<ValueType>(this->dpcpp, num_nnz)),
+                    std::move(gko::Array<IndexType>(this->dpcpp, num_nnz)),
+                    std::move(drow_nnz));
+
+
+    gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(),
+                                                    rspan, cspan, smat1.get());
+    gko::kernels::dpcpp::csr::compute_submatrix(this->dpcpp, this->dmtx2.get(),
+                                                rspan, cspan, sdmat1.get());
+
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
+}
+
+
+TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
+{
+    using Mtx = gko::matrix::Csr<vtype>;
+    set_up_apply_data();
+    gko::span rspan{47, 81};
+    gko::span cspan{2, 31};
+
+    auto smat1 = this->mtx2->create_submatrix(rspan, cspan);
+    auto sdmat1 = this->dmtx2->create_submatrix(rspan, cspan);
+
+    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
+}
+
+
 }  // namespace

From 57cc8d86a7c0e1ac73d8b243554dd1e94edbba3e Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 9 Nov 2021 22:45:05 +0100
Subject: [PATCH 13/14] Some more edge case tests

---
 dpcpp/matrix/csr_kernels.dp.cpp       | 24 +++++++++--------
 reference/test/matrix/csr_kernels.cpp | 38 ++++++++++++++++++++-------
 2 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 15fa13c488c..e8df9d6c675 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -1651,15 +1651,16 @@ GKO_ENABLE_DEFAULT_HOST(calc_nnz_in_span, calc_nnz_in_span);
 
 
 template <typename ValueType, typename IndexType>
-void compute_submat(size_type num_rows, size_type num_cols, size_type num_nnz,
-                    size_type row_offset, size_type col_offset,
-                    const IndexType* __restrict__ src_row_ptrs,
-                    const IndexType* __restrict__ src_col_idxs,
-                    const ValueType* __restrict__ src_values,
-                    const IndexType* __restrict__ res_row_ptrs,
-                    IndexType* __restrict__ res_col_idxs,
-                    ValueType* __restrict__ res_values,
-                    sycl::nd_item<3> item_ct1)
+void compute_submatrix_idxs_and_vals(size_type num_rows, size_type num_cols,
+                                     size_type num_nnz, size_type row_offset,
+                                     size_type col_offset,
+                                     const IndexType* __restrict__ src_row_ptrs,
+                                     const IndexType* __restrict__ src_col_idxs,
+                                     const ValueType* __restrict__ src_values,
+                                     const IndexType* __restrict__ res_row_ptrs,
+                                     IndexType* __restrict__ res_col_idxs,
+                                     ValueType* __restrict__ res_values,
+                                     sycl::nd_item<3> item_ct1)
 {
     const auto tidx = thread::get_thread_id_flat(item_ct1);
     if (tidx < num_rows) {
@@ -1676,7 +1677,8 @@ void compute_submat(size_type num_rows, size_type num_cols, size_type num_nnz,
     }
 }
 
-GKO_ENABLE_DEFAULT_HOST(compute_submat, compute_submat);
+GKO_ENABLE_DEFAULT_HOST(compute_submatrix_idxs_and_vals,
+                        compute_submatrix_idxs_and_vals);
 
 
 }  // namespace kernel
@@ -1718,7 +1720,7 @@ void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_nnz = source->get_num_stored_elements();
     auto grid_dim = ceildiv(num_rows, default_block_size);
     auto block_dim = default_block_size;
-    kernel::compute_submat(
+    kernel::compute_submatrix_idxs_and_vals(
         grid_dim, block_dim, 0, exec->get_queue(), num_rows, num_cols, num_nnz,
         row_offset, col_offset, source->get_const_row_ptrs(),
         source->get_const_col_idxs(), source->get_const_values(),
diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp
index cce523eb10f..216a4048e64 100644
--- a/reference/test/matrix/csr_kernels.cpp
+++ b/reference/test/matrix/csr_kernels.cpp
@@ -1649,15 +1649,13 @@ TYPED_TEST(Csr, CanGetSubmatrix2)
     using T = typename TestFixture::value_type;
     auto mat = gko::initialize<Mtx>(
         {
-            // clang-format off
-            I<T>{1.0, 3.0, 4.5, 0.0, 2.0}, // 0
-            I<T>{1.0, 0.0, 4.5, 7.5, 3.0}, // 1
-            I<T>{0.0, 3.0, 4.5, 0.0, 2.0}, // 2
-            I<T>{0.0,-1.0, 2.5, 0.0, 2.0}, // 3
-            I<T>{1.0, 0.0,-1.0, 3.5, 1.0}, // 4
-            I<T>{0.0, 1.0, 0.0, 0.0, 2.0}, // 5
-            I<T>{0.0, 3.0, 0.0, 7.5, 1.0}  // 6
-                                           // clang-format on
+            I<T>{1.0, 3.0, 4.5, 0.0, 2.0},   // 0
+            I<T>{1.0, 0.0, 4.5, 7.5, 3.0},   // 1
+            I<T>{0.0, 3.0, 4.5, 0.0, 2.0},   // 2
+            I<T>{0.0, -1.0, 2.5, 0.0, 2.0},  // 3
+            I<T>{1.0, 0.0, -1.0, 3.5, 1.0},  // 4
+            I<T>{0.0, 1.0, 0.0, 0.0, 2.0},   // 5
+            I<T>{0.0, 3.0, 0.0, 7.5, 1.0}    // 6
         },
         this->exec);
     ASSERT_EQ(mat->get_num_stored_elements(), 23);
@@ -1698,6 +1696,28 @@ TYPED_TEST(Csr, CanGetSubmatrix2)
 
         GKO_EXPECT_MTX_NEAR(sub_mat4.get(), ref4.get(), 0.0);
     }
+    {
+        auto sub_mat5 = mat->create_submatrix(gko::span(0, 7), gko::span(0, 5));
+        auto ref5 = gko::initialize<Mtx>(
+            {
+                I<T>{1.0, 3.0, 4.5, 0.0, 2.0},   // 0
+                I<T>{1.0, 0.0, 4.5, 7.5, 3.0},   // 1
+                I<T>{0.0, 3.0, 4.5, 0.0, 2.0},   // 2
+                I<T>{0.0, -1.0, 2.5, 0.0, 2.0},  // 3
+                I<T>{1.0, 0.0, -1.0, 3.5, 1.0},  // 4
+                I<T>{0.0, 1.0, 0.0, 0.0, 2.0},   // 5
+                I<T>{0.0, 3.0, 0.0, 7.5, 1.0}    // 6
+            },
+            this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat5.get(), ref5.get(), 0.0);
+    }
+    {
+        auto sub_mat7 = mat->create_submatrix(gko::span(0, 1), gko::span(0, 1));
+        auto ref7 = gko::initialize<Mtx>({I<T>{1.0}}, this->exec);
+
+        GKO_EXPECT_MTX_NEAR(sub_mat7.get(), ref7.get(), 0.0);
+    }
 }
 
 

From 7494456e80e2ea00bfd80ef4305e1914552e6485 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Thu, 11 Nov 2021 17:52:06 +0100
Subject: [PATCH 14/14] Review update.

Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
---
 common/cuda_hip/matrix/csr_kernels.hpp.inc | 31 +++++++++++-----------
 include/ginkgo/core/matrix/csr.hpp         |  2 +-
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
index fb63bf59e03..853a694ab9c 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -1064,14 +1064,15 @@ __global__
         IndexType* __restrict__ result_col_idxs,
         ValueType* __restrict__ result_values)
 {
-    const auto tidx = thread::get_thread_id_flat();
-    if (tidx < num_rows) {
-        size_type res_nnz = result_row_ptrs[tidx];
-        for (size_type nnz = source_row_ptrs[row_offset + tidx];
-             nnz < source_row_ptrs[row_offset + tidx + 1]; ++nnz) {
-            if ((source_col_idxs[nnz] < (col_offset + num_cols) &&
-                 source_col_idxs[nnz] >= col_offset)) {
-                result_col_idxs[res_nnz] = source_col_idxs[nnz] - col_offset;
+    const auto res_row = thread::get_thread_id_flat();
+    if (res_row < num_rows) {
+        const auto src_row = res_row + row_offset;
+        auto res_nnz = result_row_ptrs[res_row];
+        for (auto nnz = source_row_ptrs[src_row];
+             nnz < source_row_ptrs[src_row + 1]; ++nnz) {
+            const auto res_col = source_col_idxs[nnz] - col_offset;
+            if (res_col < num_cols && res_col >= 0) {
+                result_col_idxs[res_nnz] = res_col;
                 result_values[res_nnz] = source_values[nnz];
                 res_nnz++;
             }
@@ -1088,15 +1089,15 @@ __global__
         const IndexType* __restrict__ col_idxs,
         IndexType* __restrict__ nnz_per_row)
 {
-    const auto tidx = thread::get_thread_id_flat() + row_span.begin;
-    if (tidx < row_span.end) {
-        nnz_per_row[tidx - row_span.begin] = zero<IndexType>();
-        for (size_type col = row_ptrs[tidx]; col < row_ptrs[tidx + 1]; ++col) {
-            if (col_idxs[col] >= col_span.begin &&
-                col_idxs[col] < col_span.end) {
-                nnz_per_row[tidx - row_span.begin]++;
+    const auto src_row = thread::get_thread_id_flat() + row_span.begin;
+    if (src_row < row_span.end) {
+        IndexType nnz{};
+        for (auto i = row_ptrs[src_row]; i < row_ptrs[src_row + 1]; ++i) {
+            if (col_idxs[i] >= col_span.begin && col_idxs[i] < col_span.end) {
+                nnz++;
             }
         }
+        nnz_per_row[src_row - row_span.begin] = nnz;
     }
 }
 
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index 94f7384438a..02969dcc94d 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -766,7 +766,7 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
     std::unique_ptr<Csr<ValueType, IndexType>> create_submatrix(
         const gko::span& row_span, const gko::span& column_span) const;
 
-    std::unique_ptr<absolute_type> compute_absolute() const;
+    std::unique_ptr<absolute_type> compute_absolute() const override;
 
     void compute_absolute_inplace() override;