Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create submatrix for sparse matrix formats #885

Merged
Merged 14 commits on Nov 12, 2021
57 changes: 57 additions & 0 deletions common/cuda_hip/matrix/csr_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -1046,3 +1046,60 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute_kernel(
out_vals[out_begin + i] = in_vals[in_begin + i];
}
}


namespace kernel {


// Fills the column indices and values of the submatrix `result` with the
// entries of `source` that fall inside the selected row/column window.
// One thread handles one result row; `result_row_ptrs` must already contain
// valid (prefix-summed) row pointers.
template <typename ValueType, typename IndexType>
__global__
__launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals(
    const size_type num_rows, const size_type num_cols,
    const size_type num_nnz, const size_type row_offset,
    const size_type col_offset,
    const IndexType* __restrict__ source_row_ptrs,
    const IndexType* __restrict__ source_col_idxs,
    const ValueType* __restrict__ source_values,
    const IndexType* __restrict__ result_row_ptrs,
    IndexType* __restrict__ result_col_idxs,
    ValueType* __restrict__ result_values)
{
    const auto res_row = thread::get_thread_id_flat();
    if (res_row < num_rows) {
        const auto src_row = res_row + row_offset;
        // Next free slot in the result arrays for this row.
        auto res_nnz = result_row_ptrs[res_row];
        for (auto nnz = source_row_ptrs[src_row];
             nnz < source_row_ptrs[src_row + 1]; ++nnz) {
            // Cast the offset to IndexType so the subtraction stays signed;
            // otherwise res_col deduces to the unsigned size_type and the
            // `res_col >= 0` check below would be vacuously true.
            const auto res_col =
                source_col_idxs[nnz] - static_cast<IndexType>(col_offset);
            if (res_col >= 0 && res_col < static_cast<IndexType>(num_cols)) {
                result_col_idxs[res_nnz] = res_col;
                result_values[res_nnz] = source_values[nnz];
                res_nnz++;
            }
        }
    }
}


// Counts, for each row in `row_span`, how many stored entries have a column
// index inside `col_span`, writing the count to `nnz_per_row` (indexed
// relative to the start of the span). One thread handles one row.
template <typename IndexType>
__global__
__launch_bounds__(default_block_size) void calculate_nnz_per_row_in_span(
    const span row_span, const span col_span,
    const IndexType* __restrict__ row_ptrs,
    const IndexType* __restrict__ col_idxs,
    IndexType* __restrict__ nnz_per_row)
{
    const auto row = thread::get_thread_id_flat() + row_span.begin;
    if (row < row_span.end) {
        const auto begin = row_ptrs[row];
        const auto end = row_ptrs[row + 1];
        IndexType count{};
        for (auto idx = begin; idx < end; ++idx) {
            const auto col = col_idxs[idx];
            if (col >= col_span.begin && col < col_span.end) {
                ++count;
            }
        }
        nnz_per_row[row - row_span.begin] = count;
    }
}


} // namespace kernel
2 changes: 2 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType)
Expand Down
30 changes: 30 additions & 0 deletions core/matrix/csr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "core/components/absolute_array.hpp"
#include "core/components/fill_array.hpp"
#include "core/components/prefix_sum.hpp"
#include "core/matrix/csr_kernels.hpp"


Expand All @@ -69,6 +70,9 @@ GKO_REGISTER_OPERATION(convert_to_sellp, csr::convert_to_sellp);
GKO_REGISTER_OPERATION(calculate_total_cols, csr::calculate_total_cols);
GKO_REGISTER_OPERATION(convert_to_ell, csr::convert_to_ell);
GKO_REGISTER_OPERATION(convert_to_hybrid, csr::convert_to_hybrid);
GKO_REGISTER_OPERATION(calculate_nonzeros_per_row_in_span,
csr::calculate_nonzeros_per_row_in_span);
GKO_REGISTER_OPERATION(compute_submatrix, csr::compute_submatrix);
GKO_REGISTER_OPERATION(transpose, csr::transpose);
GKO_REGISTER_OPERATION(conj_transpose, csr::conj_transpose);
GKO_REGISTER_OPERATION(inv_symm_permute, csr::inv_symm_permute);
Expand All @@ -85,6 +89,7 @@ GKO_REGISTER_OPERATION(is_sorted_by_column_index,
csr::is_sorted_by_column_index);
GKO_REGISTER_OPERATION(extract_diagonal, csr::extract_diagonal);
GKO_REGISTER_OPERATION(fill_array, components::fill_array);
GKO_REGISTER_OPERATION(prefix_sum, components::prefix_sum);
GKO_REGISTER_OPERATION(inplace_absolute_array,
components::inplace_absolute_array);
GKO_REGISTER_OPERATION(outplace_absolute_array,
Expand Down Expand Up @@ -536,6 +541,31 @@ bool Csr<ValueType, IndexType>::is_sorted_by_column_index() const
}


// Extracts the submatrix spanned by `row_span` x `column_span` as a new Csr
// matrix on the same executor, inheriting this matrix's strategy.
template <typename ValueType, typename IndexType>
std::unique_ptr<Csr<ValueType, IndexType>>
Csr<ValueType, IndexType>::create_submatrix(const gko::span& row_span,
                                            const gko::span& column_span) const
{
    using Mat = Csr<ValueType, IndexType>;
    auto exec = this->get_executor();
    auto sub_mat_size = gko::dim<2>(row_span.length(), column_span.length());
    // Count the in-span nonzeros of each row, then prefix-sum the counts to
    // obtain the row pointers of the submatrix.
    Array<IndexType> row_ptrs(exec, sub_mat_size[0] + 1);
    exec->run(csr::make_calculate_nonzeros_per_row_in_span(
        this, row_span, column_span, &row_ptrs));
    exec->run(csr::make_prefix_sum(row_ptrs.get_data(), sub_mat_size[0] + 1));
    auto num_nnz =
        exec->copy_val_to_host(row_ptrs.get_data() + sub_mat_size[0]);
    // Temporaries already bind to the rvalue overload; wrapping them in
    // std::move is redundant.
    auto sub_mat =
        Mat::create(exec, sub_mat_size, Array<ValueType>(exec, num_nnz),
                    Array<IndexType>(exec, num_nnz), std::move(row_ptrs),
                    this->get_strategy());
    exec->run(csr::make_compute_submatrix(this, row_span, column_span,
                                          sub_mat.get()));
    sub_mat->make_srow();
    return sub_mat;
}


template <typename ValueType, typename IndexType>
std::unique_ptr<Diagonal<ValueType>>
Csr<ValueType, IndexType>::extract_diagonal() const
Expand Down
16 changes: 16 additions & 0 deletions core/matrix/csr_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,18 @@ namespace kernels {
const matrix::Csr<ValueType, IndexType>* source, \
Array<size_type>* result)

// Declares the kernel that counts, for each row in row_span, the stored
// entries whose column index lies inside col_span (result goes to row_nnz).
#define GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType) \
    void calculate_nonzeros_per_row_in_span( \
        std::shared_ptr<const DefaultExecutor> exec, \
        const matrix::Csr<ValueType, IndexType>* source, const span& row_span, \
        const span& col_span, Array<IndexType>* row_nnz)

// Declares the kernel that copies the in-span entries of `source` into
// `result`; `result` must already hold valid row pointers.
#define GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType) \
    void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec, \
                           const matrix::Csr<ValueType, IndexType>* source, \
                           gko::span row_span, gko::span col_span, \
                           matrix::Csr<ValueType, IndexType>* result)

#define GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType) \
void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec, \
matrix::Csr<ValueType, IndexType>* to_sort)
Expand Down Expand Up @@ -240,6 +252,10 @@ namespace kernels {
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_CALCULATE_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX(ValueType, IndexType); \
Expand Down
49 changes: 49 additions & 0 deletions cuda/matrix/csr_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1240,6 +1240,55 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL);


// CUDA dispatch for counting in-span nonzeros per row: launches one thread
// per row of `row_span`.
template <typename ValueType, typename IndexType>
void calculate_nonzeros_per_row_in_span(
    std::shared_ptr<const DefaultExecutor> exec,
    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
    const span& col_span, Array<IndexType>* row_nnz)
{
    auto row_ptrs = source->get_const_row_ptrs();
    auto col_idxs = source->get_const_col_idxs();
    auto grid_dim = ceildiv(row_span.length(), default_block_size);

    // An empty span would give a zero-sized grid, which is an invalid CUDA
    // launch configuration — skip the launch entirely in that case.
    if (grid_dim > 0) {
        kernel::
            calculate_nnz_per_row_in_span<<<grid_dim, default_block_size>>>(
                row_span, col_span, as_cuda_type(row_ptrs),
                as_cuda_type(col_idxs), as_cuda_type(row_nnz->get_data()));
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);


// CUDA dispatch for extracting the submatrix entries: one thread per result
// row (the kernel iterates over that row's source entries itself).
template <typename ValueType, typename IndexType>
void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
                       const matrix::Csr<ValueType, IndexType>* source,
                       gko::span row_span, gko::span col_span,
                       matrix::Csr<ValueType, IndexType>* result)
{
    const auto row_offset = row_span.begin;
    const auto col_offset = col_span.begin;
    const auto num_rows = result->get_size()[0];
    const auto num_cols = result->get_size()[1];
    const auto num_nnz = source->get_num_stored_elements();
    // The kernel assigns one thread per result row, so size the grid by the
    // number of rows; the previous nnz-based grid launched one thread per
    // source nonzero, far more than needed.
    const auto grid_dim = ceildiv(num_rows, default_block_size);
    // Guard against a zero-sized grid (invalid launch configuration) for
    // empty row spans.
    if (grid_dim > 0) {
        kernel::compute_submatrix_idxs_and_vals<<<grid_dim,
                                                  default_block_size>>>(
            num_rows, num_cols, num_nnz, row_offset, col_offset,
            as_cuda_type(source->get_const_row_ptrs()),
            as_cuda_type(source->get_const_col_idxs()),
            as_cuda_type(source->get_const_values()),
            as_cuda_type(result->get_const_row_ptrs()),
            as_cuda_type(result->get_col_idxs()),
            as_cuda_type(result->get_values()));
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);


template <typename ValueType, typename IndexType>
void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* source,
Expand Down
82 changes: 82 additions & 0 deletions cuda/test/matrix/csr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <ginkgo/core/matrix/sparsity_csr.hpp>


#include "core/components/prefix_sum.hpp"
#include "core/matrix/csr_kernels.hpp"
#include "core/test/utils/unsort_matrix.hpp"
#include "cuda/test/utils.hpp"
Expand Down Expand Up @@ -100,6 +101,14 @@ class Csr : public ::testing::Test {
std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
}

// Generates a random host matrix (~5 nonzeros per row) and mirrors it on
// the CUDA executor; used by the submatrix tests (mtx2/dmtx2 pair).
void set_up_apply_data()
{
    mtx2 = Mtx::create(ref);
    mtx2->copy_from(gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 5));
    dmtx2 = Mtx::create(cuda);
    dmtx2->copy_from(mtx2.get());
}

void set_up_apply_data(std::shared_ptr<Mtx::strategy_type> strategy,
int num_vectors = 1)
{
Expand Down Expand Up @@ -147,13 +156,15 @@ class Csr : public ::testing::Test {
dmtx->copy_from(mtx.get());
}


std::shared_ptr<gko::ReferenceExecutor> ref;
std::shared_ptr<const gko::CudaExecutor> cuda;

const gko::dim<2> mtx_size;
std::ranlux48 rand_engine;

std::unique_ptr<Mtx> mtx;
std::unique_ptr<Mtx> mtx2;
std::unique_ptr<ComplexMtx> complex_mtx;
std::unique_ptr<Mtx> square_mtx;
std::unique_ptr<Vec> expected;
Expand All @@ -162,6 +173,7 @@ class Csr : public ::testing::Test {
std::unique_ptr<Vec> beta;

std::unique_ptr<Mtx> dmtx;
std::unique_ptr<Mtx> dmtx2;
std::unique_ptr<ComplexMtx> complex_dmtx;
std::unique_ptr<Mtx> square_dmtx;
std::unique_ptr<Vec> dresult;
Expand Down Expand Up @@ -926,4 +938,74 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
}


// Verifies that the CUDA per-row in-span nonzero count matches the
// reference kernel on the same matrix and spans.
TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef)
{
    set_up_apply_data();
    gko::span rspan{7, 51};
    gko::span cspan{22, 88};
    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
    auto drow_nnz = gko::Array<int>(this->cuda, row_nnz);

    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
    gko::kernels::cuda::csr::calculate_nonzeros_per_row_in_span(
        this->cuda, this->dmtx2.get(), rspan, cspan, &drow_nnz);

    GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
}


// Verifies that the CUDA compute_submatrix kernel produces the same
// submatrix as the reference kernel, given identical precomputed row
// pointers.
TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
{
    using Mtx = gko::matrix::Csr<>;
    using IndexType = int;
    using ValueType = double;
    set_up_apply_data();
    gko::span rspan{7, 51};
    gko::span cspan{22, 88};
    // Build the submatrix row pointers on the reference executor (count,
    // then prefix-sum) and copy them to the device.
    auto row_nnz = gko::Array<int>(this->ref, rspan.length() + 1);
    gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
        this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
    gko::kernels::reference::components::prefix_sum(
        this->ref, row_nnz.get_data(), row_nnz.get_num_elems());
    auto num_nnz = row_nnz.get_data()[rspan.length()];
    auto drow_nnz = gko::Array<int>(this->cuda, row_nnz);
    auto smat1 =
        Mtx::create(this->ref, gko::dim<2>(rspan.length(), cspan.length()),
                    std::move(gko::Array<ValueType>(this->ref, num_nnz)),
                    std::move(gko::Array<IndexType>(this->ref, num_nnz)),
                    std::move(row_nnz));
    auto sdmat1 =
        Mtx::create(this->cuda, gko::dim<2>(rspan.length(), cspan.length()),
                    std::move(gko::Array<ValueType>(this->cuda, num_nnz)),
                    std::move(gko::Array<IndexType>(this->cuda, num_nnz)),
                    std::move(drow_nnz));

    gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(),
                                                    rspan, cspan, smat1.get());
    gko::kernels::cuda::csr::compute_submatrix(this->cuda, this->dmtx2.get(),
                                               rspan, cspan, sdmat1.get());

    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
}


// Verifies the full create_submatrix entry point end-to-end: the CUDA
// result must match the reference result exactly.
TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
{
    set_up_apply_data();
    gko::span rspan{47, 81};
    gko::span cspan{2, 31};

    auto smat1 = this->mtx2->create_submatrix(rspan, cspan);
    auto sdmat1 = this->dmtx2->create_submatrix(rspan, cspan);

    GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
}


} // namespace
Loading