use two csr multiplication to generate coarse

need to improve it with less memory footprint
ginkgo-project · May 5, 2021 · 8b5901b · 8b5901b
1 parent bef4373
commit 8b5901b
Show file tree

Hide file tree

Showing 13 changed files with 24 additions and 530 deletions.
diff --git a/common/multigrid/amgx_pgm_kernels.hpp.inc b/common/multigrid/amgx_pgm_kernels.hpp.inc
@@ -225,106 +225,4 @@ __global__
 }
 
 
-template <typename IndexType>
-__global__ __launch_bounds__(default_block_size) void get_source_row_map_kernel(
-    const size_type source_nrows, const IndexType *__restrict__ agg_val,
-    const IndexType *__restrict__ source_row_ptrs,
-    IndexType *__restrict__ result_row_ptrs, IndexType *__restrict__ row_map)
-{
-    auto row = thread::get_thread_id_flat();
-    if (row >= source_nrows) {
-        return;
-    }
-    const auto num_elems = source_row_ptrs[row + 1] - source_row_ptrs[row];
-    const auto result_idx = agg_val[row];
-    // atomic_add returns the old value, so it can be the starting point.
-    row_map[row] = atomic_add(result_row_ptrs + result_idx, num_elems);
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void move_row_kernel(
-    const size_type source_nrows, const IndexType *__restrict__ agg_val,
-    const IndexType *__restrict__ row_map,
-    const IndexType *__restrict__ source_row_ptrs,
-    const IndexType *__restrict__ source_col_idxs,
-    const ValueType *__restrict__ source_values,
-    const IndexType *__restrict__ result_row_ptrs,
-    IndexType *__restrict__ result_col_idxs,
-    ValueType *__restrict__ result_values)
-{
-    auto row = thread::get_thread_id_flat();
-    if (row >= source_nrows) {
-        return;
-    }
-    auto result_i = result_row_ptrs[agg_val[row]] + row_map[row];
-    for (auto i = source_row_ptrs[row]; i < source_row_ptrs[row + 1];
-         i++, result_i++) {
-        result_col_idxs[result_i] = agg_val[source_col_idxs[i]];
-        result_values[result_i] = source_values[i];
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void merge_col_kernel(
-    const size_type nrows, const IndexType *__restrict__ temp_row_ptrs,
-    IndexType *__restrict__ temp_col_idxs, ValueType *__restrict__ temp_values,
-    IndexType *__restrict__ coarse_row_ptrs)
-{
-    auto row = thread::get_thread_id_flat();
-    if (row >= nrows) {
-        return;
-    }
-
-    IndexType num_elems = zero<IndexType>();
-    const auto start = temp_row_ptrs[row];
-    const auto end = temp_row_ptrs[row + 1];
-    IndexType col = temp_col_idxs[start];
-    ValueType value = temp_values[start];
-    for (auto i = start + 1; i < end; i++) {
-        const auto current_col = temp_col_idxs[i];
-        if (current_col != col) {
-            // apply to the original data. It is sorted, so the writing position
-            // is before read position
-            temp_col_idxs[start + num_elems] = col;
-            temp_values[start + num_elems] = value;
-            value = zero<ValueType>();
-            col = current_col;
-            num_elems++;
-        }
-        value += temp_values[i];
-    }
-    // If start != end, need to process the final column
-    if (start != end) {
-        temp_col_idxs[start + num_elems] = col;
-        temp_values[start + num_elems] = value;
-        num_elems++;
-    }
-    coarse_row_ptrs[row] = num_elems;
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void copy_to_coarse_kernel(
-    const size_type nrows, const IndexType *__restrict__ temp_row_ptrs,
-    const IndexType *__restrict__ temp_col_idxs,
-    const ValueType *__restrict__ temp_values,
-    const IndexType *__restrict__ coarse_row_ptrs,
-    IndexType *__restrict__ coarse_col_idxs,
-    ValueType *__restrict__ coarse_values)
-{
-    auto row = thread::get_thread_id_flat();
-    if (row >= nrows) {
-        return;
-    }
-    auto temp_i = temp_row_ptrs[row];
-    for (auto i = coarse_row_ptrs[row]; i < coarse_row_ptrs[row + 1]; i++) {
-        coarse_col_idxs[i] = temp_col_idxs[temp_i];
-        coarse_values[i] = temp_values[temp_i];
-        temp_i++;
-    }
-}
-
-
 }  // namespace kernel
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
@@ -1328,11 +1328,6 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);
 
-template <typename ValueType, typename IndexType>
-GKO_DECLARE_AMGX_PGM_GENERATE(ValueType, IndexType)
-GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
-
 
 }  // namespace amgx_pgm
 

diff --git a/core/multigrid/amgx_pgm.cpp b/core/multigrid/amgx_pgm.cpp
@@ -60,38 +60,13 @@ GKO_REGISTER_OPERATION(renumber, amgx_pgm::renumber);
 GKO_REGISTER_OPERATION(find_strongest_neighbor,
                        amgx_pgm::find_strongest_neighbor);
 GKO_REGISTER_OPERATION(assign_to_exist_agg, amgx_pgm::assign_to_exist_agg);
-GKO_REGISTER_OPERATION(amgx_pgm_generate, amgx_pgm::amgx_pgm_generate);
 GKO_REGISTER_OPERATION(fill_array, components::fill_array);
 GKO_REGISTER_OPERATION(fill_seq_array, components::fill_seq_array);
 
 
 }  // namespace amgx_pgm
 
 
-namespace {
-
-
-template <typename ValueType, typename IndexType>
-std::unique_ptr<LinOp> amgx_pgm_generate(
-    std::shared_ptr<const Executor> exec,
-    const matrix::Csr<ValueType, IndexType> *source,
-    const matrix::Csr<ValueType, IndexType> *prolong_op,
-    const matrix::Csr<ValueType, IndexType> *restrict_op)
-{
-    auto num_agg = prolong_op->get_size()[1];
-    auto coarse = matrix::Csr<ValueType, IndexType>::create(
-        exec, dim<2>{num_agg, num_agg}, 0, source->get_strategy());
-    auto temp = matrix::Csr<ValueType, IndexType>::create(
-        exec, dim<2>{num_agg, num_agg}, source->get_num_stored_elements());
-    exec->run(amgx_pgm::make_amgx_pgm_generate(source, prolong_op, restrict_op,
-                                               coarse.get(), temp.get()));
-    return std::move(coarse);
-}
-
-
-}  // namespace
-
-
 template <typename ValueType, typename IndexType>
 void AmgxPgm<ValueType, IndexType>::generate()
 {
@@ -160,7 +135,7 @@ void AmgxPgm<ValueType, IndexType>::generate()
     // Renumber the index
     exec->run(amgx_pgm::make_renumber(agg_, &num_agg));
 
-    auto coarse_dim = num_agg;
+    gko::dim<2>::dimension_type coarse_dim = num_agg;
     auto fine_dim = system_matrix_->get_size()[0];
     // TODO: prolong_op can be done with lightway format
     auto prolong_op = share(
@@ -175,8 +150,13 @@ void AmgxPgm<ValueType, IndexType>::generate()
     auto restrict_op = gko::as<matrix_type>(share(prolong_op->transpose()));
 
     // Construct the coarse matrix
-    auto coarse_matrix = share(amgx_pgm_generate(
-        exec, amgxpgm_op, prolong_op.get(), restrict_op.get()));
+    // TODO: use less memory footprint to improve it
+    auto coarse_matrix =
+        share(matrix_type::create(exec, gko::dim<2>{coarse_dim, coarse_dim}));
+    auto tmp = matrix_type::create(exec, gko::dim<2>{coarse_dim, fine_dim});
+    restrict_op->apply(amgxpgm_op, tmp.get());
+    tmp->apply(prolong_op.get(), coarse_matrix.get());
+
     this->set_multigrid_level(prolong_op, coarse_matrix, restrict_op);
 }
 

diff --git a/core/multigrid/amgx_pgm_kernels.hpp b/core/multigrid/amgx_pgm_kernels.hpp
@@ -78,15 +78,6 @@ namespace amgx_pgm {
         const matrix::Diagonal<ValueType> *diag, Array<IndexType> &agg, \
         Array<IndexType> &intermediate_agg)
 
-#define GKO_DECLARE_AMGX_PGM_GENERATE(ValueType, IndexType)   \
-    void amgx_pgm_generate(                                   \
-        std::shared_ptr<const DefaultExecutor> exec,          \
-        const matrix::Csr<ValueType, IndexType> *source,      \
-        const matrix::Csr<ValueType, IndexType> *prolong_op,  \
-        const matrix::Csr<ValueType, IndexType> *restrict_op, \
-        matrix::Csr<ValueType, IndexType> *coarse,            \
-        matrix::Csr<ValueType, IndexType> *temp)
-
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                    \
     template <typename IndexType>                                       \
     GKO_DECLARE_AMGX_PGM_MATCH_EDGE_KERNEL(IndexType);                  \
@@ -97,9 +88,7 @@ namespace amgx_pgm {
     template <typename ValueType, typename IndexType>                   \
     GKO_DECLARE_AMGX_PGM_FIND_STRONGEST_NEIGHBOR(ValueType, IndexType); \
     template <typename ValueType, typename IndexType>                   \
-    GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG(ValueType, IndexType);     \
-    template <typename ValueType, typename IndexType>                   \
-    GKO_DECLARE_AMGX_PGM_GENERATE(ValueType, IndexType)
+    GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG(ValueType, IndexType)
 
 
 }  // namespace amgx_pgm

diff --git a/cuda/multigrid/amgx_pgm_kernels.cu b/cuda/multigrid/amgx_pgm_kernels.cu
@@ -172,65 +172,6 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);
 
 
-template <typename ValueType, typename IndexType>
-void amgx_pgm_generate(std::shared_ptr<const CudaExecutor> exec,
-                       const matrix::Csr<ValueType, IndexType> *source,
-                       const matrix::Csr<ValueType, IndexType> *prolong_op,
-                       const matrix::Csr<ValueType, IndexType> *restrict_op,
-                       matrix::Csr<ValueType, IndexType> *coarse,
-                       matrix::Csr<ValueType, IndexType> *temp)
-{
-    const auto agg_const_val = prolong_op->get_const_col_idxs();
-    const auto source_nrows = source->get_size()[0];
-    const auto source_nnz = source->get_num_stored_elements();
-    const auto coarse_nrows = coarse->get_size()[0];
-    Array<IndexType> row_map(exec, source_nrows);
-    // fill coarse row pointer as zero
-    components::fill_array(exec, temp->get_row_ptrs(), coarse_nrows + 1,
-                           zero<IndexType>());
-    // compute each source row should be moved and also change column index
-    dim3 grid(ceildiv(source_nrows, default_block_size));
-    // agg source_row (for row size) coarse row source map
-    kernel::get_source_row_map_kernel<<<grid, default_block_size>>>(
-        source_nrows, agg_const_val, source->get_const_row_ptrs(),
-        temp->get_row_ptrs(), row_map.get_data());
-    // prefix sum of temp_row_ptrs
-    components::prefix_sum(exec, temp->get_row_ptrs(), coarse_nrows + 1);
-    // copy source -> to coarse and change column index
-    kernel::move_row_kernel<<<grid, default_block_size>>>(
-        source_nrows, agg_const_val, row_map.get_const_data(),
-        source->get_const_row_ptrs(), source->get_const_col_idxs(),
-        as_cuda_type(source->get_const_values()), temp->get_const_row_ptrs(),
-        temp->get_col_idxs(), as_cuda_type(temp->get_values()));
-    // sort csr
-    csr::sort_by_column_index(exec, temp);
-    // summation of the elements with same position
-    grid = ceildiv(coarse_nrows, default_block_size);
-    kernel::merge_col_kernel<<<grid, default_block_size>>>(
-        coarse_nrows, temp->get_const_row_ptrs(), temp->get_col_idxs(),
-        as_cuda_type(temp->get_values()), coarse->get_row_ptrs());
-    // build the coarse matrix
-    components::prefix_sum(exec, coarse->get_row_ptrs(), coarse_nrows + 1);
-    // prefix sum of coarse->get_row_ptrs
-    const auto coarse_nnz =
-        exec->copy_val_to_host(coarse->get_row_ptrs() + coarse_nrows);
-    // reallocate size of column and values
-    matrix::CsrBuilder<ValueType, IndexType> coarse_builder{coarse};
-    auto &coarse_col_idxs_array = coarse_builder.get_col_idx_array();
-    auto &coarse_vals_array = coarse_builder.get_value_array();
-    coarse_col_idxs_array.resize_and_reset(coarse_nnz);
-    coarse_vals_array.resize_and_reset(coarse_nnz);
-    // copy the result
-    kernel::copy_to_coarse_kernel<<<grid, default_block_size>>>(
-        coarse_nrows, temp->get_const_row_ptrs(), temp->get_const_col_idxs(),
-        as_cuda_type(temp->get_const_values()), coarse->get_const_row_ptrs(),
-        coarse_col_idxs_array.get_data(),
-        as_cuda_type(coarse_vals_array.get_data()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
-
-
 }  // namespace amgx_pgm
 }  // namespace cuda
 }  // namespace kernels

diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -288,44 +288,6 @@ TEST_F(AmgxPgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
 }
 
 
-TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
-{
-    initialize_data();
-    auto csr_coarse = Csr::create(ref, gko::dim<2>{n, n}, 0);
-    auto d_csr_coarse = Csr::create(cuda, gko::dim<2>{n, n}, 0);
-    auto csr_temp = Csr::create(ref, gko::dim<2>{n, n},
-                                system_mtx->get_num_stored_elements());
-    auto d_csr_temp = Csr::create(cuda, gko::dim<2>{n, n},
-                                  d_system_mtx->get_num_stored_elements());
-    index_type num_agg;
-    // renumber again
-    gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
-    auto prolong_op = Csr::create(ref, gko::dim<2>{m, n}, m);
-    for (int i = 0; i < m; i++) {
-        prolong_op->get_col_idxs()[i] = agg.get_const_data()[i];
-    }
-    std::iota(prolong_op->get_row_ptrs(), prolong_op->get_row_ptrs() + m + 1,
-              0);
-    std::fill_n(prolong_op->get_values(), m, gko::one<value_type>());
-    auto restrict_op = gko::as<Csr>(prolong_op->transpose());
-    auto d_prolong_op = Csr::create(cuda);
-    auto d_restrict_op = Csr::create(cuda);
-    d_prolong_op->copy_from(prolong_op.get());
-    d_restrict_op->copy_from(restrict_op.get());
-
-    gko::kernels::cuda::amgx_pgm::amgx_pgm_generate(
-        cuda, d_system_mtx.get(), d_prolong_op.get(), d_restrict_op.get(),
-        d_csr_coarse.get(), d_csr_temp.get());
-    gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
-        ref, system_mtx.get(), prolong_op.get(), restrict_op.get(),
-        csr_coarse.get(), csr_temp.get());
-
-    // it should be checked already in renumber
-    GKO_ASSERT_EQ(num_agg, n);
-    GKO_ASSERT_MTX_NEAR(d_csr_coarse, csr_coarse, 1e-14);
-}
-
-
 TEST_F(AmgxPgm, GenerateMgLevelIsEquivalentToRef)
 {
     initialize_data();

diff --git a/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp b/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp
@@ -100,18 +100,6 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);
 
 
-template <typename ValueType, typename IndexType>
-void amgx_pgm_generate(std::shared_ptr<const DpcppExecutor> exec,
-                       const matrix::Csr<ValueType, IndexType> *source,
-                       const matrix::Csr<ValueType, IndexType> *prolong_op,
-                       const matrix::Csr<ValueType, IndexType> *restrict_op,
-                       matrix::Csr<ValueType, IndexType> *coarse,
-                       matrix::Csr<ValueType, IndexType> *temp)
-    GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
-
-
 }  // namespace amgx_pgm
 }  // namespace dpcpp
 }  // namespace kernels