From b43bbe6f469186e23239cf6f331187519853f75b Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 8 Jul 2020 11:16:25 +0200
Subject: [PATCH 01/16] implement cuda amgx_pgm size2 except for generate and
 related test

---
 common/multigrid/amgx_pgm_kernels.hpp.inc | 279 ++++++++++++++++++
 cuda/multigrid/amgx_pgm_kernels.cu        |  83 +++++-
 cuda/test/CMakeLists.txt                  |   1 +
 cuda/test/multigrid/CMakeLists.txt        |   1 +
 cuda/test/multigrid/amgx_pgm_kernels.cpp  | 341 ++++++++++++++++++++++
 5 files changed, 693 insertions(+), 12 deletions(-)
 create mode 100644 common/multigrid/amgx_pgm_kernels.hpp.inc
 create mode 100644 cuda/test/multigrid/CMakeLists.txt
 create mode 100644 cuda/test/multigrid/amgx_pgm_kernels.cpp
diff --git a/common/multigrid/amgx_pgm_kernels.hpp.inc b/common/multigrid/amgx_pgm_kernels.hpp.inc
new file mode 100644
index 00000000000..6f460ad98c2
--- /dev/null
+++ b/common/multigrid/amgx_pgm_kernels.hpp.inc
@@ -0,0 +1,279 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+namespace kernel {
+
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void restrict_apply_kernel(
+    const IndexType *__restrict__ agg, const size_type num_rows,
+    const size_type num_rhs, const ValueType *__restrict__ b,
+    const size_type b_stride, ValueType *__restrict__ x,
+    const size_type x_stride)
+{
+    auto tidx = thread::get_thread_id_flat();
+    auto row = tidx / num_rhs;
+    if (row >= num_rows) {
+        return;
+    }
+    auto col = tidx % num_rhs;
+    auto ind = agg[row];
+    atomic_add(x + ind * x_stride + col, b[row * b_stride + col]);
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void prolong_applyadd_kernel(
+    const IndexType *__restrict__ agg, const size_type num_rows,
+    const size_type num_rhs, const ValueType *__restrict__ b,
+    const size_type b_stride, ValueType *__restrict__ x,
+    const size_type x_stride)
+{
+    auto tidx = thread::get_thread_id_flat();
+    auto row = tidx / num_rhs;
+    if (row >= num_rows) {
+        return;
+    }
+    auto col = tidx % num_rhs;
+    auto ind = agg[row];
+    x[row * x_stride + col] += b[ind * b_stride + col];
+}
+
+
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void replace_kernel(
+    size_type size, const IndexType *__restrict__ source,
+    IndexType *__restrict__ result)
+{
+    auto tidx = thread::get_thread_id_flat();
+    if (tidx >= size) {
+        return;
+    }
+
+    result[tidx] = source[tidx] == -1;
+}
+
+
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void match_edge_kernel(
+    size_type num, const IndexType *__restrict__ strongest_neighbor_vals,
+    IndexType *__restrict__ agg_vals)
+{
+    auto tidx = thread::get_thread_id_flat();
+    // Nothing to do past the end or for entries that are already aggregated.
+    if (tidx >= num || agg_vals[tidx] != -1) {
+        return;
+    }
+    auto neighbor = strongest_neighbor_vals[tidx];
+    // Mutual match: only the smaller index writes and becomes the agg point,
+    // so the two threads of a pair no longer race to store different roots.
+    if (neighbor != -1 && strongest_neighbor_vals[neighbor] == tidx &&
+        tidx <= neighbor) {
+        agg_vals[tidx] = tidx;
+        agg_vals[neighbor] = tidx;
+    }
+}
+
+
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void activate_kernel(
+    size_type num, const IndexType *__restrict__ agg,
+    IndexType *__restrict__ active_agg)
+{
+    auto tidx = thread::get_thread_id_flat();
+    if (tidx >= num) {
+        return;
+    }
+    active_agg[tidx] = agg[tidx] == -1;
+}
+
+
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_agg_kernel(
+    size_type num, const IndexType *__restrict__ index,
+    IndexType *__restrict__ result)
+{
+    auto tidx = thread::get_thread_id_flat();
+    if (tidx >= num) {
+        return;
+    }
+    result[index[tidx]] = 1;
+}
+
+
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void renumber_kernel(
+    size_type num, const IndexType *__restrict__ map,
+    IndexType *__restrict__ result)
+{
+    auto tidx = thread::get_thread_id_flat();
+    if (tidx >= num) {
+        return;
+    }
+    result[tidx] = map[result[tidx]];
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void find_strongest_neighbor_kernel(
+        const size_type num, const IndexType *__restrict__ row_ptrs,
+        const IndexType *__restrict__ col_idxs,
+        const ValueType *__restrict__ weight_vals,
+        const ValueType *__restrict__ diag, const size_type diag_stride,
+        IndexType *__restrict__ agg, IndexType *__restrict__ strongest_neighbor)
+{
+    auto row = thread::get_thread_id_flat();
+    if (row >= num) {
+        return;
+    }
+
+    auto max_weight_unagg = zero<ValueType>();
+    auto max_weight_agg = zero<ValueType>();
+    IndexType strongest_unagg = -1;
+    IndexType strongest_agg = -1;
+    if (agg[row] != -1) {
+        return;
+    }
+    for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
+        auto col = col_idxs[idx];
+        if (col == row) {
+            continue;
+        }
+        auto weight = weight_vals[idx] / max(abs(diag[row * diag_stride]),
+                                             abs(diag[col * diag_stride]));
+        if (agg[col] == -1 &&
+            (weight > max_weight_unagg ||
+             (weight == max_weight_unagg && col > strongest_unagg))) {
+            max_weight_unagg = weight;
+            strongest_unagg = col;
+        } else if (agg[col] != -1 &&
+                   (weight > max_weight_agg ||
+                    (weight == max_weight_agg && col > strongest_agg))) {
+            max_weight_agg = weight;
+            strongest_agg = col;
+        }
+    }
+
+    if (strongest_unagg == -1 && strongest_agg != -1) {
+        // all neighbors are aggregated: join the aggregate of the strongest
+        // one. Intended to be deterministic, since the aggregated neighbors
+        // should not pick this row as their strongest_neighbor.
+        // NOTE(review): agg is read and written in the same launch - confirm.
+        agg[row] = agg[strongest_agg];
+    } else if (strongest_unagg != -1) {
+        // record the strongest neighbor among the unaggregated ones
+        strongest_neighbor[row] = strongest_unagg;
+    } else {
+        // no neighbor at all: the row points to itself
+        strongest_neighbor[row] = row;
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void assign_to_exist_agg_kernel(
+        const size_type num, const IndexType *__restrict__ row_ptrs,
+        const IndexType *__restrict__ col_idxs,
+        const ValueType *__restrict__ weight_vals,
+        const ValueType *__restrict__ diag, const size_type diag_stride,
+        const IndexType *__restrict__ agg_const_val,
+        IndexType *__restrict__ agg_val)
+{
+    auto row = thread::get_thread_id_flat();
+    if (row >= num) {
+        return;
+    }
+    if (agg_val[row] != -1) {
+        return;
+    }
+    ValueType max_weight_agg = zero<ValueType>();
+    IndexType strongest_agg = -1;
+    for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
+        auto col = col_idxs[idx];
+        if (col == row) {
+            continue;
+        }
+        auto weight = weight_vals[idx] / max(abs(diag[row * diag_stride]),
+                                             abs(diag[col * diag_stride]));
+        if (agg_const_val[col] != -1 &&
+            (weight > max_weight_agg ||
+             (weight == max_weight_agg && col > strongest_agg))) {
+            max_weight_agg = weight;
+            strongest_agg = col;
+        }
+    }
+    if (strongest_agg != -1) {
+        agg_val[row] = agg_const_val[strongest_agg];
+    } else {
+        agg_val[row] = row;
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void merge_col_kernel(
+    const size_type num, const IndexType *__restrict__ row_ptrs,
+    const IndexType *__restrict__ source_col,
+    const ValueType *__restrict__ source_val,
+    IndexType *__restrict__ result_col, ValueType *__restrict__ result_val)
+{
+    auto row = thread::get_thread_id_flat();
+    if (row >= num) {
+        return;
+    }
+    const auto start = row_ptrs[row];
+    const auto end = row_ptrs[row + 1];
+    auto result_ind = start;
+    auto col = source_col[result_ind];
+    auto temp = source_val[result_ind];
+    for (size_type idx = start + 1; idx < end; idx++) {
+        auto temp_col = source_col[idx];
+        if (temp_col != col) {
+            result_col[result_ind] = col;
+            result_val[result_ind] = temp;
+            result_ind++;
+            col = temp_col;
+            temp = zero<ValueType>();
+        }
+        temp += source_val[idx];
+    }
+    // If start != end, need to process the final column
+    if (start != end) {
+        result_col[result_ind] = col;
+        result_val[result_ind] = temp;
+    }
+}
+
+
+}  // namespace kernel
\ No newline at end of file
diff --git a/cuda/multigrid/amgx_pgm_kernels.cu b/cuda/multigrid/amgx_pgm_kernels.cu
index 9153f120d93..abe32153f4e 100644
--- a/cuda/multigrid/amgx_pgm_kernels.cu
+++ b/cuda/multigrid/amgx_pgm_kernels.cu
@@ -45,10 +45,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/multigrid/amgx_pgm.hpp>
 
 
+#include "core/components/fill_array.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
-#include "cuda/solver/common_trs_kernels.cuh"
+#include "cuda/components/atomic.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
@@ -62,25 +66,58 @@ namespace cuda {
 namespace amgx_pgm {
 
 
+constexpr int default_block_size = 512;
+
+
+#include "common/multigrid/amgx_pgm_kernels.hpp.inc"
+
+
 template <typename IndexType>
 void match_edge(std::shared_ptr<const CudaExecutor> exec,
                 const Array<IndexType> &strongest_neighbor,
-                Array<IndexType> &agg) GKO_NOT_IMPLEMENTED;
+                Array<IndexType> &agg)
+{
+    const auto num = agg.get_num_elems();
+    const dim3 grid(ceildiv(num, default_block_size));
+    kernel::match_edge_kernel<<<grid, default_block_size>>>(
+        num, strongest_neighbor.get_const_data(), agg.get_data());
+}
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_MATCH_EDGE_KERNEL);
 
 
 template <typename IndexType>
 void count_unagg(std::shared_ptr<const CudaExecutor> exec,
-                 const Array<IndexType> &agg,
-                 IndexType *num_unagg) GKO_NOT_IMPLEMENTED;
+                 const Array<IndexType> &agg, IndexType *num_unagg)
+{
+    Array<IndexType> active_agg(exec, agg.get_num_elems());
+    const dim3 grid(ceildiv(active_agg.get_num_elems(), default_block_size));
+    kernel::activate_kernel<<<grid, default_block_size>>>(
+        active_agg.get_num_elems(), agg.get_const_data(),
+        active_agg.get_data());
+    *num_unagg = reduce_add_array(exec, active_agg.get_num_elems(),
+                                  active_agg.get_const_data());
+}
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_COUNT_UNAGG_KERNEL);
 
 
 template <typename IndexType>
 void renumber(std::shared_ptr<const CudaExecutor> exec, Array<IndexType> &agg,
-              IndexType *num_agg) GKO_NOT_IMPLEMENTED;
+              IndexType *num_agg)
+{
+    const auto num = agg.get_num_elems();
+    Array<IndexType> agg_map(exec, num + 1);
+    components::fill_array(exec, agg_map.get_data(), agg_map.get_num_elems(),
+                           zero<IndexType>());
+    const dim3 grid(ceildiv(num, default_block_size));
+    kernel::fill_agg_kernel<<<grid, default_block_size>>>(
+        num, agg.get_const_data(), agg_map.get_data());
+    components::prefix_sum(exec, agg_map.get_data(), agg_map.get_num_elems());
+    kernel::renumber_kernel<<<grid, default_block_size>>>(
+        num, agg_map.get_const_data(), agg.get_data());
+    *num_agg = exec->copy_val_to_host(agg_map.get_const_data() + num);
+}
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_RENUMBER_KERNEL);
 
@@ -90,18 +127,40 @@ void find_strongest_neighbor(
     std::shared_ptr<const CudaExecutor> exec,
     const matrix::Csr<ValueType, IndexType> *weight_mtx,
     const matrix::Diagonal<ValueType> *diag, Array<IndexType> &agg,
-    Array<IndexType> &strongest_neighbor) GKO_NOT_IMPLEMENTED;
+    Array<IndexType> &strongest_neighbor)
+{
+    const auto num = agg.get_num_elems();
+    const dim3 grid(ceildiv(num, default_block_size));
+    kernel::find_strongest_neighbor_kernel<<<grid, default_block_size>>>(
+        num, weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(),
+        weight_mtx->get_const_values(), diag->get_const_values(),
+        diag->get_stride(), agg.get_data(), strongest_neighbor.get_data());
+}
 
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_FIND_STRONGEST_NEIGHBOR);
 
-
 template <typename ValueType, typename IndexType>
-void assign_to_exist_agg(
-    std::shared_ptr<const CudaExecutor> exec,
-    const matrix::Csr<ValueType, IndexType> *weight_mtx,
-    const matrix::Diagonal<ValueType> *diag, Array<IndexType> &agg,
-    Array<IndexType> &intermediate_agg) GKO_NOT_IMPLEMENTED;
+void assign_to_exist_agg(std::shared_ptr<const CudaExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType> *weight_mtx,
+                         const matrix::Diagonal<ValueType> *diag,
+                         Array<IndexType> &agg,
+                         Array<IndexType> &intermediate_agg)
+{
+    auto agg_val = (intermediate_agg.get_num_elems() > 0)
+                       ? intermediate_agg.get_data()
+                       : agg.get_data();
+    const auto num = agg.get_num_elems();
+    const dim3 grid(ceildiv(num, default_block_size));
+    kernel::assign_to_exist_agg_kernel<<<grid, default_block_size>>>(
+        num, weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(),
+        weight_mtx->get_const_values(), diag->get_const_values(),
+        diag->get_stride(), agg.get_const_data(), agg_val);
+    if (intermediate_agg.get_num_elems() > 0) {
+        // Copy the intermediate_agg to agg
+        agg = intermediate_agg;
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);
diff --git a/cuda/test/CMakeLists.txt b/cuda/test/CMakeLists.txt
index 83d7b39f35e..fb66aaf270f 100644
--- a/cuda/test/CMakeLists.txt
+++ b/cuda/test/CMakeLists.txt
@@ -4,6 +4,7 @@ add_subdirectory(base)
 add_subdirectory(components)
 add_subdirectory(factorization)
 add_subdirectory(matrix)
+add_subdirectory(multigrid)
 add_subdirectory(preconditioner)
 add_subdirectory(reorder)
 add_subdirectory(solver)
diff --git a/cuda/test/multigrid/CMakeLists.txt b/cuda/test/multigrid/CMakeLists.txt
new file mode 100644
index 00000000000..8fe8bbeba48
--- /dev/null
+++ b/cuda/test/multigrid/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_create_test(amgx_pgm_kernels)
diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
new file mode 100644
index 00000000000..d1c899d0f54
--- /dev/null
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -0,0 +1,341 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/multigrid/amgx_pgm.hpp>
+
+
+#include <fstream>
+#include <iostream>
+#include <random>
+#include <string>
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/multigrid/amgx_pgm_kernels.hpp"
+#include "core/test/utils/matrix_generator.hpp"
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename Array, typename ValueDistribution, typename Engine>
+Array generate_random_array(gko::size_type num, ValueDistribution &&value_dist,
+                            Engine &&engine,
+                            std::shared_ptr<const gko::Executor> exec)
+{
+    using value_type = typename Array::value_type;
+    Array array_host(exec->get_master(), num);
+    auto val = array_host.get_data();
+    for (gko::size_type i = 0; i < num; i++) {
+        val[i] =
+            gko::test::detail::get_rand_value<value_type>(value_dist, engine);
+    }
+    Array array(exec);
+    array = array_host;
+    return array;
+}
+
+
+class AmgxPgm : public ::testing::Test {
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Mtx = gko::matrix::Dense<>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    AmgxPgm() : rand_engine(30) {}
+
+    void SetUp()
+    {
+        ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        cuda = gko::CudaExecutor::create(0, ref);
+    }
+
+    void TearDown()
+    {
+        if (cuda != nullptr) {
+            ASSERT_NO_THROW(cuda->synchronize());
+        }
+    }
+
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    gko::Array<index_type> gen_array(gko::size_type num, index_type min_val,
+                                     index_type max_val)
+    {
+        return generate_random_array<gko::Array<index_type>>(
+            num, std::uniform_int_distribution<>(min_val, max_val), rand_engine,
+            ref);
+    }
+
+    void initialize_data()
+    {
+        int m = 597;
+        int n = 300;
+        int nrhs = 3;
+
+        agg = gen_array(m, 0, n - 1);
+        unfinished_agg = gen_array(m, -1, n - 1);
+        strongest_neighbor = gen_array(m, 0, n - 1);
+        coarse_vector = gen_mtx(n, nrhs);
+        fine_vector = gen_mtx(m, nrhs);
+        auto weight = gen_mtx(m, m);
+        make_weight(weight.get());
+        weight_csr = Csr::create(ref);
+        weight->convert_to(weight_csr.get());
+        weight_diag = weight_csr->extract_diagonal();
+
+        d_agg.set_executor(cuda);
+        d_unfinished_agg.set_executor(cuda);
+        d_strongest_neighbor.set_executor(cuda);
+        d_coarse_vector = Mtx::create(cuda);
+        d_fine_vector = Mtx::create(cuda);
+        d_weight_csr = Csr::create(cuda);
+        d_weight_diag = Mtx::create(cuda);
+        d_agg = agg;
+        d_unfinished_agg = unfinished_agg;
+        d_strongest_neighbor = strongest_neighbor;
+        d_coarse_vector->copy_from(coarse_vector.get());
+        d_fine_vector->copy_from(fine_vector.get());
+        d_weight_csr->copy_from(weight_csr.get());
+        d_weight_diag->copy_from(weight_diag.get());
+    }
+
+    void make_symetric(Mtx *mtx)
+    {
+        for (int i = 0; i < mtx->get_size()[0]; ++i) {
+            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
+                mtx->at(i, j) = mtx->at(j, i);
+            }
+        }
+    }
+
+    // only for real values; std::abs avoids picking the integer ::abs
+    void make_absoulte(Mtx *mtx)
+    {
+        for (int i = 0; i < mtx->get_size()[0]; ++i) {
+            for (int j = 0; j < mtx->get_size()[1]; ++j) {
+                mtx->at(i, j) = std::abs(mtx->at(i, j));
+            }
+        }
+    }
+
+    void make_diag_dominant(Mtx *mtx)
+    {
+        using std::abs;
+        for (int i = 0; i < mtx->get_size()[0]; ++i) {
+            auto sum = gko::zero<Mtx::value_type>();
+            for (int j = 0; j < mtx->get_size()[1]; ++j) {
+                sum += abs(mtx->at(i, j));
+            }
+            mtx->at(i, i) = sum;
+        }
+    }
+
+    void make_spd(Mtx *mtx)
+    {
+        make_symetric(mtx);
+        make_diag_dominant(mtx);
+    }
+
+    void make_weight(Mtx *mtx)
+    {
+        make_symetric(mtx);
+        make_absoulte(mtx);
+        make_diag_dominant(mtx);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::CudaExecutor> cuda;
+
+    std::ranlux48 rand_engine;
+
+    gko::Array<index_type> agg;
+    gko::Array<index_type> unfinished_agg;
+    gko::Array<index_type> strongest_neighbor;
+
+    gko::Array<index_type> d_agg;
+    gko::Array<index_type> d_unfinished_agg;
+    gko::Array<index_type> d_strongest_neighbor;
+
+    std::unique_ptr<Mtx> coarse_vector;
+    std::unique_ptr<Mtx> fine_vector;
+    std::unique_ptr<Mtx> weight_diag;
+    std::unique_ptr<Csr> weight_csr;
+
+    std::unique_ptr<Mtx> d_coarse_vector;
+    std::unique_ptr<Mtx> d_fine_vector;
+    std::unique_ptr<Mtx> d_weight_diag;
+    std::unique_ptr<Csr> d_weight_csr;
+};
+
+
+TEST_F(AmgxPgm, RestrictApplyIsEquivalentToRef)
+{
+    initialize_data();
+    // fine->coarse
+    auto x = Mtx::create_with_config_of(gko::lend(coarse_vector));
+    auto d_x = Mtx::create_with_config_of(gko::lend(d_coarse_vector));
+
+    gko::kernels::reference::amgx_pgm::restrict_apply(
+        ref, agg, fine_vector.get(), x.get());
+    gko::kernels::cuda::amgx_pgm::restrict_apply(
+        cuda, d_agg, d_fine_vector.get(), d_x.get());
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+
+TEST_F(AmgxPgm, ProlongApplyaddIsEquivalentToRef)
+{
+    initialize_data();
+    // coarse->fine
+    auto x = fine_vector->clone();
+    auto d_x = d_fine_vector->clone();
+
+    gko::kernels::reference::amgx_pgm::prolong_applyadd(
+        ref, agg, coarse_vector.get(), x.get());
+    gko::kernels::cuda::amgx_pgm::prolong_applyadd(
+        cuda, d_agg, d_coarse_vector.get(), d_x.get());
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+
+TEST_F(AmgxPgm, MatchEdgeIsEquivalentToRef)
+{
+    initialize_data();
+    auto x = unfinished_agg;
+    auto d_x = d_unfinished_agg;
+
+    gko::kernels::reference::amgx_pgm::match_edge(ref, strongest_neighbor, x);
+    gko::kernels::cuda::amgx_pgm::match_edge(cuda, d_strongest_neighbor, d_x);
+
+    GKO_ASSERT_ARRAY_EQ(d_x, x);
+}
+
+
+TEST_F(AmgxPgm, CountUnaggIsEquivalentToRef)
+{
+    initialize_data();
+    index_type num_unagg;
+    index_type d_num_unagg;
+
+    gko::kernels::reference::amgx_pgm::count_unagg(ref, agg, &num_unagg);
+    gko::kernels::cuda::amgx_pgm::count_unagg(cuda, d_agg, &d_num_unagg);
+
+    ASSERT_EQ(d_num_unagg, num_unagg);
+}
+
+
+TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
+{
+    initialize_data();
+    // renumber rewrites agg / d_agg in place and reports the number of
+    // aggregates through its output argument, which has the index type.
+    index_type num_agg;
+    index_type d_num_agg;
+
+    gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
+    gko::kernels::cuda::amgx_pgm::renumber(cuda, d_agg, &d_num_agg);
+
+    ASSERT_EQ(d_num_agg, num_agg);
+    GKO_ASSERT_ARRAY_EQ(d_agg, agg);
+    ASSERT_LE(num_agg, 300);
+}
+
+
+TEST_F(AmgxPgm, FindStrongestNeighborIsEquivalentToRef)
+{
+    initialize_data();
+    auto snb = strongest_neighbor;
+    auto d_snb = d_strongest_neighbor;
+
+    gko::kernels::reference::amgx_pgm::find_strongest_neighbor(
+        ref, weight_csr.get(), weight_diag.get(), agg, snb);
+    gko::kernels::cuda::amgx_pgm::find_strongest_neighbor(
+        cuda, d_weight_csr.get(), d_weight_diag.get(), d_agg, d_snb);
+
+    GKO_ASSERT_ARRAY_EQ(d_snb, snb);
+}
+
+
+TEST_F(AmgxPgm, AssignToExistAggIsEquivalentToRef)
+{
+    initialize_data();
+    auto x = unfinished_agg;
+    auto d_x = d_unfinished_agg;
+    auto intermediate_agg = x;
+    auto d_intermediate_agg = d_x;
+
+    gko::kernels::reference::amgx_pgm::assign_to_exist_agg(
+        ref, weight_csr.get(), weight_diag.get(), x, intermediate_agg);
+    gko::kernels::cuda::amgx_pgm::assign_to_exist_agg(
+        cuda, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);
+
+    GKO_ASSERT_ARRAY_EQ(d_x, x);
+}
+
+
+TEST_F(AmgxPgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
+{
+    initialize_data();
+    auto d_x = d_unfinished_agg;
+    auto d_intermediate_agg = gko::Array<index_type>(cuda, 0);
+    index_type d_num_unagg;
+
+    gko::kernels::cuda::amgx_pgm::assign_to_exist_agg(
+        cuda, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);
+    gko::kernels::cuda::amgx_pgm::count_unagg(cuda, d_x, &d_num_unagg);
+
+    // only test whether all elements of the updated array are aggregated.
+    GKO_ASSERT_EQ(d_num_unagg, 0);
+}
+
+
+}  // namespace

From 6bd9ce89ceafd27c3527829286c6975549319bcd Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Thu, 16 Jul 2020 15:10:16 +0200
Subject: [PATCH 02/16] cuda amgx_pgm size2 generation and related test

---
 common/multigrid/amgx_pgm_kernels.hpp.inc     | 108 ++++++++++++++----
 core/multigrid/amgx_pgm.cpp                   |   5 +-
 core/multigrid/amgx_pgm_kernels.hpp           |   3 +-
 cuda/multigrid/amgx_pgm_kernels.cu            |  53 ++++++++-
 cuda/test/multigrid/amgx_pgm_kernels.cpp      |  25 +++-
 hip/multigrid/amgx_pgm_kernels.hip.cpp        |   3 +-
 omp/multigrid/amgx_pgm_kernels.cpp            |   3 +-
 reference/multigrid/amgx_pgm_kernels.cpp      |   3 +-
 reference/test/multigrid/amgx_pgm_kernels.cpp |   3 +-
 9 files changed, 175 insertions(+), 31 deletions(-)

diff --git a/common/multigrid/amgx_pgm_kernels.hpp.inc b/common/multigrid/amgx_pgm_kernels.hpp.inc
index 6f460ad98c2..5bed8696b10 100644
--- a/common/multigrid/amgx_pgm_kernels.hpp.inc
+++ b/common/multigrid/amgx_pgm_kernels.hpp.inc
@@ -241,37 +241,103 @@ __global__
 }
 
 
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void get_source_row_map_kernel(
+    const size_type source_nrows, const IndexType *__restrict__ agg_val,
+    const IndexType *__restrict__ source_row_ptrs,
+    IndexType *__restrict__ result_row_ptrs, IndexType *__restrict__ row_map)
+{
+    auto row = thread::get_thread_id_flat();
+    if (row >= source_nrows) {
+        return;
+    }
+    const auto num_elems = source_row_ptrs[row + 1] - source_row_ptrs[row];
+    const auto result_idx = agg_val[row];
+    // atomic_add returns the old value, so it can be the starting point.
+    row_map[row] = atomic_add(result_row_ptrs + result_idx, num_elems);
+}
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void move_row_kernel(
+    const size_type source_nrows, const IndexType *__restrict__ agg_val,
+    const IndexType *__restrict__ row_map,
+    const IndexType *__restrict__ source_row_ptrs,
+    const IndexType *__restrict__ source_col_idxs,
+    const ValueType *__restrict__ source_values,
+    const IndexType *__restrict__ result_row_ptrs,
+    IndexType *__restrict__ result_col_idxs,
+    ValueType *__restrict__ result_values)
+{
+    auto row = thread::get_thread_id_flat();
+    if (row >= source_nrows) {
+        return;
+    }
+    auto result_i = result_row_ptrs[agg_val[row]] + row_map[row];
+    for (auto i = source_row_ptrs[row]; i < source_row_ptrs[row + 1];
+         i++, result_i++) {
+        result_col_idxs[result_i] = agg_val[source_col_idxs[i]];
+        result_values[result_i] = source_values[i];
+    }
+}
+
+
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void merge_col_kernel(
-    const size_type num, const IndexType *__restrict__ row_ptrs,
-    const IndexType *__restrict__ source_col,
-    const ValueType *__restrict__ source_val,
-    IndexType *__restrict__ result_col, ValueType *__restrict__ result_val)
+    const size_type nrows, const IndexType *__restrict__ temp_row_ptrs,
+    IndexType *__restrict__ temp_col_idxs, ValueType *__restrict__ temp_values,
+    IndexType *__restrict__ coarse_row_ptrs)
 {
     auto row = thread::get_thread_id_flat();
-    if (row >= num) {
+    if (row >= nrows) {
         return;
     }
-    const auto start = row_ptrs[row];
-    const auto end = row_ptrs[row + 1];
-    auto result_ind = start;
-    auto col = source_col[result_ind];
-    auto temp = source_val[result_ind];
-    for (size_type idx = start + 1; idx < end; idx++) {
-        auto temp_col = source_col[idx];
-        if (temp_col != col) {
-            result_col[result_ind] = col;
-            result_val[result_ind] = temp;
-            result_ind++;
-            col = temp_col;
-            temp = zero<ValueType>();
+
+    IndexType num_elems = zero<IndexType>();
+    const auto start = temp_row_ptrs[row];
+    const auto end = temp_row_ptrs[row + 1];
+    IndexType col = temp_col_idxs[start];
+    ValueType value = temp_values[start];
+    for (auto i = start + 1; i < end; i++) {
+        const auto current_col = temp_col_idxs[i];
+        if (current_col != col) {
+            // apply to the original data. It is sorted, so the writing position
+            // is before read position
+            temp_col_idxs[start + num_elems] = col;
+            temp_values[start + num_elems] = value;
+            value = zero<ValueType>();
+            col = current_col;
+            num_elems++;
         }
-        temp += source_val[idx];
+        value += temp_values[i];
     }
     // If start != end, need to process the final column
     if (start != end) {
-        result_col[result_ind] = col;
-        result_val[result_ind] = temp;
+        temp_col_idxs[start + num_elems] = col;
+        temp_values[start + num_elems] = value;
+        num_elems++;
+    }
+    coarse_row_ptrs[row] = num_elems;
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void copy_to_coarse_kernel(
+    const size_type nrows, const IndexType *__restrict__ temp_row_ptrs,
+    const IndexType *__restrict__ temp_col_idxs,
+    const ValueType *__restrict__ temp_values,
+    const IndexType *__restrict__ coarse_row_ptrs,
+    IndexType *__restrict__ coarse_col_idxs,
+    ValueType *__restrict__ coarse_values)
+{
+    auto row = thread::get_thread_id_flat();
+    if (row >= nrows) {
+        return;
+    }
+    auto temp_i = temp_row_ptrs[row];
+    for (auto i = coarse_row_ptrs[row]; i < coarse_row_ptrs[row + 1];
+         i++, temp_i++) {
+        coarse_col_idxs[i] = temp_col_idxs[temp_i];
+        coarse_values[i] = temp_values[temp_i];
     }
 }
 
diff --git a/core/multigrid/amgx_pgm.cpp b/core/multigrid/amgx_pgm.cpp
index 09edd76d101..1803c4252d2 100644
--- a/core/multigrid/amgx_pgm.cpp
+++ b/core/multigrid/amgx_pgm.cpp
@@ -79,7 +79,10 @@ std::unique_ptr<LinOp> amgx_pgm_generate(
 {
     auto coarse = matrix::Csr<ValueType, IndexType>::create(
         exec, dim<2>{num_agg, num_agg}, 0, source->get_strategy());
-    exec->run(amgx_pgm::make_amgx_pgm_generate(source, agg, coarse.get()));
+    auto temp = matrix::Csr<ValueType, IndexType>::create(
+        exec, dim<2>{num_agg, num_agg}, source->get_num_stored_elements());
+    exec->run(amgx_pgm::make_amgx_pgm_generate(source, agg, coarse.get(),
+                                               temp.get()));
     return std::move(coarse);
 }
 
diff --git a/core/multigrid/amgx_pgm_kernels.hpp b/core/multigrid/amgx_pgm_kernels.hpp
index 9109c59e610..e0006be63a7 100644
--- a/core/multigrid/amgx_pgm_kernels.hpp
+++ b/core/multigrid/amgx_pgm_kernels.hpp
@@ -82,7 +82,8 @@ namespace amgx_pgm {
     void amgx_pgm_generate(std::shared_ptr<const DefaultExecutor> exec,     \
                            const matrix::Csr<ValueType, IndexType> *source, \
                            const Array<IndexType> &agg,                     \
-                           matrix::Csr<ValueType, IndexType> *coarse)
+                           matrix::Csr<ValueType, IndexType> *coarse,       \
+                           matrix::Csr<ValueType, IndexType> *temp)
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                    \
     template <typename IndexType>                                       \
diff --git a/cuda/multigrid/amgx_pgm_kernels.cu b/cuda/multigrid/amgx_pgm_kernels.cu
index abe32153f4e..d84766557cb 100644
--- a/cuda/multigrid/amgx_pgm_kernels.cu
+++ b/cuda/multigrid/amgx_pgm_kernels.cu
@@ -47,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "core/components/fill_array.hpp"
 #include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
@@ -170,8 +172,55 @@ template <typename ValueType, typename IndexType>
 void amgx_pgm_generate(std::shared_ptr<const CudaExecutor> exec,
                        const matrix::Csr<ValueType, IndexType> *source,
                        const Array<IndexType> &agg,
-                       matrix::Csr<ValueType, IndexType> *coarse)
-    GKO_NOT_IMPLEMENTED;
+                       matrix::Csr<ValueType, IndexType> *coarse,
+                       matrix::Csr<ValueType, IndexType> *temp)
+{
+    const auto source_nrows = source->get_size()[0];
+    const auto source_nnz = source->get_num_stored_elements();
+    const auto coarse_nrows = coarse->get_size()[0];
+    Array<IndexType> row_map(exec, source_nrows);
+    // fill the temp row pointers with zeros
+    components::fill_array(exec, temp->get_row_ptrs(), coarse_nrows + 1,
+                           zero<IndexType>());
+    // size the grid so that there is at least one thread per source row
+    dim3 grid(ceildiv(source_nrows, default_block_size));
+    // reads agg and the source row_ptrs; writes temp row_ptrs and row_map
+    kernel::get_source_row_map_kernel<<<grid, default_block_size>>>(
+        source_nrows, agg.get_const_data(), source->get_const_row_ptrs(),
+        temp->get_row_ptrs(), row_map.get_data());
+    // prefix sum of temp_row_ptrs
+    components::prefix_sum(exec, temp->get_row_ptrs(), coarse_nrows + 1);
+    // copy the source entries into temp (not yet into coarse)
+    kernel::move_row_kernel<<<grid, default_block_size>>>(
+        source_nrows, agg.get_const_data(), row_map.get_const_data(),
+        source->get_const_row_ptrs(), source->get_const_col_idxs(),
+        as_cuda_type(source->get_const_values()), temp->get_const_row_ptrs(),
+        temp->get_col_idxs(), as_cuda_type(temp->get_values()));
+    // sort the entries of temp by column index
+    csr::sort_by_column_index(exec, temp);
+    // merge entries of temp in place; coarse row_ptrs receives the row sizes
+    grid = ceildiv(coarse_nrows, default_block_size);
+    kernel::merge_col_kernel<<<grid, default_block_size>>>(
+        coarse_nrows, temp->get_const_row_ptrs(), temp->get_col_idxs(),
+        as_cuda_type(temp->get_values()), coarse->get_row_ptrs());
+    // prefix sum of coarse->get_row_ptrs
+    components::prefix_sum(exec, coarse->get_row_ptrs(), coarse_nrows + 1);
+    // read the last row pointer (number of coarse entries) back to the host
+    const auto coarse_nnz =
+        exec->copy_val_to_host(coarse->get_row_ptrs() + coarse_nrows);
+    // resize the column index and value arrays of coarse to coarse_nnz
+    matrix::CsrBuilder<ValueType, IndexType> coarse_builder{coarse};
+    auto &coarse_col_idxs_array = coarse_builder.get_col_idx_array();
+    auto &coarse_vals_array = coarse_builder.get_value_array();
+    coarse_col_idxs_array.resize_and_reset(coarse_nnz);
+    coarse_vals_array.resize_and_reset(coarse_nnz);
+    // copy the merged entries from temp into the coarse arrays
+    kernel::copy_to_coarse_kernel<<<grid, default_block_size>>>(
+        coarse_nrows, temp->get_const_row_ptrs(), temp->get_const_col_idxs(),
+        as_cuda_type(temp->get_const_values()), coarse->get_const_row_ptrs(),
+        coarse_col_idxs_array.get_data(),
+        as_cuda_type(coarse_vals_array.get_data()));
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
 
diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
index d1c899d0f54..746d9cf7c40 100644
--- a/cuda/test/multigrid/amgx_pgm_kernels.cpp
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -34,10 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <fstream>
-#include <iostream>
 #include <random>
 #include <string>
 
+
 #include <gtest/gtest.h>
 
 
@@ -116,7 +116,7 @@ class AmgxPgm : public ::testing::Test {
     void initialize_data()
     {
         int m = 597;
-        int n = 300;
+        n = 300;
         int nrhs = 3;
 
         agg = gen_array(m, 0, n - 1);
@@ -212,6 +212,8 @@ class AmgxPgm : public ::testing::Test {
     std::unique_ptr<Mtx> d_fine_vector;
     std::unique_ptr<Mtx> d_weight_diag;
     std::unique_ptr<Csr> d_weight_csr;
+
+    int n;
 };
 
 
@@ -338,4 +340,23 @@ TEST_F(AmgxPgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
 }
 
 
+TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
+{
+    initialize_data();  // sets n, which is used as the coarse dimension below
+    auto csr_coarse = Csr::create(ref, gko::dim<2>{n, n}, 0);
+    auto d_csr_coarse = Csr::create(cuda, gko::dim<2>{n, n}, 0);
+    auto csr_temp = Csr::create(ref, gko::dim<2>{n, n},
+                                weight_csr->get_num_stored_elements());
+    auto d_csr_temp = Csr::create(cuda, gko::dim<2>{n, n},
+                                  d_weight_csr->get_num_stored_elements());
+
+    gko::kernels::cuda::amgx_pgm::amgx_pgm_generate(
+        cuda, d_weight_csr.get(), d_agg, d_csr_coarse.get(), d_csr_temp.get());
+    gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
+        ref, weight_csr.get(), agg, csr_coarse.get(), csr_temp.get());
+
+    GKO_ASSERT_MTX_NEAR(d_csr_coarse, csr_coarse, 1e-14);
+}
+
+
 }  // namespace
diff --git a/hip/multigrid/amgx_pgm_kernels.hip.cpp b/hip/multigrid/amgx_pgm_kernels.hip.cpp
index d401bb72b72..88c2db995f1 100644
--- a/hip/multigrid/amgx_pgm_kernels.hip.cpp
+++ b/hip/multigrid/amgx_pgm_kernels.hip.cpp
@@ -111,7 +111,8 @@ template <typename ValueType, typename IndexType>
 void amgx_pgm_generate(std::shared_ptr<const HipExecutor> exec,
                        const matrix::Csr<ValueType, IndexType> *source,
                        const Array<IndexType> &agg,
-                       matrix::Csr<ValueType, IndexType> *coarse)
+                       matrix::Csr<ValueType, IndexType> *coarse,
+                       matrix::Csr<ValueType, IndexType> *temp)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
diff --git a/omp/multigrid/amgx_pgm_kernels.cpp b/omp/multigrid/amgx_pgm_kernels.cpp
index d56b3ff9dab..a0db864a250 100644
--- a/omp/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/multigrid/amgx_pgm_kernels.cpp
@@ -108,7 +108,8 @@ template <typename ValueType, typename IndexType>
 void amgx_pgm_generate(std::shared_ptr<const OmpExecutor> exec,
                        const matrix::Csr<ValueType, IndexType> *source,
                        const Array<IndexType> &agg,
-                       matrix::Csr<ValueType, IndexType> *coarse)
+                       matrix::Csr<ValueType, IndexType> *coarse,
+                       matrix::Csr<ValueType, IndexType> *temp)
     GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
diff --git a/reference/multigrid/amgx_pgm_kernels.cpp b/reference/multigrid/amgx_pgm_kernels.cpp
index 1ae7ee860b0..64788db9805 100644
--- a/reference/multigrid/amgx_pgm_kernels.cpp
+++ b/reference/multigrid/amgx_pgm_kernels.cpp
@@ -233,7 +233,8 @@ template <typename ValueType, typename IndexType>
 void amgx_pgm_generate(std::shared_ptr<const ReferenceExecutor> exec,
                        const matrix::Csr<ValueType, IndexType> *source,
                        const Array<IndexType> &agg,
-                       matrix::Csr<ValueType, IndexType> *coarse)
+                       matrix::Csr<ValueType, IndexType> *coarse,
+                       matrix::Csr<ValueType, IndexType> *temp)
 {
     // agg[i] -> I, agg[j] -> J
     const auto coarse_nrows = coarse->get_size()[0];
diff --git a/reference/test/multigrid/amgx_pgm_kernels.cpp b/reference/test/multigrid/amgx_pgm_kernels.cpp
index 63f8baf2e62..847e859529e 100644
--- a/reference/test/multigrid/amgx_pgm_kernels.cpp
+++ b/reference/test/multigrid/amgx_pgm_kernels.cpp
@@ -525,9 +525,10 @@ TYPED_TEST(AmgxPgm, GenerateMtx)
                        {2, 1, -2},
                        {2, 2, 5}}});
     auto csr_coarse = mtx_type::create(this->exec, gko::dim<2>{3, 3}, 0);
+    auto empty = gko::matrix::Csr<value_type, index_type>::create(this->exec);
 
     gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
-        this->exec, this->mtx.get(), agg, csr_coarse.get());
+        this->exec, this->mtx.get(), agg, csr_coarse.get(), empty.get());
 
     GKO_ASSERT_MTX_NEAR(csr_coarse, coarse_ans, r<value_type>::value);
 }

From d50aae8032526e2178054a8e8afb4613b1ca1428 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 17 Jul 2020 12:22:26 +0200
Subject: [PATCH 03/16] fix windows issue

---
 cuda/test/multigrid/amgx_pgm_kernels.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
index 746d9cf7c40..6c2c34885a1 100644
--- a/cuda/test/multigrid/amgx_pgm_kernels.cpp
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -213,7 +213,7 @@ class AmgxPgm : public ::testing::Test {
     std::unique_ptr<Mtx> d_weight_diag;
     std::unique_ptr<Csr> d_weight_csr;
 
-    int n;
+    gko::size_type n;
 };
 
 

From 2915a3677caa8c607f153ea080f935094fd2b770 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 21 Jul 2020 17:41:34 +0200
Subject: [PATCH 04/16] omp implementation

---
 omp/multigrid/amgx_pgm_kernels.cpp      | 194 ++++++++++++-
 omp/test/CMakeLists.txt                 |   1 +
 omp/test/multigrid/CMakeLists.txt       |   1 +
 omp/test/multigrid/amgx_pgm_kernels.cpp | 353 ++++++++++++++++++++++++
 4 files changed, 538 insertions(+), 11 deletions(-)
 create mode 100644 omp/test/multigrid/CMakeLists.txt
 create mode 100644 omp/test/multigrid/amgx_pgm_kernels.cpp

diff --git a/omp/multigrid/amgx_pgm_kernels.cpp b/omp/multigrid/amgx_pgm_kernels.cpp
index a0db864a250..80f1b2f88ce 100644
--- a/omp/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/multigrid/amgx_pgm_kernels.cpp
@@ -48,6 +48,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/multigrid/amgx_pgm.hpp>
 
 
+#include "core/base/allocator.hpp"
+#include "core/components/fill_array.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace omp {
@@ -62,22 +68,61 @@ namespace amgx_pgm {
 template <typename IndexType>
 void match_edge(std::shared_ptr<const OmpExecutor> exec,
                 const Array<IndexType> &strongest_neighbor,
-                Array<IndexType> &agg) GKO_NOT_IMPLEMENTED;
+                Array<IndexType> &agg)
+{
+    auto agg_vals = agg.get_data();
+    auto strongest_neighbor_vals = strongest_neighbor.get_const_data();
+#pragma omp parallel for
+    for (size_type i = 0; i < agg.get_num_elems(); i++) {
+        const auto neighbor = strongest_neighbor_vals[i];
+        // Use the smaller index as agg point: only that row of a mutual pair
+        // touches agg_vals, so the two rows never race on the same entries.
+        if (neighbor != -1 && i <= static_cast<size_type>(neighbor) &&
+            agg_vals[i] == -1 && strongest_neighbor_vals[neighbor] == i) {
+            agg_vals[i] = i;
+            agg_vals[neighbor] = i;
+        }
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_MATCH_EDGE_KERNEL);
 
 
 template <typename IndexType>
 void count_unagg(std::shared_ptr<const OmpExecutor> exec,
-                 const Array<IndexType> &agg,
-                 IndexType *num_unagg) GKO_NOT_IMPLEMENTED;
+                 const Array<IndexType> &agg, IndexType *num_unagg)
+{
+    IndexType unagg = 0;  // number of entries of agg that are equal to -1
+#pragma omp parallel for reduction(+ : unagg)
+    for (size_type i = 0; i < agg.get_num_elems(); i++) {
+        unagg += (agg.get_const_data()[i] == -1);  // comparison adds 0 or 1
+    }
+    *num_unagg = unagg;
+}
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_COUNT_UNAGG_KERNEL);
 
 
 template <typename IndexType>
 void renumber(std::shared_ptr<const OmpExecutor> exec, Array<IndexType> &agg,
-              IndexType *num_agg) GKO_NOT_IMPLEMENTED;
+              IndexType *num_agg)
+{
+    const auto num = agg.get_num_elems();
+    Array<IndexType> agg_map(exec, num + 1);  // one slot per id, plus the total
+    auto agg_vals = agg.get_data();
+    auto agg_map_vals = agg_map.get_data();
+    components::fill_array(exec, agg_map_vals, num + 1, zero<IndexType>());
+#pragma omp parallel for
+    for (size_type i = 0; i < num; i++) {
+        agg_map_vals[agg_vals[i]] = 1;  // flag used ids; id must be in [0, num]
+    }
+    components::prefix_sum(exec, agg_map_vals, num + 1);
+#pragma omp parallel for
+    for (size_type i = 0; i < num; i++) {
+        agg_vals[i] = agg_map_vals[agg_vals[i]];  // old id -> scanned value
+    }
+    *num_agg = agg_map_vals[num];  // last scanned entry is reported as count
+}
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_RENUMBER_KERNEL);
 
@@ -87,18 +132,104 @@ void find_strongest_neighbor(
     std::shared_ptr<const OmpExecutor> exec,
     const matrix::Csr<ValueType, IndexType> *weight_mtx,
     const matrix::Diagonal<ValueType> *diag, Array<IndexType> &agg,
-    Array<IndexType> &strongest_neighbor) GKO_NOT_IMPLEMENTED;
+    Array<IndexType> &strongest_neighbor)
+{
+    const auto row_ptrs = weight_mtx->get_const_row_ptrs();
+    const auto col_idxs = weight_mtx->get_const_col_idxs();
+    const auto vals = weight_mtx->get_const_values();
+#pragma omp parallel for
+    for (size_type row = 0; row < agg.get_num_elems(); row++) {
+        auto max_weight_unagg = zero<ValueType>();
+        auto max_weight_agg = zero<ValueType>();
+        IndexType strongest_unagg = -1;
+        IndexType strongest_agg = -1;
+        if (agg.get_const_data()[row] == -1) {
+            for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
+                auto col = col_idxs[idx];
+                if (col == row) {
+                    continue;
+                }
+                auto weight = vals[idx] /
+                              max(abs(diag->at(row, 0)), abs(diag->at(col, 0)));
+                if (agg.get_const_data()[col] == -1 &&
+                    (weight > max_weight_unagg ||
+                     (weight == max_weight_unagg && col > strongest_unagg))) {
+                    max_weight_unagg = weight;
+                    strongest_unagg = col;
+                } else if (agg.get_const_data()[col] != -1 &&
+                           (weight > max_weight_agg ||
+                            (weight == max_weight_agg &&
+                             col > strongest_agg))) {
+                    max_weight_agg = weight;
+                    strongest_agg = col;
+                }
+            }
+
+            if (strongest_unagg == -1 && strongest_agg != -1) {
+                // all neighbor is agg, connect to the strongest agg
+                agg.get_data()[row] = agg.get_data()[strongest_agg];
+            } else if (strongest_unagg != -1) {
+                // set the strongest neighbor in the unagg group
+                strongest_neighbor.get_data()[row] = strongest_unagg;
+            } else {
+                // no neighbor
+                strongest_neighbor.get_data()[row] = row;
+            }
+        }
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_FIND_STRONGEST_NEIGHBOR);
 
 
 template <typename ValueType, typename IndexType>
-void assign_to_exist_agg(
-    std::shared_ptr<const OmpExecutor> exec,
-    const matrix::Csr<ValueType, IndexType> *weight_mtx,
-    const matrix::Diagonal<ValueType> *diag, Array<IndexType> &agg,
-    Array<IndexType> &intermediate_agg) GKO_NOT_IMPLEMENTED;
+void assign_to_exist_agg(std::shared_ptr<const OmpExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType> *weight_mtx,
+                         const matrix::Diagonal<ValueType> *diag,
+                         Array<IndexType> &agg,
+                         Array<IndexType> &intermediate_agg)
+{
+    const auto row_ptrs = weight_mtx->get_const_row_ptrs();
+    const auto col_idxs = weight_mtx->get_const_col_idxs();
+    const auto vals = weight_mtx->get_const_values();
+    const auto agg_const_val = agg.get_const_data();
+    auto agg_val = (intermediate_agg.get_num_elems() > 0)
+                       ? intermediate_agg.get_data()
+                       : agg.get_data();
+#pragma omp parallel for
+    for (IndexType row = 0; row < agg.get_num_elems(); row++) {
+        if (agg_const_val[row] != -1) {
+            continue;
+        }
+        auto max_weight_agg = zero<ValueType>();  // per row, thread-private
+        IndexType strongest_agg = -1;
+        for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
+            auto col = col_idxs[idx];
+            if (col == row) {
+                continue;
+            }
+            auto weight =
+                vals[idx] / max(abs(diag->at(row, 0)), abs(diag->at(col, 0)));
+            if (agg_const_val[col] != -1 &&
+                (weight > max_weight_agg ||
+                 (weight == max_weight_agg && col > strongest_agg))) {
+                max_weight_agg = weight;
+                strongest_agg = col;
+            }
+        }
+        if (strongest_agg != -1) {
+            agg_val[row] = agg_const_val[strongest_agg];
+        } else {
+            agg_val[row] = row;
+        }
+    }
+
+    if (intermediate_agg.get_num_elems() > 0) {
+        // Copy the intermediate_agg to agg
+        agg = intermediate_agg;
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);
@@ -110,7 +241,48 @@ void amgx_pgm_generate(std::shared_ptr<const OmpExecutor> exec,
                        const Array<IndexType> &agg,
                        matrix::Csr<ValueType, IndexType> *coarse,
                        matrix::Csr<ValueType, IndexType> *temp)
-    GKO_NOT_IMPLEMENTED;
+{
+    // agg[i] -> I, agg[j] -> J
+    const auto coarse_nrows = coarse->get_size()[0];
+    const auto source_nrows = source->get_size()[0];
+    const auto source_row_ptrs = source->get_const_row_ptrs();
+    const auto source_col_idxs = source->get_const_col_idxs();
+    const auto source_vals = source->get_const_values();
+    vector<map<IndexType, ValueType>> row_list(
+        source_nrows, map<IndexType, ValueType>{exec}, exec);
+    for (size_type i = 0; i < source_nrows; i++) {
+        IndexType row_idx = agg.get_const_data()[i];
+        for (auto j = source_row_ptrs[i]; j < source_row_ptrs[i + 1]; j++) {
+            const auto col = agg.get_const_data()[source_col_idxs[j]];
+            const auto val = source_vals[j];
+            row_list[row_idx][col] += val;
+        }
+    }
+    auto coarse_row_ptrs = coarse->get_row_ptrs();
+#pragma omp parallel for
+    for (size_type i = 0; i < coarse_nrows; i++) {
+        coarse_row_ptrs[i] = row_list[i].size();
+    }
+    components::prefix_sum(exec, coarse_row_ptrs, coarse_nrows + 1);
+
+    auto nnz = coarse_row_ptrs[coarse_nrows];
+    matrix::CsrBuilder<ValueType, IndexType> coarse_builder{coarse};
+    auto &coarse_col_idxs_array = coarse_builder.get_col_idx_array();
+    auto &coarse_vals_array = coarse_builder.get_value_array();
+    coarse_col_idxs_array.resize_and_reset(nnz);
+    coarse_vals_array.resize_and_reset(nnz);
+    auto coarse_col_idxs = coarse_col_idxs_array.get_data();
+    auto coarse_vals = coarse_vals_array.get_data();
+#pragma omp parallel for
+    for (size_type i = 0; i < coarse_nrows; i++) {
+        auto ind = coarse_row_ptrs[i];
+        for (auto pair : row_list[i]) {
+            coarse_col_idxs[ind] = pair.first;
+            coarse_vals[ind] = pair.second;
+            ind++;
+        }
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
 
diff --git a/omp/test/CMakeLists.txt b/omp/test/CMakeLists.txt
index 2f506e51109..3759a39bc77 100644
--- a/omp/test/CMakeLists.txt
+++ b/omp/test/CMakeLists.txt
@@ -3,6 +3,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/create_test.cmake)
 add_subdirectory(components)
 add_subdirectory(factorization)
 add_subdirectory(matrix)
+add_subdirectory(multigrid)
 add_subdirectory(preconditioner)
 add_subdirectory(reorder)
 add_subdirectory(solver)
diff --git a/omp/test/multigrid/CMakeLists.txt b/omp/test/multigrid/CMakeLists.txt
new file mode 100644
index 00000000000..8fe8bbeba48
--- /dev/null
+++ b/omp/test/multigrid/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_create_test(amgx_pgm_kernels)
diff --git a/omp/test/multigrid/amgx_pgm_kernels.cpp b/omp/test/multigrid/amgx_pgm_kernels.cpp
new file mode 100644
index 00000000000..cc822135a5d
--- /dev/null
+++ b/omp/test/multigrid/amgx_pgm_kernels.cpp
@@ -0,0 +1,353 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/multigrid/amgx_pgm.hpp>
+
+
+#include <fstream>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/multigrid/amgx_pgm_kernels.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/matrix_generator.hpp"
+
+namespace {
+
+
+template <typename Array, typename ValueDistribution, typename Engine>
+Array generate_random_array(gko::size_type num, ValueDistribution &&value_dist,
+                            Engine &&engine,
+                            std::shared_ptr<const gko::Executor> exec)
+{  // fills num random values on the master executor, then copies to exec
+    using value_type = typename Array::value_type;
+    Array array_host(exec->get_master(), num);
+    auto val = array_host.get_data();
+    for (gko::size_type i = 0; i < num; i++) {  // index type matches num
+        val[i] =
+            gko::test::detail::get_rand_value<value_type>(value_dist, engine);
+    }
+    Array array(exec);
+    array = array_host;
+    return array;
+}
+
+
+class AmgxPgm : public ::testing::Test {
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Mtx = gko::matrix::Dense<>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    AmgxPgm() : rand_engine(30) {}
+
+    void SetUp()
+    {
+        ref = gko::ReferenceExecutor::create();
+        omp = gko::OmpExecutor::create();
+    }
+
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    gko::Array<index_type> gen_array(gko::size_type num, index_type min_val,
+                                     index_type max_val)
+    {
+        return generate_random_array<gko::Array<index_type>>(
+            num, std::uniform_int_distribution<>(min_val, max_val), rand_engine,
+            ref);
+    }
+
+    void initialize_data()
+    {
+        int m = 597;
+        n = 300;
+        int nrhs = 3;
+
+        agg = gen_array(m, 0, n - 1);
+        unfinished_agg = gen_array(m, -1, n - 1);
+        strongest_neighbor = gen_array(m, 0, n - 1);
+        coarse_vector = gen_mtx(n, nrhs);
+        fine_vector = gen_mtx(m, nrhs);
+        auto weight = gen_mtx(m, m);
+        make_weight(weight.get());
+        weight_csr = Csr::create(ref);
+        weight->convert_to(weight_csr.get());
+        weight_diag = weight_csr->extract_diagonal();
+
+        d_agg.set_executor(omp);
+        d_unfinished_agg.set_executor(omp);
+        d_strongest_neighbor.set_executor(omp);
+        d_coarse_vector = Mtx::create(omp);
+        d_fine_vector = Mtx::create(omp);
+        d_weight_csr = Csr::create(omp);
+        d_weight_diag = Mtx::create(omp);
+        d_agg = agg;
+        d_unfinished_agg = unfinished_agg;
+        d_strongest_neighbor = strongest_neighbor;
+        d_coarse_vector->copy_from(coarse_vector.get());
+        d_fine_vector->copy_from(fine_vector.get());
+        d_weight_csr->copy_from(weight_csr.get());
+        d_weight_diag->copy_from(weight_diag.get());
+    }
+
+    void make_symetric(Mtx *mtx)
+    {
+        for (int i = 0; i < mtx->get_size()[0]; ++i) {
+            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
+                mtx->at(i, j) = mtx->at(j, i);
+            }
+        }
+    }
+
+    // only for real values; std::abs keeps the floating-point overload
+    void make_absoulte(Mtx *mtx)
+    {
+        for (int i = 0; i < mtx->get_size()[0]; ++i) {
+            for (int j = 0; j < mtx->get_size()[1]; ++j) {
+                mtx->at(i, j) = std::abs(mtx->at(i, j));
+            }
+        }
+    }
+
+    void make_diag_dominant(Mtx *mtx)
+    {
+        using std::abs;
+        for (int i = 0; i < mtx->get_size()[0]; ++i) {
+            auto sum = gko::zero<Mtx::value_type>();
+            for (int j = 0; j < mtx->get_size()[1]; ++j) {
+                sum += abs(mtx->at(i, j));
+            }
+            mtx->at(i, i) = sum;
+        }
+    }
+
+    void make_spd(Mtx *mtx)
+    {
+        make_symetric(mtx);
+        make_diag_dominant(mtx);
+    }
+
+    void make_weight(Mtx *mtx)
+    {
+        make_symetric(mtx);
+        make_absoulte(mtx);
+        make_diag_dominant(mtx);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::OmpExecutor> omp;
+
+    std::ranlux48 rand_engine;
+
+    gko::Array<index_type> agg;
+    gko::Array<index_type> unfinished_agg;
+    gko::Array<index_type> strongest_neighbor;
+
+    gko::Array<index_type> d_agg;
+    gko::Array<index_type> d_unfinished_agg;
+    gko::Array<index_type> d_strongest_neighbor;
+
+    std::unique_ptr<Mtx> coarse_vector;
+    std::unique_ptr<Mtx> fine_vector;
+    std::unique_ptr<Mtx> weight_diag;
+    std::unique_ptr<Csr> weight_csr;
+
+    std::unique_ptr<Mtx> d_coarse_vector;
+    std::unique_ptr<Mtx> d_fine_vector;
+    std::unique_ptr<Mtx> d_weight_diag;
+    std::unique_ptr<Csr> d_weight_csr;
+
+    gko::size_type n;
+};
+
+
+TEST_F(AmgxPgm, RestrictApplyIsEquivalentToRef)
+{
+    initialize_data();
+    // fine->coarse: the outputs take the configuration of the coarse vectors
+    auto x = Mtx::create_with_config_of(gko::lend(coarse_vector));
+    auto d_x = Mtx::create_with_config_of(gko::lend(d_coarse_vector));
+
+    gko::kernels::reference::amgx_pgm::restrict_apply(
+        ref, agg, fine_vector.get(), x.get());
+    gko::kernels::omp::amgx_pgm::restrict_apply(omp, d_agg, d_fine_vector.get(),
+                                                d_x.get());
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+
+TEST_F(AmgxPgm, ProlongApplyaddIsEquivalentToRef)
+{
+    initialize_data();
+    // coarse->fine: the outputs start out as copies of the fine vectors
+    auto x = fine_vector->clone();
+    auto d_x = d_fine_vector->clone();
+
+    gko::kernels::reference::amgx_pgm::prolong_applyadd(
+        ref, agg, coarse_vector.get(), x.get());
+    gko::kernels::omp::amgx_pgm::prolong_applyadd(
+        omp, d_agg, d_coarse_vector.get(), d_x.get());
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+
+TEST_F(AmgxPgm, MatchEdgeIsEquivalentToRef)
+{
+    initialize_data();
+    auto x = unfinished_agg;  // copies, so the fixture arrays stay untouched
+    auto d_x = d_unfinished_agg;
+
+    gko::kernels::reference::amgx_pgm::match_edge(ref, strongest_neighbor, x);
+    gko::kernels::omp::amgx_pgm::match_edge(omp, d_strongest_neighbor, d_x);
+
+    GKO_ASSERT_ARRAY_EQ(d_x, x);
+}
+
+
+TEST_F(AmgxPgm, CountUnaggIsEquivalentToRef)
+{
+    initialize_data();
+    index_type num_unagg;  // same IndexType as the arrays, as the kernel needs
+    index_type d_num_unagg;
+
+    gko::kernels::reference::amgx_pgm::count_unagg(ref, agg, &num_unagg);
+    gko::kernels::omp::amgx_pgm::count_unagg(omp, d_agg, &d_num_unagg);
+
+    ASSERT_EQ(d_num_unagg, num_unagg);
+}
+
+
+TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
+{
+    initialize_data();
+    // renumber works in place on agg / d_agg; the counters use index_type
+    // because the kernel takes the count as `IndexType *`
+    index_type num_agg;
+    index_type d_num_agg;
+
+    gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
+    gko::kernels::omp::amgx_pgm::renumber(omp, d_agg, &d_num_agg);
+
+    ASSERT_EQ(d_num_agg, num_agg);
+    GKO_ASSERT_ARRAY_EQ(d_agg, agg);
+    ASSERT_LE(num_agg, 300);
+}
+
+
+TEST_F(AmgxPgm, FindStrongestNeighborIsEquivalentToRef)
+{
+    initialize_data();
+    auto snb = strongest_neighbor;  // copies that the kernels may overwrite
+    auto d_snb = d_strongest_neighbor;
+
+    gko::kernels::reference::amgx_pgm::find_strongest_neighbor(
+        ref, weight_csr.get(), weight_diag.get(), agg, snb);
+    gko::kernels::omp::amgx_pgm::find_strongest_neighbor(
+        omp, d_weight_csr.get(), d_weight_diag.get(), d_agg, d_snb);
+
+    GKO_ASSERT_ARRAY_EQ(d_snb, snb);  // NOTE(review): agg holds no -1 here
+}
+
+
+TEST_F(AmgxPgm, AssignToExistAggIsEquivalentToRef)
+{
+    initialize_data();
+    auto x = unfinished_agg;
+    auto d_x = d_unfinished_agg;
+    auto intermediate_agg = x;  // non-empty: results go here, then into x
+    auto d_intermediate_agg = d_x;
+
+    gko::kernels::reference::amgx_pgm::assign_to_exist_agg(
+        ref, weight_csr.get(), weight_diag.get(), x, intermediate_agg);
+    gko::kernels::omp::amgx_pgm::assign_to_exist_agg(
+        omp, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);
+
+    GKO_ASSERT_ARRAY_EQ(d_x, x);
+}
+
+
+TEST_F(AmgxPgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
+{
+    initialize_data();
+    auto d_x = d_unfinished_agg;
+    auto d_intermediate_agg = gko::Array<index_type>(omp, 0);
+    index_type d_num_unagg;
+
+    gko::kernels::omp::amgx_pgm::assign_to_exist_agg(
+        omp, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);
+    gko::kernels::omp::amgx_pgm::count_unagg(omp, d_x, &d_num_unagg);
+
+    // only test whether all elements of the updated array are aggregated.
+    GKO_ASSERT_EQ(d_num_unagg, 0);
+}
+
+
+TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
+{
+    initialize_data();  // sets n, which is used as the coarse dimension below
+    auto csr_coarse = Csr::create(ref, gko::dim<2>{n, n}, 0);
+    auto d_csr_coarse = Csr::create(omp, gko::dim<2>{n, n}, 0);
+    auto csr_temp = Csr::create(ref, gko::dim<2>{n, n},
+                                weight_csr->get_num_stored_elements());
+    auto d_csr_temp = Csr::create(omp, gko::dim<2>{n, n},
+                                  d_weight_csr->get_num_stored_elements());
+
+    gko::kernels::omp::amgx_pgm::amgx_pgm_generate(
+        omp, d_weight_csr.get(), d_agg, d_csr_coarse.get(), d_csr_temp.get());
+    gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
+        ref, weight_csr.get(), agg, csr_coarse.get(), csr_temp.get());
+
+    GKO_ASSERT_MTX_NEAR(d_csr_coarse, csr_coarse, 1e-14);
+}
+
+
+}  // namespace

From 3c963a72446144e108be87e80362389a3024cda7 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 21 Jul 2020 17:54:54 +0200
Subject: [PATCH 05/16] hip implementation

---
 hip/multigrid/amgx_pgm_kernels.hip.cpp      | 144 +++++++-
 hip/test/CMakeLists.txt                     |   1 +
 hip/test/multigrid/CMakeLists.txt           |   1 +
 hip/test/multigrid/amgx_pgm_kernels.hip.cpp | 362 ++++++++++++++++++++
 4 files changed, 496 insertions(+), 12 deletions(-)
 create mode 100644 hip/test/multigrid/CMakeLists.txt
 create mode 100644 hip/test/multigrid/amgx_pgm_kernels.hip.cpp

diff --git a/hip/multigrid/amgx_pgm_kernels.hip.cpp b/hip/multigrid/amgx_pgm_kernels.hip.cpp
index 88c2db995f1..e3e90a06b74 100644
--- a/hip/multigrid/amgx_pgm_kernels.hip.cpp
+++ b/hip/multigrid/amgx_pgm_kernels.hip.cpp
@@ -45,10 +45,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/multigrid/amgx_pgm.hpp>
 
 
+#include "core/components/fill_array.hpp"
+#include "core/components/prefix_sum.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
-#include "hip/solver/common_trs_kernels.hip.hpp"
+#include "hip/components/atomic.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
 
 
 namespace gko {
@@ -62,25 +68,62 @@ namespace hip {
 namespace amgx_pgm {
 
 
+constexpr int default_block_size = 512;
+
+
+#include "common/multigrid/amgx_pgm_kernels.hpp.inc"
+
+
 template <typename IndexType>
 void match_edge(std::shared_ptr<const HipExecutor> exec,
                 const Array<IndexType> &strongest_neighbor,
-                Array<IndexType> &agg) GKO_NOT_IMPLEMENTED;
+                Array<IndexType> &agg)
+{
+    const auto num = agg.get_num_elems();
+    const dim3 grid(ceildiv(num, default_block_size));
+    hipLaunchKernelGGL(kernel::match_edge_kernel, dim3(grid),
+                       dim3(default_block_size), 0, 0, num,
+                       strongest_neighbor.get_const_data(), agg.get_data());
+}
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_MATCH_EDGE_KERNEL);
 
 
 template <typename IndexType>
 void count_unagg(std::shared_ptr<const HipExecutor> exec,
-                 const Array<IndexType> &agg,
-                 IndexType *num_unagg) GKO_NOT_IMPLEMENTED;
+                 const Array<IndexType> &agg, IndexType *num_unagg)
+{
+    Array<IndexType> active_agg(exec, agg.get_num_elems());
+    const dim3 grid(ceildiv(active_agg.get_num_elems(), default_block_size));
+    hipLaunchKernelGGL(kernel::activate_kernel, dim3(grid),
+                       dim3(default_block_size), 0, 0,
+                       active_agg.get_num_elems(), agg.get_const_data(),
+                       active_agg.get_data());
+    *num_unagg = reduce_add_array(exec, active_agg.get_num_elems(),
+                                  active_agg.get_const_data());
+}
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_COUNT_UNAGG_KERNEL);
 
 
 template <typename IndexType>
 void renumber(std::shared_ptr<const HipExecutor> exec, Array<IndexType> &agg,
-              IndexType *num_agg) GKO_NOT_IMPLEMENTED;
+              IndexType *num_agg)
+{
+    const auto num = agg.get_num_elems();
+    Array<IndexType> agg_map(exec, num + 1);
+    components::fill_array(exec, agg_map.get_data(), agg_map.get_num_elems(),
+                           zero<IndexType>());
+    const dim3 grid(ceildiv(num, default_block_size));
+    hipLaunchKernelGGL(kernel::fill_agg_kernel, dim3(grid),
+                       dim3(default_block_size), 0, 0, num,
+                       agg.get_const_data(), agg_map.get_data());
+    components::prefix_sum(exec, agg_map.get_data(), agg_map.get_num_elems());
+    hipLaunchKernelGGL(kernel::renumber_kernel, dim3(grid),
+                       dim3(default_block_size), 0, 0, num,
+                       agg_map.get_const_data(), agg.get_data());
+    *num_agg = exec->copy_val_to_host(agg_map.get_const_data() + num);
+}
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_RENUMBER_KERNEL);
 
@@ -90,18 +133,45 @@ void find_strongest_neighbor(
     std::shared_ptr<const HipExecutor> exec,
     const matrix::Csr<ValueType, IndexType> *weight_mtx,
     const matrix::Diagonal<ValueType> *diag, Array<IndexType> &agg,
-    Array<IndexType> &strongest_neighbor) GKO_NOT_IMPLEMENTED;
+    Array<IndexType> &strongest_neighbor)
+{
+    const auto num = agg.get_num_elems();
+    const dim3 grid(ceildiv(num, default_block_size));
+    hipLaunchKernelGGL(
+        kernel::find_strongest_neighbor_kernel, dim3(grid),
+        dim3(default_block_size), 0, 0, num, weight_mtx->get_const_row_ptrs(),
+        weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(),
+        diag->get_const_values(), diag->get_stride(), agg.get_data(),
+        strongest_neighbor.get_data());
+}
 
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_FIND_STRONGEST_NEIGHBOR);
 
 
 template <typename ValueType, typename IndexType>
-void assign_to_exist_agg(
-    std::shared_ptr<const HipExecutor> exec,
-    const matrix::Csr<ValueType, IndexType> *weight_mtx,
-    const matrix::Diagonal<ValueType> *diag, Array<IndexType> &agg,
-    Array<IndexType> &intermediate_agg) GKO_NOT_IMPLEMENTED;
+void assign_to_exist_agg(std::shared_ptr<const HipExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType> *weight_mtx,
+                         const matrix::Diagonal<ValueType> *diag,
+                         Array<IndexType> &agg,
+                         Array<IndexType> &intermediate_agg)
+{
+    auto agg_val = (intermediate_agg.get_num_elems() > 0)
+                       ? intermediate_agg.get_data()
+                       : agg.get_data();
+    const auto num = agg.get_num_elems();
+    const dim3 grid(ceildiv(num, default_block_size));
+    hipLaunchKernelGGL(kernel::assign_to_exist_agg_kernel, dim3(grid),
+                       dim3(default_block_size), 0, 0, num,
+                       weight_mtx->get_const_row_ptrs(),
+                       weight_mtx->get_const_col_idxs(),
+                       weight_mtx->get_const_values(), diag->get_const_values(),
+                       diag->get_stride(), agg.get_const_data(), agg_val);
+    if (intermediate_agg.get_num_elems() > 0) {
+        // Copy the intermediate_agg to agg
+        agg = intermediate_agg;
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);
@@ -113,7 +183,57 @@ void amgx_pgm_generate(std::shared_ptr<const HipExecutor> exec,
                        const Array<IndexType> &agg,
                        matrix::Csr<ValueType, IndexType> *coarse,
                        matrix::Csr<ValueType, IndexType> *temp)
-    GKO_NOT_IMPLEMENTED;
+{
+    const auto source_nrows = source->get_size()[0];
+    const auto source_nnz = source->get_num_stored_elements();
+    const auto coarse_nrows = coarse->get_size()[0];
+    Array<IndexType> row_map(exec, source_nrows);
+    // fill coarse row pointer as zero
+    components::fill_array(exec, temp->get_row_ptrs(), coarse_nrows + 1,
+                           zero<IndexType>());
+    // compute each source row should be moved and also change column index
+    dim3 grid(ceildiv(source_nrows, default_block_size));
+    // agg source_row (for row size) coarse row source map
+    hipLaunchKernelGGL(kernel::get_source_row_map_kernel, dim3(grid),
+                       dim3(default_block_size), 0, 0, source_nrows,
+                       agg.get_const_data(), source->get_const_row_ptrs(),
+                       temp->get_row_ptrs(), row_map.get_data());
+    // prefix sum of temp_row_ptrs
+    components::prefix_sum(exec, temp->get_row_ptrs(), coarse_nrows + 1);
+    // copy source -> to coarse and change column index
+    hipLaunchKernelGGL(
+        kernel::move_row_kernel, dim3(grid), dim3(default_block_size), 0, 0,
+        source_nrows, agg.get_const_data(), row_map.get_const_data(),
+        source->get_const_row_ptrs(), source->get_const_col_idxs(),
+        as_hip_type(source->get_const_values()), temp->get_const_row_ptrs(),
+        temp->get_col_idxs(), as_hip_type(temp->get_values()));
+    // sort csr
+    csr::sort_by_column_index(exec, temp);
+    // summation of the elements with same position
+    grid = ceildiv(coarse_nrows, default_block_size);
+    hipLaunchKernelGGL(kernel::merge_col_kernel, dim3(grid),
+                       dim3(default_block_size), 0, 0, coarse_nrows,
+                       temp->get_const_row_ptrs(), temp->get_col_idxs(),
+                       as_hip_type(temp->get_values()), coarse->get_row_ptrs());
+    // prefix sum of coarse->get_row_ptrs
+    components::prefix_sum(exec, coarse->get_row_ptrs(), coarse_nrows + 1);
+    // the last row pointer holds the number of nonzeros of the coarse matrix
+    const auto coarse_nnz =
+        exec->copy_val_to_host(coarse->get_row_ptrs() + coarse_nrows);
+    // reallocate size of column and values
+    matrix::CsrBuilder<ValueType, IndexType> coarse_builder{coarse};
+    auto &coarse_col_idxs_array = coarse_builder.get_col_idx_array();
+    auto &coarse_vals_array = coarse_builder.get_value_array();
+    coarse_col_idxs_array.resize_and_reset(coarse_nnz);
+    coarse_vals_array.resize_and_reset(coarse_nnz);
+    // copy the result
+    hipLaunchKernelGGL(
+        kernel::copy_to_coarse_kernel, dim3(grid), dim3(default_block_size), 0,
+        0, coarse_nrows, temp->get_const_row_ptrs(), temp->get_const_col_idxs(),
+        as_hip_type(temp->get_const_values()), coarse->get_const_row_ptrs(),
+        coarse_col_idxs_array.get_data(),
+        as_hip_type(coarse_vals_array.get_data()));
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
 
diff --git a/hip/test/CMakeLists.txt b/hip/test/CMakeLists.txt
index 9c8e39ca4ab..a5f126893bf 100644
--- a/hip/test/CMakeLists.txt
+++ b/hip/test/CMakeLists.txt
@@ -4,6 +4,7 @@ add_subdirectory(base)
 add_subdirectory(components)
 add_subdirectory(factorization)
 add_subdirectory(matrix)
+add_subdirectory(multigrid)
 add_subdirectory(solver)
 add_subdirectory(preconditioner)
 add_subdirectory(stop)
diff --git a/hip/test/multigrid/CMakeLists.txt b/hip/test/multigrid/CMakeLists.txt
new file mode 100644
index 00000000000..1c8534e5337
--- /dev/null
+++ b/hip/test/multigrid/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_create_hip_test(amgx_pgm_kernels)
diff --git a/hip/test/multigrid/amgx_pgm_kernels.hip.cpp b/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
new file mode 100644
index 00000000000..10befbe9315
--- /dev/null
+++ b/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
@@ -0,0 +1,362 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/multigrid/amgx_pgm.hpp>
+
+
+#include <fstream>
+#include <random>
+#include <string>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/combined.hpp>
+#include <ginkgo/core/stop/iteration.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
+
+
+#include "core/multigrid/amgx_pgm_kernels.hpp"
+#include "core/test/utils/matrix_generator.hpp"
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+template <typename Array, typename ValueDistribution, typename Engine>
+Array generate_random_array(gko::size_type num, ValueDistribution &&value_dist,
+                            Engine &&engine,
+                            std::shared_ptr<const gko::Executor> exec)
+{
+    using value_type = typename Array::value_type;
+    Array array_host(exec->get_master(), num);
+    auto val = array_host.get_data();
+    for (int i = 0; i < num; i++) {
+        val[i] =
+            gko::test::detail::get_rand_value<value_type>(value_dist, engine);
+    }
+    Array array(exec);
+    array = array_host;
+    return array;
+}
+
+
+class AmgxPgm : public ::testing::Test {
+protected:
+    using value_type = gko::default_precision;
+    using index_type = gko::int32;
+    using Mtx = gko::matrix::Dense<>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    AmgxPgm() : rand_engine(30) {}
+
+    void SetUp()
+    {
+        ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
+        ref = gko::ReferenceExecutor::create();
+        hip = gko::HipExecutor::create(0, ref);
+    }
+
+    void TearDown()
+    {
+        if (hip != nullptr) {
+            ASSERT_NO_THROW(hip->synchronize());
+        }
+    }
+
+    std::unique_ptr<Mtx> gen_mtx(int num_rows, int num_cols)
+    {
+        return gko::test::generate_random_matrix<Mtx>(
+            num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    gko::Array<index_type> gen_array(gko::size_type num, index_type min_val,
+                                     index_type max_val)
+    {
+        return generate_random_array<gko::Array<index_type>>(
+            num, std::uniform_int_distribution<>(min_val, max_val), rand_engine,
+            ref);
+    }
+
+    void initialize_data()
+    {
+        int m = 597;
+        n = 300;
+        int nrhs = 3;
+
+        agg = gen_array(m, 0, n - 1);
+        unfinished_agg = gen_array(m, -1, n - 1);
+        strongest_neighbor = gen_array(m, 0, n - 1);
+        coarse_vector = gen_mtx(n, nrhs);
+        fine_vector = gen_mtx(m, nrhs);
+        auto weight = gen_mtx(m, m);
+        make_weight(weight.get());
+        weight_csr = Csr::create(ref);
+        weight->convert_to(weight_csr.get());
+        weight_diag = weight_csr->extract_diagonal();
+
+        d_agg.set_executor(hip);
+        d_unfinished_agg.set_executor(hip);
+        d_strongest_neighbor.set_executor(hip);
+        d_coarse_vector = Mtx::create(hip);
+        d_fine_vector = Mtx::create(hip);
+        d_weight_csr = Csr::create(hip);
+        d_weight_diag = Mtx::create(hip);
+        d_agg = agg;
+        d_unfinished_agg = unfinished_agg;
+        d_strongest_neighbor = strongest_neighbor;
+        d_coarse_vector->copy_from(coarse_vector.get());
+        d_fine_vector->copy_from(fine_vector.get());
+        d_weight_csr->copy_from(weight_csr.get());
+        d_weight_diag->copy_from(weight_diag.get());
+    }
+
+    void make_symetric(Mtx *mtx)
+    {
+        for (int i = 0; i < mtx->get_size()[0]; ++i) {
+            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
+                mtx->at(i, j) = mtx->at(j, i);
+            }
+        }
+    }
+
+    // Replaces every entry by its absolute value (real value types only).
+    void make_absoulte(Mtx *mtx)
+    {
+        for (int i = 0; i < mtx->get_size()[0]; ++i) {
+            for (int j = 0; j < mtx->get_size()[1]; ++j) {
+                mtx->at(i, j) = std::abs(mtx->at(i, j));
+            }
+        }
+    }
+
+    void make_diag_dominant(Mtx *mtx)
+    {
+        using std::abs;
+        for (int i = 0; i < mtx->get_size()[0]; ++i) {
+            auto sum = gko::zero<Mtx::value_type>();
+            for (int j = 0; j < mtx->get_size()[1]; ++j) {
+                sum += abs(mtx->at(i, j));
+            }
+            mtx->at(i, i) = sum;
+        }
+    }
+
+    void make_spd(Mtx *mtx)
+    {
+        make_symetric(mtx);
+        make_diag_dominant(mtx);
+    }
+
+    void make_weight(Mtx *mtx)
+    {
+        make_symetric(mtx);
+        make_absoulte(mtx);
+        make_diag_dominant(mtx);
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::HipExecutor> hip;
+
+    std::ranlux48 rand_engine;
+
+    gko::Array<index_type> agg;
+    gko::Array<index_type> unfinished_agg;
+    gko::Array<index_type> strongest_neighbor;
+
+    gko::Array<index_type> d_agg;
+    gko::Array<index_type> d_unfinished_agg;
+    gko::Array<index_type> d_strongest_neighbor;
+
+    std::unique_ptr<Mtx> coarse_vector;
+    std::unique_ptr<Mtx> fine_vector;
+    std::unique_ptr<Mtx> weight_diag;
+    std::unique_ptr<Csr> weight_csr;
+
+    std::unique_ptr<Mtx> d_coarse_vector;
+    std::unique_ptr<Mtx> d_fine_vector;
+    std::unique_ptr<Mtx> d_weight_diag;
+    std::unique_ptr<Csr> d_weight_csr;
+
+    gko::size_type n;
+};
+
+
+TEST_F(AmgxPgm, RestrictApplyIsEquivalentToRef)
+{
+    initialize_data();
+    // fine->coarse
+    auto x = Mtx::create_with_config_of(gko::lend(coarse_vector));
+    auto d_x = Mtx::create_with_config_of(gko::lend(d_coarse_vector));
+
+    gko::kernels::reference::amgx_pgm::restrict_apply(
+        ref, agg, fine_vector.get(), x.get());
+    gko::kernels::hip::amgx_pgm::restrict_apply(hip, d_agg, d_fine_vector.get(),
+                                                d_x.get());
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+
+TEST_F(AmgxPgm, ProlongApplyaddIsEquivalentToRef)
+{
+    initialize_data();
+    // coarse->fine
+    auto x = fine_vector->clone();
+    auto d_x = d_fine_vector->clone();
+
+    gko::kernels::reference::amgx_pgm::prolong_applyadd(
+        ref, agg, coarse_vector.get(), x.get());
+    gko::kernels::hip::amgx_pgm::prolong_applyadd(
+        hip, d_agg, d_coarse_vector.get(), d_x.get());
+
+    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
+}
+
+
+TEST_F(AmgxPgm, MatchEdgeIsEquivalentToRef)
+{
+    initialize_data();
+    auto x = unfinished_agg;
+    auto d_x = d_unfinished_agg;
+
+    gko::kernels::reference::amgx_pgm::match_edge(ref, strongest_neighbor, x);
+    gko::kernels::hip::amgx_pgm::match_edge(hip, d_strongest_neighbor, d_x);
+
+    GKO_ASSERT_ARRAY_EQ(d_x, x);
+}
+
+
+TEST_F(AmgxPgm, CountUnaggIsEquivalentToRef)
+{
+    initialize_data();
+    gko::size_type num_unagg;
+    gko::size_type d_num_unagg;
+
+    gko::kernels::reference::amgx_pgm::count_unagg(ref, agg, &num_unagg);
+    gko::kernels::hip::amgx_pgm::count_unagg(hip, d_agg, &d_num_unagg);
+
+    ASSERT_EQ(d_num_unagg, num_unagg);
+}
+
+
+TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
+{
+    initialize_data();
+    auto x = unfinished_agg;
+    auto d_x = d_unfinished_agg;
+    gko::size_type num_agg;
+    gko::size_type d_num_agg;
+
+    gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
+    gko::kernels::hip::amgx_pgm::renumber(hip, d_agg, &d_num_agg);
+
+    ASSERT_EQ(d_num_agg, num_agg);
+    GKO_ASSERT_ARRAY_EQ(d_agg, agg);
+    ASSERT_LE(num_agg, 300);
+}
+
+
+TEST_F(AmgxPgm, FindStrongestNeighborIsEquivalentToRef)
+{
+    initialize_data();
+    auto snb = strongest_neighbor;
+    auto d_snb = d_strongest_neighbor;
+
+    gko::kernels::reference::amgx_pgm::find_strongest_neighbor(
+        ref, weight_csr.get(), weight_diag.get(), agg, snb);
+    gko::kernels::hip::amgx_pgm::find_strongest_neighbor(
+        hip, d_weight_csr.get(), d_weight_diag.get(), d_agg, d_snb);
+
+    GKO_ASSERT_ARRAY_EQ(d_snb, snb);
+}
+
+
+TEST_F(AmgxPgm, AssignToExistAggIsEquivalentToRef)
+{
+    initialize_data();
+    auto x = unfinished_agg;
+    auto d_x = d_unfinished_agg;
+    auto intermediate_agg = x;
+    auto d_intermediate_agg = d_x;
+
+    gko::kernels::reference::amgx_pgm::assign_to_exist_agg(
+        ref, weight_csr.get(), weight_diag.get(), x, intermediate_agg);
+    gko::kernels::hip::amgx_pgm::assign_to_exist_agg(
+        hip, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);
+
+    GKO_ASSERT_ARRAY_EQ(d_x, x);
+}
+
+
+TEST_F(AmgxPgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
+{
+    initialize_data();
+    auto d_x = d_unfinished_agg;
+    auto d_intermediate_agg = gko::Array<index_type>(hip, 0);
+    gko::size_type d_num_unagg;
+
+    gko::kernels::hip::amgx_pgm::assign_to_exist_agg(
+        hip, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);
+    gko::kernels::hip::amgx_pgm::count_unagg(hip, d_x, &d_num_unagg);
+
+    // d_x is the array the kernel updated in place; it must be fully assigned.
+    GKO_ASSERT_EQ(d_num_unagg, 0);
+}
+
+
+TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
+{
+    initialize_data();
+    auto csr_coarse = Csr::create(ref, gko::dim<2>{n, n}, 0);
+    auto d_csr_coarse = Csr::create(hip, gko::dim<2>{n, n}, 0);
+    auto csr_temp = Csr::create(ref, gko::dim<2>{n, n},
+                                weight_csr->get_num_stored_elements());
+    auto d_csr_temp = Csr::create(hip, gko::dim<2>{n, n},
+                                  d_weight_csr->get_num_stored_elements());
+
+    gko::kernels::hip::amgx_pgm::amgx_pgm_generate(
+        hip, d_weight_csr.get(), d_agg, d_csr_coarse.get(), d_csr_temp.get());
+    gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
+        ref, weight_csr.get(), agg, csr_coarse.get(), csr_temp.get());
+
+    GKO_ASSERT_MTX_NEAR(d_csr_coarse, csr_coarse, 1e-14);
+}
+
+
+}  // namespace

From e01afa84018c54371a7f92f2ee26b40a98354110 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 12 Aug 2020 11:10:15 +0200
Subject: [PATCH 06/16] fix amgx_pgm kernel determinism problem

---
 common/multigrid/amgx_pgm_kernels.hpp.inc | 9 +++++----
 omp/multigrid/amgx_pgm_kernels.cpp        | 9 +++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/common/multigrid/amgx_pgm_kernels.hpp.inc b/common/multigrid/amgx_pgm_kernels.hpp.inc
index 5bed8696b10..851a7023409 100644
--- a/common/multigrid/amgx_pgm_kernels.hpp.inc
+++ b/common/multigrid/amgx_pgm_kernels.hpp.inc
@@ -95,11 +95,12 @@ __global__ __launch_bounds__(default_block_size) void match_edge_kernel(
     if (agg_vals[tidx] != -1) {
         return;
     }
-    auto neighbor = strongest_neighbor_vals[tidx];
+    size_type neighbor = strongest_neighbor_vals[tidx];
     if (neighbor != -1 && strongest_neighbor_vals[neighbor] == tidx) {
-        agg_vals[tidx] = tidx;
-        agg_vals[neighbor] = tidx;
         // Use the smaller index as agg point
+        auto group = min(tidx, neighbor);
+        agg_vals[tidx] = group;
+        agg_vals[neighbor] = group;
     }
 }
 
@@ -342,4 +343,4 @@ __global__ __launch_bounds__(default_block_size) void copy_to_coarse_kernel(
 }
 
 
-}  // namespace kernel
\ No newline at end of file
+}  // namespace kernel
diff --git a/omp/multigrid/amgx_pgm_kernels.cpp b/omp/multigrid/amgx_pgm_kernels.cpp
index 80f1b2f88ce..540a7bdddcf 100644
--- a/omp/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/multigrid/amgx_pgm_kernels.cpp
@@ -75,11 +75,12 @@ void match_edge(std::shared_ptr<const OmpExecutor> exec,
 #pragma omp parallel for
     for (size_type i = 0; i < agg.get_num_elems(); i++) {
         if (agg_vals[i] == -1) {
-            auto neighbor = strongest_neighbor_vals[i];
+            size_type neighbor = strongest_neighbor_vals[i];
             if (neighbor != -1 && strongest_neighbor_vals[neighbor] == i) {
-                agg_vals[i] = i;
-                agg_vals[neighbor] = i;
                 // Use the smaller index as agg point
+                auto group = min(i, neighbor);
+                agg_vals[i] = group;
+                agg_vals[neighbor] = group;
             }
         }
     }
@@ -193,7 +194,6 @@ void assign_to_exist_agg(std::shared_ptr<const OmpExecutor> exec,
     const auto row_ptrs = weight_mtx->get_const_row_ptrs();
     const auto col_idxs = weight_mtx->get_const_col_idxs();
     const auto vals = weight_mtx->get_const_values();
-    auto max_weight_agg = zero<ValueType>();
     const auto agg_const_val = agg.get_const_data();
     auto agg_val = (intermediate_agg.get_num_elems() > 0)
                        ? intermediate_agg.get_data()
@@ -203,6 +203,7 @@ void assign_to_exist_agg(std::shared_ptr<const OmpExecutor> exec,
         if (agg_const_val[row] != -1) {
             continue;
         }
+        auto max_weight_agg = zero<ValueType>();
         IndexType strongest_agg = -1;
         for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
             auto col = col_idxs[idx];

From dd6ff83dfeb71821bb3129ee4cb918f2fb9609c3 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 31 Aug 2020 22:05:42 +0200
Subject: [PATCH 07/16] update diag related function to Diagonal format

---
 common/multigrid/amgx_pgm_kernels.hpp.inc   | 12 +++++------
 cuda/multigrid/amgx_pgm_kernels.cu          |  4 ++--
 cuda/test/multigrid/amgx_pgm_kernels.cpp    |  8 ++++---
 hip/multigrid/amgx_pgm_kernels.hip.cpp      | 23 ++++++++++-----------
 hip/test/multigrid/amgx_pgm_kernels.hip.cpp |  8 ++++---
 omp/multigrid/amgx_pgm_kernels.cpp          |  8 ++++---
 omp/test/multigrid/amgx_pgm_kernels.cpp     |  8 ++++---
 7 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/common/multigrid/amgx_pgm_kernels.hpp.inc b/common/multigrid/amgx_pgm_kernels.hpp.inc
index 851a7023409..d8bda57a2e0 100644
--- a/common/multigrid/amgx_pgm_kernels.hpp.inc
+++ b/common/multigrid/amgx_pgm_kernels.hpp.inc
@@ -150,8 +150,8 @@ __global__
         const size_type num, const IndexType *__restrict__ row_ptrs,
         const IndexType *__restrict__ col_idxs,
         const ValueType *__restrict__ weight_vals,
-        const ValueType *__restrict__ diag, const size_type diag_stride,
-        IndexType *__restrict__ agg, IndexType *__restrict__ strongest_neighbor)
+        const ValueType *__restrict__ diag, IndexType *__restrict__ agg,
+        IndexType *__restrict__ strongest_neighbor)
 {
     auto row = thread::get_thread_id_flat();
     if (row >= num) {
@@ -170,8 +170,7 @@ __global__
         if (col == row) {
             continue;
         }
-        auto weight = weight_vals[idx] / max(abs(diag[row * diag_stride]),
-                                             abs(diag[col * diag_stride]));
+        auto weight = weight_vals[idx] / max(abs(diag[row]), abs(diag[col]));
         if (agg[col] == -1 &&
             (weight > max_weight_unagg ||
              (weight == max_weight_unagg && col > strongest_unagg))) {
@@ -207,7 +206,7 @@ __global__
         const size_type num, const IndexType *__restrict__ row_ptrs,
         const IndexType *__restrict__ col_idxs,
         const ValueType *__restrict__ weight_vals,
-        const ValueType *__restrict__ diag, const size_type diag_stride,
+        const ValueType *__restrict__ diag,
         const IndexType *__restrict__ agg_const_val,
         IndexType *__restrict__ agg_val)
 {
@@ -225,8 +224,7 @@ __global__
         if (col == row) {
             continue;
         }
-        auto weight = weight_vals[idx] / max(abs(diag[row * diag_stride]),
-                                             abs(diag[col * diag_stride]));
+        auto weight = weight_vals[idx] / max(abs(diag[row]), abs(diag[col]));
         if (agg_const_val[col] != -1 &&
             (weight > max_weight_agg ||
              (weight == max_weight_agg && col > strongest_agg))) {
diff --git a/cuda/multigrid/amgx_pgm_kernels.cu b/cuda/multigrid/amgx_pgm_kernels.cu
index d84766557cb..b8700466365 100644
--- a/cuda/multigrid/amgx_pgm_kernels.cu
+++ b/cuda/multigrid/amgx_pgm_kernels.cu
@@ -136,7 +136,7 @@ void find_strongest_neighbor(
     kernel::find_strongest_neighbor_kernel<<<grid, default_block_size>>>(
         num, weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(),
         weight_mtx->get_const_values(), diag->get_const_values(),
-        diag->get_stride(), agg.get_data(), strongest_neighbor.get_data());
+        agg.get_data(), strongest_neighbor.get_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
@@ -157,7 +157,7 @@ void assign_to_exist_agg(std::shared_ptr<const CudaExecutor> exec,
     kernel::assign_to_exist_agg_kernel<<<grid, default_block_size>>>(
         num, weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(),
         weight_mtx->get_const_values(), diag->get_const_values(),
-        diag->get_stride(), agg.get_const_data(), agg_val);
+        agg.get_const_data(), agg_val);
     if (intermediate_agg.get_num_elems() > 0) {
         // Copy the intermediate_agg to agg
         agg = intermediate_agg;
diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
index 6c2c34885a1..fa4e582085d 100644
--- a/cuda/test/multigrid/amgx_pgm_kernels.cpp
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
@@ -81,6 +82,7 @@ class AmgxPgm : public ::testing::Test {
     using index_type = gko::int32;
     using Mtx = gko::matrix::Dense<>;
     using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Diag = gko::matrix::Diagonal<value_type>;
     AmgxPgm() : rand_engine(30) {}
 
     void SetUp()
@@ -136,7 +138,7 @@ class AmgxPgm : public ::testing::Test {
         d_coarse_vector = Mtx::create(cuda);
         d_fine_vector = Mtx::create(cuda);
         d_weight_csr = Csr::create(cuda);
-        d_weight_diag = Mtx::create(cuda);
+        d_weight_diag = Diag::create(cuda);
         d_agg = agg;
         d_unfinished_agg = unfinished_agg;
         d_strongest_neighbor = strongest_neighbor;
@@ -205,12 +207,12 @@ class AmgxPgm : public ::testing::Test {
 
     std::unique_ptr<Mtx> coarse_vector;
     std::unique_ptr<Mtx> fine_vector;
-    std::unique_ptr<Mtx> weight_diag;
+    std::unique_ptr<Diag> weight_diag;
     std::unique_ptr<Csr> weight_csr;
 
     std::unique_ptr<Mtx> d_coarse_vector;
     std::unique_ptr<Mtx> d_fine_vector;
-    std::unique_ptr<Mtx> d_weight_diag;
+    std::unique_ptr<Diag> d_weight_diag;
     std::unique_ptr<Csr> d_weight_csr;
 
     gko::size_type n;
diff --git a/hip/multigrid/amgx_pgm_kernels.hip.cpp b/hip/multigrid/amgx_pgm_kernels.hip.cpp
index e3e90a06b74..41fc38854a8 100644
--- a/hip/multigrid/amgx_pgm_kernels.hip.cpp
+++ b/hip/multigrid/amgx_pgm_kernels.hip.cpp
@@ -137,12 +137,12 @@ void find_strongest_neighbor(
 {
     const auto num = agg.get_num_elems();
     const dim3 grid(ceildiv(num, default_block_size));
-    hipLaunchKernelGGL(
-        kernel::find_strongest_neighbor_kernel, dim3(grid),
-        dim3(default_block_size), 0, 0, num, weight_mtx->get_const_row_ptrs(),
-        weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(),
-        diag->get_const_values(), diag->get_stride(), agg.get_data(),
-        strongest_neighbor.get_data());
+    hipLaunchKernelGGL(kernel::find_strongest_neighbor_kernel, dim3(grid),
+                       dim3(default_block_size), 0, 0, num,
+                       weight_mtx->get_const_row_ptrs(),
+                       weight_mtx->get_const_col_idxs(),
+                       weight_mtx->get_const_values(), diag->get_const_values(),
+                       agg.get_data(), strongest_neighbor.get_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
@@ -161,12 +161,11 @@ void assign_to_exist_agg(std::shared_ptr<const HipExecutor> exec,
                        : agg.get_data();
     const auto num = agg.get_num_elems();
     const dim3 grid(ceildiv(num, default_block_size));
-    hipLaunchKernelGGL(kernel::assign_to_exist_agg_kernel, dim3(grid),
-                       dim3(default_block_size), 0, 0, num,
-                       weight_mtx->get_const_row_ptrs(),
-                       weight_mtx->get_const_col_idxs(),
-                       weight_mtx->get_const_values(), diag->get_const_values(),
-                       diag->get_stride(), agg.get_const_data(), agg_val);
+    hipLaunchKernelGGL(
+        kernel::assign_to_exist_agg_kernel, dim3(grid),
+        dim3(default_block_size), 0, 0, num, weight_mtx->get_const_row_ptrs(),
+        weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(),
+        diag->get_const_values(), agg.get_const_data(), agg_val);
     if (intermediate_agg.get_num_elems() > 0) {
         // Copy the intermediate_agg to agg
         agg = intermediate_agg;
diff --git a/hip/test/multigrid/amgx_pgm_kernels.hip.cpp b/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
index 10befbe9315..9575e854207 100644
--- a/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
+++ b/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
@@ -81,6 +82,7 @@ class AmgxPgm : public ::testing::Test {
     using index_type = gko::int32;
     using Mtx = gko::matrix::Dense<>;
     using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Diag = gko::matrix::Diagonal<value_type>;
     AmgxPgm() : rand_engine(30) {}
 
     void SetUp()
@@ -136,7 +138,7 @@ class AmgxPgm : public ::testing::Test {
         d_coarse_vector = Mtx::create(hip);
         d_fine_vector = Mtx::create(hip);
         d_weight_csr = Csr::create(hip);
-        d_weight_diag = Mtx::create(hip);
+        d_weight_diag = Diag::create(hip);
         d_agg = agg;
         d_unfinished_agg = unfinished_agg;
         d_strongest_neighbor = strongest_neighbor;
@@ -205,12 +207,12 @@ class AmgxPgm : public ::testing::Test {
 
     std::unique_ptr<Mtx> coarse_vector;
     std::unique_ptr<Mtx> fine_vector;
-    std::unique_ptr<Mtx> weight_diag;
+    std::unique_ptr<Diag> weight_diag;
     std::unique_ptr<Csr> weight_csr;
 
     std::unique_ptr<Mtx> d_coarse_vector;
     std::unique_ptr<Mtx> d_fine_vector;
-    std::unique_ptr<Mtx> d_weight_diag;
+    std::unique_ptr<Diag> d_weight_diag;
     std::unique_ptr<Csr> d_weight_csr;
 
     gko::size_type n;
diff --git a/omp/multigrid/amgx_pgm_kernels.cpp b/omp/multigrid/amgx_pgm_kernels.cpp
index 540a7bdddcf..9c4efefebe6 100644
--- a/omp/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/multigrid/amgx_pgm_kernels.cpp
@@ -138,6 +138,7 @@ void find_strongest_neighbor(
     const auto row_ptrs = weight_mtx->get_const_row_ptrs();
     const auto col_idxs = weight_mtx->get_const_col_idxs();
     const auto vals = weight_mtx->get_const_values();
+    const auto diag_vals = diag->get_const_values();
 #pragma omp parallel for
     for (size_type row = 0; row < agg.get_num_elems(); row++) {
         auto max_weight_unagg = zero<ValueType>();
@@ -150,8 +151,8 @@ void find_strongest_neighbor(
                 if (col == row) {
                     continue;
                 }
-                auto weight = vals[idx] /
-                              max(abs(diag->at(row, 0)), abs(diag->at(col, 0)));
+                auto weight =
+                    vals[idx] / max(abs(diag_vals[row]), abs(diag_vals[col]));
                 if (agg.get_const_data()[col] == -1 &&
                     (weight > max_weight_unagg ||
                      (weight == max_weight_unagg && col > strongest_unagg))) {
@@ -198,6 +199,7 @@ void assign_to_exist_agg(std::shared_ptr<const OmpExecutor> exec,
     auto agg_val = (intermediate_agg.get_num_elems() > 0)
                        ? intermediate_agg.get_data()
                        : agg.get_data();
+    const auto diag_vals = diag->get_const_values();
 #pragma omp parallel for
     for (IndexType row = 0; row < agg.get_num_elems(); row++) {
         if (agg_const_val[row] != -1) {
@@ -211,7 +213,7 @@ void assign_to_exist_agg(std::shared_ptr<const OmpExecutor> exec,
                 continue;
             }
             auto weight =
-                vals[idx] / max(abs(diag->at(row, 0)), abs(diag->at(col, 0)));
+                vals[idx] / max(abs(diag_vals[row]), abs(diag_vals[col]));
             if (agg_const_val[col] != -1 &&
                 (weight > max_weight_agg ||
                  (weight == max_weight_agg && col > strongest_agg))) {
diff --git a/omp/test/multigrid/amgx_pgm_kernels.cpp b/omp/test/multigrid/amgx_pgm_kernels.cpp
index cc822135a5d..09e1cf918a3 100644
--- a/omp/test/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/test/multigrid/amgx_pgm_kernels.cpp
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
@@ -80,6 +81,7 @@ class AmgxPgm : public ::testing::Test {
     using index_type = gko::int32;
     using Mtx = gko::matrix::Dense<>;
     using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Diag = gko::matrix::Diagonal<value_type>;
     AmgxPgm() : rand_engine(30) {}
 
     void SetUp()
@@ -127,7 +129,7 @@ class AmgxPgm : public ::testing::Test {
         d_coarse_vector = Mtx::create(omp);
         d_fine_vector = Mtx::create(omp);
         d_weight_csr = Csr::create(omp);
-        d_weight_diag = Mtx::create(omp);
+        d_weight_diag = Diag::create(omp);
         d_agg = agg;
         d_unfinished_agg = unfinished_agg;
         d_strongest_neighbor = strongest_neighbor;
@@ -196,12 +198,12 @@ class AmgxPgm : public ::testing::Test {
 
     std::unique_ptr<Mtx> coarse_vector;
     std::unique_ptr<Mtx> fine_vector;
-    std::unique_ptr<Mtx> weight_diag;
+    std::unique_ptr<Diag> weight_diag;
     std::unique_ptr<Csr> weight_csr;
 
     std::unique_ptr<Mtx> d_coarse_vector;
     std::unique_ptr<Mtx> d_fine_vector;
-    std::unique_ptr<Mtx> d_weight_diag;
+    std::unique_ptr<Diag> d_weight_diag;
     std::unique_ptr<Csr> d_weight_csr;
 
     gko::size_type n;

From 9009d4b60087b6f6a8cfd416f5c989ac480fd0c8 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 27 Jan 2021 16:37:50 +0800
Subject: [PATCH 08/16] fix the type of the tests

---
 cuda/test/multigrid/amgx_pgm_kernels.cpp    | 10 +++++-----
 hip/test/multigrid/amgx_pgm_kernels.hip.cpp | 10 +++++-----
 omp/test/multigrid/amgx_pgm_kernels.cpp     | 10 +++++-----
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
index fa4e582085d..a9852bd3118 100644
--- a/cuda/test/multigrid/amgx_pgm_kernels.cpp
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -267,8 +267,8 @@ TEST_F(AmgxPgm, MatchEdgeIsEquivalentToRef)
 TEST_F(AmgxPgm, CountUnaggIsEquivalentToRef)
 {
     initialize_data();
-    gko::size_type num_unagg;
-    gko::size_type d_num_unagg;
+    index_type num_unagg;
+    index_type d_num_unagg;
 
     gko::kernels::reference::amgx_pgm::count_unagg(ref, agg, &num_unagg);
     gko::kernels::cuda::amgx_pgm::count_unagg(cuda, d_agg, &d_num_unagg);
@@ -282,8 +282,8 @@ TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
     initialize_data();
     auto x = unfinished_agg;
     auto d_x = d_unfinished_agg;
-    gko::size_type num_agg;
-    gko::size_type d_num_agg;
+    index_type num_agg;
+    index_type d_num_agg;
 
     gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
     gko::kernels::cuda::amgx_pgm::renumber(cuda, d_agg, &d_num_agg);
@@ -331,7 +331,7 @@ TEST_F(AmgxPgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
     initialize_data();
     auto d_x = d_unfinished_agg;
     auto d_intermediate_agg = gko::Array<index_type>(cuda, 0);
-    gko::size_type d_num_unagg;
+    index_type d_num_unagg;
 
     gko::kernels::cuda::amgx_pgm::assign_to_exist_agg(
         cuda, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);
diff --git a/hip/test/multigrid/amgx_pgm_kernels.hip.cpp b/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
index 9575e854207..0d059b631b2 100644
--- a/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
+++ b/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
@@ -267,8 +267,8 @@ TEST_F(AmgxPgm, MatchEdgeIsEquivalentToRef)
 TEST_F(AmgxPgm, CountUnaggIsEquivalentToRef)
 {
     initialize_data();
-    gko::size_type num_unagg;
-    gko::size_type d_num_unagg;
+    index_type num_unagg;
+    index_type d_num_unagg;
 
     gko::kernels::reference::amgx_pgm::count_unagg(ref, agg, &num_unagg);
     gko::kernels::hip::amgx_pgm::count_unagg(hip, d_agg, &d_num_unagg);
@@ -282,8 +282,8 @@ TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
     initialize_data();
     auto x = unfinished_agg;
     auto d_x = d_unfinished_agg;
-    gko::size_type num_agg;
-    gko::size_type d_num_agg;
+    index_type num_agg;
+    index_type d_num_agg;
 
     gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
     gko::kernels::hip::amgx_pgm::renumber(hip, d_agg, &d_num_agg);
@@ -331,7 +331,7 @@ TEST_F(AmgxPgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
     initialize_data();
     auto d_x = d_unfinished_agg;
     auto d_intermediate_agg = gko::Array<index_type>(hip, 0);
-    gko::size_type d_num_unagg;
+    index_type d_num_unagg;
 
     gko::kernels::hip::amgx_pgm::assign_to_exist_agg(
         hip, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);
diff --git a/omp/test/multigrid/amgx_pgm_kernels.cpp b/omp/test/multigrid/amgx_pgm_kernels.cpp
index 09e1cf918a3..3b00ea89054 100644
--- a/omp/test/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/test/multigrid/amgx_pgm_kernels.cpp
@@ -258,8 +258,8 @@ TEST_F(AmgxPgm, MatchEdgeIsEquivalentToRef)
 TEST_F(AmgxPgm, CountUnaggIsEquivalentToRef)
 {
     initialize_data();
-    gko::size_type num_unagg;
-    gko::size_type d_num_unagg;
+    index_type num_unagg;
+    index_type d_num_unagg;
 
     gko::kernels::reference::amgx_pgm::count_unagg(ref, agg, &num_unagg);
     gko::kernels::omp::amgx_pgm::count_unagg(omp, d_agg, &d_num_unagg);
@@ -273,8 +273,8 @@ TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
     initialize_data();
     auto x = unfinished_agg;
     auto d_x = d_unfinished_agg;
-    gko::size_type num_agg;
-    gko::size_type d_num_agg;
+    index_type num_agg;
+    index_type d_num_agg;
 
     gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
     gko::kernels::omp::amgx_pgm::renumber(omp, d_agg, &d_num_agg);
@@ -322,7 +322,7 @@ TEST_F(AmgxPgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
     initialize_data();
     auto d_x = d_unfinished_agg;
     auto d_intermediate_agg = gko::Array<index_type>(omp, 0);
-    gko::size_type d_num_unagg;
+    index_type d_num_unagg;
 
     gko::kernels::omp::amgx_pgm::assign_to_exist_agg(
         omp, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);

From ac1f66e8fea26b6bc200b169bc6ec798b8544022 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 27 Jan 2021 16:39:49 +0800
Subject: [PATCH 09/16] format

---
 cuda/test/multigrid/amgx_pgm_kernels.cpp    | 2 +-
 hip/test/multigrid/amgx_pgm_kernels.hip.cpp | 2 +-
 omp/test/multigrid/amgx_pgm_kernels.cpp     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
index a9852bd3118..2d855dc328b 100644
--- a/cuda/test/multigrid/amgx_pgm_kernels.cpp
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2020, the Ginkgo authors
+Copyright (c) 2017-2021, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/hip/test/multigrid/amgx_pgm_kernels.hip.cpp b/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
index 0d059b631b2..6015b2f5522 100644
--- a/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
+++ b/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2020, the Ginkgo authors
+Copyright (c) 2017-2021, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/omp/test/multigrid/amgx_pgm_kernels.cpp b/omp/test/multigrid/amgx_pgm_kernels.cpp
index 3b00ea89054..22fea84bcaa 100644
--- a/omp/test/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/test/multigrid/amgx_pgm_kernels.cpp
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2020, the Ginkgo authors
+Copyright (c) 2017-2021, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

From 32bffa2f7e93688b4e857c3d35394a89c8d4f9a1 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Wed, 10 Mar 2021 23:53:40 +0800
Subject: [PATCH 10/16] delete the tests using unused function

---
 common/multigrid/amgx_pgm_kernels.hpp.inc   | 38 +--------------------
 cuda/test/multigrid/amgx_pgm_kernels.cpp    | 32 -----------------
 hip/test/multigrid/amgx_pgm_kernels.hip.cpp | 32 -----------------
 omp/test/multigrid/amgx_pgm_kernels.cpp     | 32 -----------------
 4 files changed, 1 insertion(+), 133 deletions(-)

diff --git a/common/multigrid/amgx_pgm_kernels.hpp.inc b/common/multigrid/amgx_pgm_kernels.hpp.inc
index d8bda57a2e0..8565aaa6227 100644
--- a/common/multigrid/amgx_pgm_kernels.hpp.inc
+++ b/common/multigrid/amgx_pgm_kernels.hpp.inc
@@ -1,5 +1,5 @@
 /*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2020, the Ginkgo authors
+Copyright (c) 2017-2021, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -33,42 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace kernel {
 
 
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void restrict_apply_kernel(
-    const IndexType *__restrict__ agg, const size_type num_rows,
-    const size_type num_rhs, const ValueType *__restrict__ b,
-    const size_type b_stride, ValueType *__restrict__ x,
-    const size_type x_stride)
-{
-    auto tidx = thread::get_thread_id_flat();
-    auto row = tidx / num_rhs;
-    if (row >= num_rows) {
-        return;
-    }
-    auto col = tidx % num_rhs;
-    auto ind = agg[row];
-    atomic_add(x + ind * x_stride + col, b[row * b_stride + col]);
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void prolong_applyadd_kernel(
-    const IndexType *__restrict__ agg, const size_type num_rows,
-    const size_type num_rhs, const ValueType *__restrict__ b,
-    const size_type b_stride, ValueType *__restrict__ x,
-    const size_type x_stride)
-{
-    auto tidx = thread::get_thread_id_flat();
-    auto row = tidx / num_rhs;
-    if (row >= num_rows) {
-        return;
-    }
-    auto col = tidx % num_rhs;
-    auto ind = agg[row];
-    x[row * x_stride + col] += b[ind * b_stride + col];
-}
-
-
 template <typename IndexType>
 __global__ __launch_bounds__(default_block_size) void replace_kernel(
     size_type size, const IndexType *__restrict__ source,
diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
index 2d855dc328b..6a3071e5ce0 100644
--- a/cuda/test/multigrid/amgx_pgm_kernels.cpp
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -219,38 +219,6 @@ class AmgxPgm : public ::testing::Test {
 };
 
 
-TEST_F(AmgxPgm, RestrictApplyIsEquivalentToRef)
-{
-    initialize_data();
-    // fine->coarse
-    auto x = Mtx::create_with_config_of(gko::lend(coarse_vector));
-    auto d_x = Mtx::create_with_config_of(gko::lend(d_coarse_vector));
-
-    gko::kernels::reference::amgx_pgm::restrict_apply(
-        ref, agg, fine_vector.get(), x.get());
-    gko::kernels::cuda::amgx_pgm::restrict_apply(
-        cuda, d_agg, d_fine_vector.get(), d_x.get());
-
-    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
-}
-
-
-TEST_F(AmgxPgm, ProlongApplyaddIsEquivalentToRef)
-{
-    initialize_data();
-    // coarse->fine
-    auto x = fine_vector->clone();
-    auto d_x = d_fine_vector->clone();
-
-    gko::kernels::reference::amgx_pgm::prolong_applyadd(
-        ref, agg, coarse_vector.get(), x.get());
-    gko::kernels::cuda::amgx_pgm::prolong_applyadd(
-        cuda, d_agg, d_coarse_vector.get(), d_x.get());
-
-    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
-}
-
-
 TEST_F(AmgxPgm, MatchEdgeIsEquivalentToRef)
 {
     initialize_data();
diff --git a/hip/test/multigrid/amgx_pgm_kernels.hip.cpp b/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
index 6015b2f5522..ad1e8f040e7 100644
--- a/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
+++ b/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
@@ -219,38 +219,6 @@ class AmgxPgm : public ::testing::Test {
 };
 
 
-TEST_F(AmgxPgm, RestrictApplyIsEquivalentToRef)
-{
-    initialize_data();
-    // fine->coarse
-    auto x = Mtx::create_with_config_of(gko::lend(coarse_vector));
-    auto d_x = Mtx::create_with_config_of(gko::lend(d_coarse_vector));
-
-    gko::kernels::reference::amgx_pgm::restrict_apply(
-        ref, agg, fine_vector.get(), x.get());
-    gko::kernels::hip::amgx_pgm::restrict_apply(hip, d_agg, d_fine_vector.get(),
-                                                d_x.get());
-
-    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
-}
-
-
-TEST_F(AmgxPgm, ProlongApplyaddIsEquivalentToRef)
-{
-    initialize_data();
-    // coarse->fine
-    auto x = fine_vector->clone();
-    auto d_x = d_fine_vector->clone();
-
-    gko::kernels::reference::amgx_pgm::prolong_applyadd(
-        ref, agg, coarse_vector.get(), x.get());
-    gko::kernels::hip::amgx_pgm::prolong_applyadd(
-        hip, d_agg, d_coarse_vector.get(), d_x.get());
-
-    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
-}
-
-
 TEST_F(AmgxPgm, MatchEdgeIsEquivalentToRef)
 {
     initialize_data();
diff --git a/omp/test/multigrid/amgx_pgm_kernels.cpp b/omp/test/multigrid/amgx_pgm_kernels.cpp
index 22fea84bcaa..9a27ef3f4fc 100644
--- a/omp/test/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/test/multigrid/amgx_pgm_kernels.cpp
@@ -210,38 +210,6 @@ class AmgxPgm : public ::testing::Test {
 };
 
 
-TEST_F(AmgxPgm, RestrictApplyIsEquivalentToRef)
-{
-    initialize_data();
-    // fine->coarse
-    auto x = Mtx::create_with_config_of(gko::lend(coarse_vector));
-    auto d_x = Mtx::create_with_config_of(gko::lend(d_coarse_vector));
-
-    gko::kernels::reference::amgx_pgm::restrict_apply(
-        ref, agg, fine_vector.get(), x.get());
-    gko::kernels::omp::amgx_pgm::restrict_apply(omp, d_agg, d_fine_vector.get(),
-                                                d_x.get());
-
-    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
-}
-
-
-TEST_F(AmgxPgm, ProlongApplyaddIsEquivalentToRef)
-{
-    initialize_data();
-    // coarse->fine
-    auto x = fine_vector->clone();
-    auto d_x = d_fine_vector->clone();
-
-    gko::kernels::reference::amgx_pgm::prolong_applyadd(
-        ref, agg, coarse_vector.get(), x.get());
-    gko::kernels::omp::amgx_pgm::prolong_applyadd(
-        omp, d_agg, d_coarse_vector.get(), d_x.get());
-
-    GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14);
-}
-
-
 TEST_F(AmgxPgm, MatchEdgeIsEquivalentToRef)
 {
     initialize_data();

From 29dfbaa2a6acb5cc70fc58220347f7acadece06a Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Mon, 15 Mar 2021 15:24:23 +0800
Subject: [PATCH 11/16] change deprecated header, fix wrong type, add test

---
 common/multigrid/amgx_pgm_kernels.hpp.inc     | 45 ++++++++++++++++---
 cuda/multigrid/amgx_pgm_kernels.cu            | 19 +++++---
 cuda/test/multigrid/amgx_pgm_kernels.cpp      | 30 +++++++++++++
 dpcpp/multigrid/amgx_pgm_kernels.dp.cpp       | 10 ++---
 hip/multigrid/amgx_pgm_kernels.hip.cpp        | 24 ++++++----
 hip/test/multigrid/CMakeLists.txt             |  2 +-
 ...m_kernels.hip.cpp => amgx_pgm_kernels.cpp} | 30 +++++++++++++
 reference/test/multigrid/amgx_pgm_kernels.cpp |  2 +-
 8 files changed, 134 insertions(+), 28 deletions(-)
 rename hip/test/multigrid/{amgx_pgm_kernels.hip.cpp => amgx_pgm_kernels.cpp} (88%)

diff --git a/common/multigrid/amgx_pgm_kernels.hpp.inc b/common/multigrid/amgx_pgm_kernels.hpp.inc
index 8565aaa6227..eda2854763e 100644
--- a/common/multigrid/amgx_pgm_kernels.hpp.inc
+++ b/common/multigrid/amgx_pgm_kernels.hpp.inc
@@ -52,14 +52,14 @@ __global__ __launch_bounds__(default_block_size) void match_edge_kernel(
     size_type num, const IndexType *__restrict__ strongest_neighbor_vals,
     IndexType *__restrict__ agg_vals)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<IndexType>();
     if (tidx >= num) {
         return;
     }
     if (agg_vals[tidx] != -1) {
         return;
     }
-    size_type neighbor = strongest_neighbor_vals[tidx];
+    auto neighbor = strongest_neighbor_vals[tidx];
     if (neighbor != -1 && strongest_neighbor_vals[neighbor] == tidx) {
         // Use the smaller index as agg point
         auto group = min(tidx, neighbor);
@@ -175,10 +175,7 @@ __global__
         IndexType *__restrict__ agg_val)
 {
     auto row = thread::get_thread_id_flat();
-    if (row >= num) {
-        return;
-    }
-    if (agg_val[row] != -1) {
+    if (row >= num || agg_val[row] != -1) {
         return;
     }
     ValueType max_weight_agg = zero<ValueType>();
@@ -203,6 +200,42 @@ __global__
     }
 }
 
+// Nondeterministic variant of the kernel above: agg_val serves as both the
+// input and the output, so a row may see neighbors assigned in this launch.
+template <typename ValueType, typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void assign_to_exist_agg_kernel(
+        const size_type num, const IndexType *__restrict__ row_ptrs,
+        const IndexType *__restrict__ col_idxs,
+        const ValueType *__restrict__ weight_vals,
+        const ValueType *__restrict__ diag, IndexType *__restrict__ agg_val)
+{
+    auto row = thread::get_thread_id_flat();
+    if (row >= num || agg_val[row] != -1) {
+        return;  // out of range, or this row already has an aggregate
+    }
+    ValueType max_weight_agg = zero<ValueType>();
+    IndexType strongest_agg = -1;  // -1 means no candidate found yet
+    for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
+        auto col = col_idxs[idx];
+        if (col == row) {
+            continue;  // ignore the diagonal entry
+        }
+        auto weight = weight_vals[idx] / max(abs(diag[row]), abs(diag[col]));
+        if (agg_val[col] != -1 &&
+            (weight > max_weight_agg ||  // ties go to the larger column
+             (weight == max_weight_agg && col > strongest_agg))) {
+            max_weight_agg = weight;
+            strongest_agg = col;
+        }
+    }
+    if (strongest_agg != -1) {
+        agg_val[row] = agg_val[strongest_agg];  // join that neighbor's aggregate
+    } else {
+        agg_val[row] = row;  // no aggregated neighbor: use its own index
+    }
+}
+
 
 template <typename IndexType>
 __global__ __launch_bounds__(default_block_size) void get_source_row_map_kernel(
diff --git a/cuda/multigrid/amgx_pgm_kernels.cu b/cuda/multigrid/amgx_pgm_kernels.cu
index b8700466365..1560d7bf7f7 100644
--- a/cuda/multigrid/amgx_pgm_kernels.cu
+++ b/cuda/multigrid/amgx_pgm_kernels.cu
@@ -149,18 +149,23 @@ void assign_to_exist_agg(std::shared_ptr<const CudaExecutor> exec,
                          Array<IndexType> &agg,
                          Array<IndexType> &intermediate_agg)
 {
-    auto agg_val = (intermediate_agg.get_num_elems() > 0)
-                       ? intermediate_agg.get_data()
-                       : agg.get_data();
     const auto num = agg.get_num_elems();
     const dim3 grid(ceildiv(num, default_block_size));
-    kernel::assign_to_exist_agg_kernel<<<grid, default_block_size>>>(
-        num, weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(),
-        weight_mtx->get_const_values(), diag->get_const_values(),
-        agg.get_const_data(), agg_val);
     if (intermediate_agg.get_num_elems() > 0) {
+        // deterministic kernel
+        kernel::assign_to_exist_agg_kernel<<<grid, default_block_size>>>(
+            num, weight_mtx->get_const_row_ptrs(),
+            weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(),
+            diag->get_const_values(), agg.get_const_data(),
+            intermediate_agg.get_data());
         // Copy the intermediate_agg to agg
         agg = intermediate_agg;
+    } else {
+        // nondeterministic kernel
+        kernel::assign_to_exist_agg_kernel<<<grid, default_block_size>>>(
+            num, weight_mtx->get_const_row_ptrs(),
+            weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(),
+            diag->get_const_values(), agg.get_data());
     }
 }
 
diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
index 6a3071e5ce0..10d2b97a8bd 100644
--- a/cuda/test/multigrid/amgx_pgm_kernels.cpp
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -131,6 +131,10 @@ class AmgxPgm : public ::testing::Test {
         weight_csr = Csr::create(ref);
         weight->convert_to(weight_csr.get());
         weight_diag = weight_csr->extract_diagonal();
+        auto system_dense = gen_mtx(m, m);
+        make_spd(system_dense.get());
+        system_mtx = Csr::create(ref);
+        system_dense->convert_to(system_mtx.get());
 
         d_agg.set_executor(cuda);
         d_unfinished_agg.set_executor(cuda);
@@ -139,6 +143,7 @@ class AmgxPgm : public ::testing::Test {
         d_fine_vector = Mtx::create(cuda);
         d_weight_csr = Csr::create(cuda);
         d_weight_diag = Diag::create(cuda);
+        d_system_mtx = Csr::create(cuda);
         d_agg = agg;
         d_unfinished_agg = unfinished_agg;
         d_strongest_neighbor = strongest_neighbor;
@@ -146,6 +151,7 @@ class AmgxPgm : public ::testing::Test {
         d_fine_vector->copy_from(fine_vector.get());
         d_weight_csr->copy_from(weight_csr.get());
         d_weight_diag->copy_from(weight_diag.get());
+        d_system_mtx->copy_from(system_mtx.get());
     }
 
     void make_symetric(Mtx *mtx)
@@ -209,11 +215,13 @@ class AmgxPgm : public ::testing::Test {
     std::unique_ptr<Mtx> fine_vector;
     std::unique_ptr<Diag> weight_diag;
     std::unique_ptr<Csr> weight_csr;
+    std::shared_ptr<Csr> system_mtx;
 
     std::unique_ptr<Mtx> d_coarse_vector;
     std::unique_ptr<Mtx> d_fine_vector;
     std::unique_ptr<Diag> d_weight_diag;
     std::unique_ptr<Csr> d_weight_csr;
+    std::shared_ptr<Csr> d_system_mtx;
 
     gko::size_type n;
 };
@@ -329,4 +337,26 @@ TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
 }
 
 
+TEST_F(AmgxPgm, GenerateMgLevelIsEquivalentToRef)
+{
+    initialize_data();
+    auto mg_level_factory = gko::multigrid::AmgxPgm<double, int>::build()
+                                .with_deterministic(true)
+                                .on(ref);
+    auto d_mg_level_factory = gko::multigrid::AmgxPgm<double, int>::build()
+                                  .with_deterministic(true)
+                                  .on(cuda);
+
+    auto mg_level = mg_level_factory->generate(system_mtx);
+    auto d_mg_level = d_mg_level_factory->generate(d_system_mtx);
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Csr>(d_mg_level->get_restrict_op()),
+                        gko::as<Csr>(mg_level->get_restrict_op()), 1e-14);
+    GKO_ASSERT_MTX_NEAR(gko::as<Csr>(d_mg_level->get_coarse_op()),
+                        gko::as<Csr>(mg_level->get_coarse_op()), 1e-14);
+    GKO_ASSERT_MTX_NEAR(gko::as<Csr>(d_mg_level->get_prolong_op()),
+                        gko::as<Csr>(mg_level->get_prolong_op()), 1e-14);
+}
+
+
 }  // namespace
diff --git a/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp b/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp
index 933fc6fda7d..32817e5e834 100644
--- a/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp
+++ b/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp
@@ -101,11 +101,11 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void amgx_pgm_generate(std::shared_ptr<const DpcppExecutor> exec,
-                       const matrix::Csr<ValueType, IndexType> *source,
-                       const Array<IndexType> &agg,
-                       matrix::Csr<ValueType, IndexType> *coarse)
-    GKO_NOT_IMPLEMENTED;
+void amgx_pgm_generate(
+    std::shared_ptr<const DpcppExecutor> exec,
+    const matrix::Csr<ValueType, IndexType> *source,
+    const Array<IndexType> &agg, matrix::Csr<ValueType, IndexType> *coarse,
+    matrix::Csr<ValueType, IndexType> *temp) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
 
diff --git a/hip/multigrid/amgx_pgm_kernels.hip.cpp b/hip/multigrid/amgx_pgm_kernels.hip.cpp
index 41fc38854a8..2199f53e5c3 100644
--- a/hip/multigrid/amgx_pgm_kernels.hip.cpp
+++ b/hip/multigrid/amgx_pgm_kernels.hip.cpp
@@ -156,19 +156,27 @@ void assign_to_exist_agg(std::shared_ptr<const HipExecutor> exec,
                          Array<IndexType> &agg,
                          Array<IndexType> &intermediate_agg)
 {
-    auto agg_val = (intermediate_agg.get_num_elems() > 0)
-                       ? intermediate_agg.get_data()
-                       : agg.get_data();
     const auto num = agg.get_num_elems();
     const dim3 grid(ceildiv(num, default_block_size));
-    hipLaunchKernelGGL(
-        kernel::assign_to_exist_agg_kernel, dim3(grid),
-        dim3(default_block_size), 0, 0, num, weight_mtx->get_const_row_ptrs(),
-        weight_mtx->get_const_col_idxs(), weight_mtx->get_const_values(),
-        diag->get_const_values(), agg.get_const_data(), agg_val);
+
     if (intermediate_agg.get_num_elems() > 0) {
+        // deterministic kernel
+        hipLaunchKernelGGL(
+            kernel::assign_to_exist_agg_kernel, dim3(grid),
+            dim3(default_block_size), 0, 0, num,
+            weight_mtx->get_const_row_ptrs(), weight_mtx->get_const_col_idxs(),
+            weight_mtx->get_const_values(), diag->get_const_values(),
+            agg.get_const_data(), intermediate_agg.get_data());
         // Copy the intermediate_agg to agg
         agg = intermediate_agg;
+    } else {
+        // nondeterministic kernel
+        hipLaunchKernelGGL(kernel::assign_to_exist_agg_kernel, dim3(grid),
+                           dim3(default_block_size), 0, 0, num,
+                           weight_mtx->get_const_row_ptrs(),
+                           weight_mtx->get_const_col_idxs(),
+                           weight_mtx->get_const_values(),
+                           diag->get_const_values(), agg.get_data());
     }
 }
 
diff --git a/hip/test/multigrid/CMakeLists.txt b/hip/test/multigrid/CMakeLists.txt
index 1c8534e5337..481c2cc1bf2 100644
--- a/hip/test/multigrid/CMakeLists.txt
+++ b/hip/test/multigrid/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_create_hip_test(amgx_pgm_kernels)
+ginkgo_create_hip_test_special_linkage(amgx_pgm_kernels)
diff --git a/hip/test/multigrid/amgx_pgm_kernels.hip.cpp b/hip/test/multigrid/amgx_pgm_kernels.cpp
similarity index 88%
rename from hip/test/multigrid/amgx_pgm_kernels.hip.cpp
rename to hip/test/multigrid/amgx_pgm_kernels.cpp
index ad1e8f040e7..71acf6ed06a 100644
--- a/hip/test/multigrid/amgx_pgm_kernels.hip.cpp
+++ b/hip/test/multigrid/amgx_pgm_kernels.cpp
@@ -131,6 +131,10 @@ class AmgxPgm : public ::testing::Test {
         weight_csr = Csr::create(ref);
         weight->convert_to(weight_csr.get());
         weight_diag = weight_csr->extract_diagonal();
+        auto system_dense = gen_mtx(m, m);
+        make_spd(system_dense.get());
+        system_mtx = Csr::create(ref);
+        system_dense->convert_to(system_mtx.get());
 
         d_agg.set_executor(hip);
         d_unfinished_agg.set_executor(hip);
@@ -139,6 +143,7 @@ class AmgxPgm : public ::testing::Test {
         d_fine_vector = Mtx::create(hip);
         d_weight_csr = Csr::create(hip);
         d_weight_diag = Diag::create(hip);
+        d_system_mtx = Csr::create(hip);
         d_agg = agg;
         d_unfinished_agg = unfinished_agg;
         d_strongest_neighbor = strongest_neighbor;
@@ -146,6 +151,7 @@ class AmgxPgm : public ::testing::Test {
         d_fine_vector->copy_from(fine_vector.get());
         d_weight_csr->copy_from(weight_csr.get());
         d_weight_diag->copy_from(weight_diag.get());
+        d_system_mtx->copy_from(system_mtx.get());
     }
 
     void make_symetric(Mtx *mtx)
@@ -209,11 +215,13 @@ class AmgxPgm : public ::testing::Test {
     std::unique_ptr<Mtx> fine_vector;
     std::unique_ptr<Diag> weight_diag;
     std::unique_ptr<Csr> weight_csr;
+    std::shared_ptr<Csr> system_mtx;
 
     std::unique_ptr<Mtx> d_coarse_vector;
     std::unique_ptr<Mtx> d_fine_vector;
     std::unique_ptr<Diag> d_weight_diag;
     std::unique_ptr<Csr> d_weight_csr;
+    std::shared_ptr<Csr> d_system_mtx;
 
     gko::size_type n;
 };
@@ -329,4 +337,26 @@ TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
 }
 
 
+TEST_F(AmgxPgm, GenerateMgLevelIsEquivalentToRef)
+{
+    initialize_data();
+    auto mg_level_factory = gko::multigrid::AmgxPgm<double, int>::build()
+                                .with_deterministic(true)
+                                .on(ref);
+    auto d_mg_level_factory = gko::multigrid::AmgxPgm<double, int>::build()
+                                  .with_deterministic(true)
+                                  .on(hip);
+
+    auto mg_level = mg_level_factory->generate(system_mtx);
+    auto d_mg_level = d_mg_level_factory->generate(d_system_mtx);
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Csr>(d_mg_level->get_restrict_op()),
+                        gko::as<Csr>(mg_level->get_restrict_op()), 1e-14);
+    GKO_ASSERT_MTX_NEAR(gko::as<Csr>(d_mg_level->get_coarse_op()),
+                        gko::as<Csr>(mg_level->get_coarse_op()), 1e-14);
+    GKO_ASSERT_MTX_NEAR(gko::as<Csr>(d_mg_level->get_prolong_op()),
+                        gko::as<Csr>(mg_level->get_prolong_op()), 1e-14);
+}
+
+
 }  // namespace
diff --git a/reference/test/multigrid/amgx_pgm_kernels.cpp b/reference/test/multigrid/amgx_pgm_kernels.cpp
index 847e859529e..09918310823 100644
--- a/reference/test/multigrid/amgx_pgm_kernels.cpp
+++ b/reference/test/multigrid/amgx_pgm_kernels.cpp
@@ -47,7 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
-#include <ginkgo/core/stop/residual_norm_reduction.hpp>
+#include <ginkgo/core/stop/residual_norm.hpp>
 #include <ginkgo/core/stop/time.hpp>
 
 

From 51f003828cf21ee504d3b86419aba91f6177bd5c Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 26 Mar 2021 03:05:41 +0800
Subject: [PATCH 12/16] review update, fix omp issue, collect dup. helpers

---
 common/multigrid/amgx_pgm_kernels.hpp.inc | 22 +-----
 core/test/utils.hpp                       |  3 +
 core/test/utils/array_generator.hpp       | 69 +++++++++++++++++
 core/test/utils/matrix_generator.hpp      | 23 +-----
 core/test/utils/matrix_utils.hpp          | 84 ++++++++++++++++++++
 core/test/utils/value_generator.hpp       | 69 +++++++++++++++++
 cuda/test/multigrid/amgx_pgm_kernels.cpp  | 67 ++--------------
 cuda/test/solver/bicg_kernels.cpp         | 29 +------
 cuda/test/solver/bicgstab_kernels.cpp     | 14 +---
 cuda/test/solver/cg_kernels.cpp           | 29 +------
 cuda/test/solver/cgs_kernels.cpp          | 14 +---
 cuda/test/solver/fcg_kernels.cpp          | 29 +------
 cuda/test/solver/idr_kernels.cpp          | 12 ---
 hip/test/multigrid/CMakeLists.txt         |  2 +-
 hip/test/multigrid/amgx_pgm_kernels.cpp   | 66 ++--------------
 hip/test/solver/bicg_kernels.cpp          | 29 +------
 hip/test/solver/bicgstab_kernels.cpp      | 14 +---
 hip/test/solver/cg_kernels.cpp            | 29 +------
 hip/test/solver/cgs_kernels.cpp           | 14 +---
 hip/test/solver/fcg_kernels.cpp           | 29 +------
 hip/test/solver/idr_kernels.cpp           | 12 ---
 omp/multigrid/amgx_pgm_kernels.cpp        | 13 ++--
 omp/test/multigrid/amgx_pgm_kernels.cpp   | 94 +++++++++--------------
 omp/test/solver/bicg_kernels.cpp          | 29 +------
 omp/test/solver/bicgstab_kernels.cpp      | 14 +---
 omp/test/solver/cg_kernels.cpp            | 29 +------
 omp/test/solver/cgs_kernels.cpp           | 14 +---
 omp/test/solver/fcg_kernels.cpp           | 29 +------
 omp/test/solver/gmres_kernels.cpp         | 12 ---
 omp/test/solver/idr_kernels.cpp           | 12 ---
 reference/multigrid/amgx_pgm_kernels.cpp  |  6 +-
 31 files changed, 306 insertions(+), 605 deletions(-)
 create mode 100644 core/test/utils/array_generator.hpp
 create mode 100644 core/test/utils/matrix_utils.hpp
 create mode 100644 core/test/utils/value_generator.hpp

diff --git a/common/multigrid/amgx_pgm_kernels.hpp.inc b/common/multigrid/amgx_pgm_kernels.hpp.inc
index eda2854763e..9c2d434f508 100644
--- a/common/multigrid/amgx_pgm_kernels.hpp.inc
+++ b/common/multigrid/amgx_pgm_kernels.hpp.inc
@@ -33,20 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace kernel {
 
 
-template <typename IndexType>
-__global__ __launch_bounds__(default_block_size) void replace_kernel(
-    size_type size, const IndexType *__restrict__ source,
-    IndexType *__restrict__ result)
-{
-    auto tidx = thread::get_thread_id_flat();
-    if (tidx >= size) {
-        return;
-    }
-
-    result[tidx] = source[tidx] == -1;
-}
-
-
 template <typename IndexType>
 __global__ __launch_bounds__(default_block_size) void match_edge_kernel(
     size_type num, const IndexType *__restrict__ strongest_neighbor_vals,
@@ -60,11 +46,11 @@ __global__ __launch_bounds__(default_block_size) void match_edge_kernel(
         return;
     }
     auto neighbor = strongest_neighbor_vals[tidx];
-    if (neighbor != -1 && strongest_neighbor_vals[neighbor] == tidx) {
+    if (neighbor != -1 && strongest_neighbor_vals[neighbor] == tidx &&
+        tidx < neighbor) {
         // Use the smaller index as agg point
-        auto group = min(tidx, neighbor);
-        agg_vals[tidx] = group;
-        agg_vals[neighbor] = group;
+        agg_vals[tidx] = tidx;
+        agg_vals[neighbor] = tidx;
     }
 }
 
diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index 6cc2ad102c5..05bf216c8bb 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -48,8 +48,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/base/extended_float.hpp"
+#include "core/test/utils/array_generator.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "core/test/utils/matrix_generator.hpp"
+#include "core/test/utils/matrix_utils.hpp"
+#include "core/test/utils/value_generator.hpp"
 
 
 namespace gko {
diff --git a/core/test/utils/array_generator.hpp b/core/test/utils/array_generator.hpp
new file mode 100644
index 00000000000..1ad52c18e26
--- /dev/null
+++ b/core/test/utils/array_generator.hpp
@@ -0,0 +1,69 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_TEST_UTILS_ARRAY_GENERATOR_HPP_
+#define GKO_CORE_TEST_UTILS_ARRAY_GENERATOR_HPP_
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/utils.hpp>
+
+
+#include "core/test/utils/value_generator.hpp"
+
+
+namespace gko {
+namespace test {
+
+
+template <typename ValueType, typename ValueDistribution, typename Engine>
+Array<ValueType> generate_random_array(size_type num,
+                                       ValueDistribution &&value_dist,
+                                       Engine &&engine,
+                                       std::shared_ptr<const Executor> exec)
+{
+    Array<ValueType> array(exec->get_master(), num);  // filled on the master
+    auto val = array.get_data();
+    for (size_type i = 0; i < num; i++) {  // index has the same type as num
+        val[i] = detail::get_rand_value<ValueType>(value_dist, engine);
+    }
+    array.set_executor(exec);  // hand the array over to the requested exec
+    return array;
+}
+
+
+}  // namespace test
+}  // namespace gko
+
+
+#endif  // GKO_CORE_TEST_UTILS_ARRAY_GENERATOR_HPP_
diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp
index c3f9b2fb50d..04f98126c41 100644
--- a/core/test/utils/matrix_generator.hpp
+++ b/core/test/utils/matrix_generator.hpp
@@ -46,28 +46,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-namespace gko {
-namespace test {
-namespace detail {
-
-
-template <typename ValueType, typename Distribution, typename Generator>
-typename std::enable_if<!is_complex_s<ValueType>::value, ValueType>::type
-get_rand_value(Distribution &&dist, Generator &&gen)
-{
-    return dist(gen);
-}
+#include "core/test/utils/value_generator.hpp"
 
 
-template <typename ValueType, typename Distribution, typename Generator>
-typename std::enable_if<is_complex_s<ValueType>::value, ValueType>::type
-get_rand_value(Distribution &&dist, Generator &&gen)
-{
-    return ValueType(dist(gen), dist(gen));
-}
-
-
-}  // namespace detail
+namespace gko {
+namespace test {
 
 
 /**
diff --git a/core/test/utils/matrix_utils.hpp b/core/test/utils/matrix_utils.hpp
new file mode 100644
index 00000000000..c8b269d493f
--- /dev/null
+++ b/core/test/utils/matrix_utils.hpp
@@ -0,0 +1,84 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_TEST_UTILS_MATRIX_UTILS_HPP_
+#define GKO_CORE_TEST_UTILS_MATRIX_UTILS_HPP_
+
+
+#include <ginkgo/core/matrix/dense.hpp>
+
+#include "core/test/utils/value_generator.hpp"
+
+
+namespace gko {
+namespace test {
+
+
+template <typename ValueType>
+void make_symmetric(matrix::Dense<ValueType> *mtx)
+{
+    assert(mtx->get_executor() == mtx->get_executor()->get_master());
+    for (size_type i = 0; i < mtx->get_size()[0]; ++i) {
+        for (size_type j = i + 1; j < mtx->get_size()[1]; ++j) {
+            mtx->at(i, j) = mtx->at(j, i);  // upper entry := lower entry
+        }
+    }
+}
+
+
+template <typename ValueType>
+void make_diag_dominant(matrix::Dense<ValueType> *mtx)
+{
+    assert(mtx->get_executor() == mtx->get_executor()->get_master());
+    using std::abs;
+    for (size_type i = 0; i < mtx->get_size()[0]; ++i) {
+        auto sum = gko::zero<ValueType>();
+        for (size_type j = 0; j < mtx->get_size()[1]; ++j) {
+            sum += abs(mtx->at(i, j));  // j == i is part of the sum as well
+        }
+        mtx->at(i, i) = sum;  // diagonal := sum of |entry| over row i
+    }
+}
+
+
+template <typename ValueType>
+void make_spd(matrix::Dense<ValueType> *mtx)
+{
+    make_symmetric(mtx);      // copy the lower triangle into the upper one
+    make_diag_dominant(mtx);  // then set each diagonal to its row's abs-sum
+}
+
+
+}  // namespace test
+}  // namespace gko
+
+#endif  // GKO_CORE_TEST_UTILS_MATRIX_UTILS_HPP_
diff --git a/core/test/utils/value_generator.hpp b/core/test/utils/value_generator.hpp
new file mode 100644
index 00000000000..8b82ea63d0a
--- /dev/null
+++ b/core/test/utils/value_generator.hpp
@@ -0,0 +1,69 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_TEST_UTILS_VALUE_GENERATOR_HPP_
+#define GKO_CORE_TEST_UTILS_VALUE_GENERATOR_HPP_
+
+
+#include <random>
+#include <type_traits>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+namespace test {
+namespace detail {
+
+
+template <typename ValueType, typename Distribution, typename Generator>
+typename std::enable_if<!is_complex_s<ValueType>::value, ValueType>::type
+get_rand_value(Distribution &&dist, Generator &&gen)
+{
+    return dist(gen);  // non-complex overload: a single draw
+}
+
+
+template <typename ValueType, typename Distribution, typename Generator>
+typename std::enable_if<is_complex_s<ValueType>::value, ValueType>::type
+get_rand_value(Distribution &&dist, Generator &&gen)
+{
+    return ValueType(dist(gen), dist(gen));  // complex overload: two draws
+}
+
+
+}  // namespace detail
+}  // namespace test
+}  // namespace gko
+
+#endif  // GKO_CORE_TEST_UTILS_VALUE_GENERATOR_HPP_
diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
index 10d2b97a8bd..2814f9177c0 100644
--- a/cuda/test/multigrid/amgx_pgm_kernels.cpp
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -58,24 +58,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace {
 
 
-template <typename Array, typename ValueDistribution, typename Engine>
-Array generate_random_array(gko::size_type num, ValueDistribution &&value_dist,
-                            Engine &&engine,
-                            std::shared_ptr<const gko::Executor> exec)
-{
-    using value_type = typename Array::value_type;
-    Array array_host(exec->get_master(), num);
-    auto val = array_host.get_data();
-    for (int i = 0; i < num; i++) {
-        val[i] =
-            gko::test::detail::get_rand_value<value_type>(value_dist, engine);
-    }
-    Array array(exec);
-    array = array_host;
-    return array;
-}
-
-
 class AmgxPgm : public ::testing::Test {
 protected:
     using value_type = gko::default_precision;
@@ -83,6 +65,7 @@ class AmgxPgm : public ::testing::Test {
     using Mtx = gko::matrix::Dense<>;
     using Csr = gko::matrix::Csr<value_type, index_type>;
     using Diag = gko::matrix::Diagonal<value_type>;
+
     AmgxPgm() : rand_engine(30) {}
 
     void SetUp()
@@ -110,7 +93,7 @@ class AmgxPgm : public ::testing::Test {
     gko::Array<index_type> gen_array(gko::size_type num, index_type min_val,
                                      index_type max_val)
     {
-        return generate_random_array<gko::Array<index_type>>(
+        return gko::test::generate_random_array<index_type>(
             num, std::uniform_int_distribution<>(min_val, max_val), rand_engine,
             ref);
     }
@@ -132,7 +115,7 @@ class AmgxPgm : public ::testing::Test {
         weight->convert_to(weight_csr.get());
         weight_diag = weight_csr->extract_diagonal();
         auto system_dense = gen_mtx(m, m);
-        make_spd(system_dense.get());
+        gko::test::make_spd(system_dense.get());
         system_mtx = Csr::create(ref);
         system_dense->convert_to(system_mtx.get());
 
@@ -154,48 +137,12 @@ class AmgxPgm : public ::testing::Test {
         d_system_mtx->copy_from(system_mtx.get());
     }
 
-    void make_symetric(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = mtx->at(j, i);
-            }
-        }
-    }
-
-    // only for real value
-    void make_absoulte(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = abs(mtx->at(i, j));
-            }
-        }
-    }
-
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
-    void make_spd(Mtx *mtx)
-    {
-        make_symetric(mtx);
-        make_diag_dominant(mtx);
-    }
-
     void make_weight(Mtx *mtx)
     {
-        make_symetric(mtx);
-        make_absoulte(mtx);
-        make_diag_dominant(mtx);
+        gko::test::make_symmetric(mtx);
+        // only works for real-valued cases
+        mtx->compute_absolute_inplace();
+        gko::test::make_diag_dominant(mtx);
     }
 
     std::shared_ptr<gko::ReferenceExecutor> ref;
diff --git a/cuda/test/solver/bicg_kernels.cpp b/cuda/test/solver/bicg_kernels.cpp
index a825ae5aa9b..d82cc0a28a0 100644
--- a/cuda/test/solver/bicg_kernels.cpp
+++ b/cuda/test/solver/bicg_kernels.cpp
@@ -152,33 +152,6 @@ class Bicg : public ::testing::Test {
         *d_stop_status = *stop_status;
     }
 
-    void make_symetric(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = mtx->at(j, i);
-            }
-        }
-    }
-
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
-    void make_spd(Mtx *mtx)
-    {
-        make_symetric(mtx);
-        make_diag_dominant(mtx);
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
 
@@ -285,7 +258,7 @@ TEST_F(Bicg, CudaBicgStep2IsEquivalentToRef)
 TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    make_spd(mtx.get());
+    gko::test::make_spd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(cuda);
diff --git a/cuda/test/solver/bicgstab_kernels.cpp b/cuda/test/solver/bicgstab_kernels.cpp
index 7808226870c..b1e0ca85d1b 100644
--- a/cuda/test/solver/bicgstab_kernels.cpp
+++ b/cuda/test/solver/bicgstab_kernels.cpp
@@ -69,7 +69,7 @@ class Bicgstab : public ::testing::Test {
         cuda = gko::CudaExecutor::create(0, ref);
 
         mtx = gen_mtx(123, 123);
-        make_diag_dominant(mtx.get());
+        gko::test::make_diag_dominant(mtx.get());
         d_mtx = Mtx::create(cuda);
         d_mtx->copy_from(mtx.get());
 
@@ -171,18 +171,6 @@ class Bicgstab : public ::testing::Test {
             *stop_status;  // copy_from is not a public member function of Array
     }
 
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
 
diff --git a/cuda/test/solver/cg_kernels.cpp b/cuda/test/solver/cg_kernels.cpp
index d8a98a83805..8333fc7d0a8 100644
--- a/cuda/test/solver/cg_kernels.cpp
+++ b/cuda/test/solver/cg_kernels.cpp
@@ -123,33 +123,6 @@ class Cg : public ::testing::Test {
         *d_stop_status = *stop_status;
     }
 
-    void make_symetric(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = mtx->at(j, i);
-            }
-        }
-    }
-
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
-    void make_spd(Mtx *mtx)
-    {
-        make_symetric(mtx);
-        make_diag_dominant(mtx);
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
 
@@ -234,7 +207,7 @@ TEST_F(Cg, CudaCgStep2IsEquivalentToRef)
 TEST_F(Cg, ApplyIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    make_spd(mtx.get());
+    gko::test::make_spd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(cuda);
diff --git a/cuda/test/solver/cgs_kernels.cpp b/cuda/test/solver/cgs_kernels.cpp
index 543f580cf53..f8cb89d55bb 100644
--- a/cuda/test/solver/cgs_kernels.cpp
+++ b/cuda/test/solver/cgs_kernels.cpp
@@ -68,7 +68,7 @@ class Cgs : public ::testing::Test {
         cuda = gko::CudaExecutor::create(0, ref);
 
         mtx = gen_mtx(123, 123);
-        make_diag_dominant(mtx.get());
+        gko::test::make_diag_dominant(mtx.get());
         d_mtx = Mtx::create(cuda);
         d_mtx->copy_from(mtx.get());
         cuda_cgs_factory =
@@ -166,18 +166,6 @@ class Cgs : public ::testing::Test {
         *d_stop_status = *stop_status;
     }
 
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
 
diff --git a/cuda/test/solver/fcg_kernels.cpp b/cuda/test/solver/fcg_kernels.cpp
index 85b0b3b7017..b2832945666 100644
--- a/cuda/test/solver/fcg_kernels.cpp
+++ b/cuda/test/solver/fcg_kernels.cpp
@@ -131,33 +131,6 @@ class Fcg : public ::testing::Test {
         *d_stop_status = *stop_status;
     }
 
-    void make_symetric(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = mtx->at(j, i);
-            }
-        }
-    }
-
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
-    void make_spd(Mtx *mtx)
-    {
-        make_symetric(mtx);
-        make_diag_dominant(mtx);
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
 
@@ -247,7 +220,7 @@ TEST_F(Fcg, CudaFcgStep2IsEquivalentToRef)
 TEST_F(Fcg, ApplyIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    make_spd(mtx.get());
+    gko::test::make_spd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(cuda);
diff --git a/cuda/test/solver/idr_kernels.cpp b/cuda/test/solver/idr_kernels.cpp
index 04ea559b6cc..9aa925187a4 100644
--- a/cuda/test/solver/idr_kernels.cpp
+++ b/cuda/test/solver/idr_kernels.cpp
@@ -160,18 +160,6 @@ class Idr : public ::testing::Test {
             *stop_status;  // copy_from is not a public member function of Array
     }
 
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
 
diff --git a/hip/test/multigrid/CMakeLists.txt b/hip/test/multigrid/CMakeLists.txt
index 481c2cc1bf2..8fe8bbeba48 100644
--- a/hip/test/multigrid/CMakeLists.txt
+++ b/hip/test/multigrid/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_create_hip_test_special_linkage(amgx_pgm_kernels)
+ginkgo_create_test(amgx_pgm_kernels)
diff --git a/hip/test/multigrid/amgx_pgm_kernels.cpp b/hip/test/multigrid/amgx_pgm_kernels.cpp
index 71acf6ed06a..39546df6cff 100644
--- a/hip/test/multigrid/amgx_pgm_kernels.cpp
+++ b/hip/test/multigrid/amgx_pgm_kernels.cpp
@@ -58,24 +58,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace {
 
 
-template <typename Array, typename ValueDistribution, typename Engine>
-Array generate_random_array(gko::size_type num, ValueDistribution &&value_dist,
-                            Engine &&engine,
-                            std::shared_ptr<const gko::Executor> exec)
-{
-    using value_type = typename Array::value_type;
-    Array array_host(exec->get_master(), num);
-    auto val = array_host.get_data();
-    for (int i = 0; i < num; i++) {
-        val[i] =
-            gko::test::detail::get_rand_value<value_type>(value_dist, engine);
-    }
-    Array array(exec);
-    array = array_host;
-    return array;
-}
-
-
 class AmgxPgm : public ::testing::Test {
 protected:
     using value_type = gko::default_precision;
@@ -110,7 +92,7 @@ class AmgxPgm : public ::testing::Test {
     gko::Array<index_type> gen_array(gko::size_type num, index_type min_val,
                                      index_type max_val)
     {
-        return generate_random_array<gko::Array<index_type>>(
+        return gko::test::generate_random_array<index_type>(
             num, std::uniform_int_distribution<>(min_val, max_val), rand_engine,
             ref);
     }
@@ -132,7 +114,7 @@ class AmgxPgm : public ::testing::Test {
         weight->convert_to(weight_csr.get());
         weight_diag = weight_csr->extract_diagonal();
         auto system_dense = gen_mtx(m, m);
-        make_spd(system_dense.get());
+        gko::test::make_spd(system_dense.get());
         system_mtx = Csr::create(ref);
         system_dense->convert_to(system_mtx.get());
 
@@ -154,48 +136,12 @@ class AmgxPgm : public ::testing::Test {
         d_system_mtx->copy_from(system_mtx.get());
     }
 
-    void make_symetric(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = mtx->at(j, i);
-            }
-        }
-    }
-
-    // only for real value
-    void make_absoulte(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = abs(mtx->at(i, j));
-            }
-        }
-    }
-
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
-    void make_spd(Mtx *mtx)
-    {
-        make_symetric(mtx);
-        make_diag_dominant(mtx);
-    }
-
     void make_weight(Mtx *mtx)
     {
-        make_symetric(mtx);
-        make_absoulte(mtx);
-        make_diag_dominant(mtx);
+        gko::test::make_symmetric(mtx);
+        // only works for real-valued cases
+        mtx->compute_absolute_inplace();
+        gko::test::make_diag_dominant(mtx);
     }
 
     std::shared_ptr<gko::ReferenceExecutor> ref;
diff --git a/hip/test/solver/bicg_kernels.cpp b/hip/test/solver/bicg_kernels.cpp
index 52073945050..c29bd74374c 100644
--- a/hip/test/solver/bicg_kernels.cpp
+++ b/hip/test/solver/bicg_kernels.cpp
@@ -152,33 +152,6 @@ class Bicg : public ::testing::Test {
         *d_stop_status = *stop_status;
     }
 
-    void make_symetric(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = mtx->at(j, i);
-            }
-        }
-    }
-
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
-    void make_spd(Mtx *mtx)
-    {
-        make_symetric(mtx);
-        make_diag_dominant(mtx);
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::HipExecutor> hip;
 
@@ -285,7 +258,7 @@ TEST_F(Bicg, HipBicgStep2IsEquivalentToRef)
 TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    make_spd(mtx.get());
+    gko::test::make_spd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(hip);
diff --git a/hip/test/solver/bicgstab_kernels.cpp b/hip/test/solver/bicgstab_kernels.cpp
index bcf4191a426..5a995656306 100644
--- a/hip/test/solver/bicgstab_kernels.cpp
+++ b/hip/test/solver/bicgstab_kernels.cpp
@@ -69,7 +69,7 @@ class Bicgstab : public ::testing::Test {
         hip = gko::HipExecutor::create(0, ref);
 
         mtx = gen_mtx(123, 123);
-        make_diag_dominant(mtx.get());
+        gko::test::make_diag_dominant(mtx.get());
         d_mtx = Mtx::create(hip);
         d_mtx->copy_from(mtx.get());
 
@@ -171,18 +171,6 @@ class Bicgstab : public ::testing::Test {
             *stop_status;  // copy_from is not a public member function of Array
     }
 
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::HipExecutor> hip;
 
diff --git a/hip/test/solver/cg_kernels.cpp b/hip/test/solver/cg_kernels.cpp
index a46d698cc22..cd4000171c0 100644
--- a/hip/test/solver/cg_kernels.cpp
+++ b/hip/test/solver/cg_kernels.cpp
@@ -123,33 +123,6 @@ class Cg : public ::testing::Test {
         *d_stop_status = *stop_status;
     }
 
-    void make_symetric(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = mtx->at(j, i);
-            }
-        }
-    }
-
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
-    void make_spd(Mtx *mtx)
-    {
-        make_symetric(mtx);
-        make_diag_dominant(mtx);
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::HipExecutor> hip;
 
@@ -234,7 +207,7 @@ TEST_F(Cg, HipCgStep2IsEquivalentToRef)
 TEST_F(Cg, ApplyIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    make_spd(mtx.get());
+    gko::test::make_spd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(hip);
diff --git a/hip/test/solver/cgs_kernels.cpp b/hip/test/solver/cgs_kernels.cpp
index 861f9498780..2277d4417ba 100644
--- a/hip/test/solver/cgs_kernels.cpp
+++ b/hip/test/solver/cgs_kernels.cpp
@@ -68,7 +68,7 @@ class Cgs : public ::testing::Test {
         hip = gko::HipExecutor::create(0, ref);
 
         mtx = gen_mtx(123, 123);
-        make_diag_dominant(mtx.get());
+        gko::test::make_diag_dominant(mtx.get());
         d_mtx = Mtx::create(hip);
         d_mtx->copy_from(mtx.get());
         hip_cgs_factory =
@@ -166,18 +166,6 @@ class Cgs : public ::testing::Test {
         *d_stop_status = *stop_status;
     }
 
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::HipExecutor> hip;
 
diff --git a/hip/test/solver/fcg_kernels.cpp b/hip/test/solver/fcg_kernels.cpp
index 663c811227c..ca86775ee94 100644
--- a/hip/test/solver/fcg_kernels.cpp
+++ b/hip/test/solver/fcg_kernels.cpp
@@ -131,33 +131,6 @@ class Fcg : public ::testing::Test {
         *d_stop_status = *stop_status;
     }
 
-    void make_symetric(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = mtx->at(j, i);
-            }
-        }
-    }
-
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
-    void make_spd(Mtx *mtx)
-    {
-        make_symetric(mtx);
-        make_diag_dominant(mtx);
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::HipExecutor> hip;
 
@@ -247,7 +220,7 @@ TEST_F(Fcg, HipFcgStep2IsEquivalentToRef)
 TEST_F(Fcg, ApplyIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    make_spd(mtx.get());
+    gko::test::make_spd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(hip);
diff --git a/hip/test/solver/idr_kernels.cpp b/hip/test/solver/idr_kernels.cpp
index c59ed90c9a7..de7f5a14125 100644
--- a/hip/test/solver/idr_kernels.cpp
+++ b/hip/test/solver/idr_kernels.cpp
@@ -160,18 +160,6 @@ class Idr : public ::testing::Test {
             *stop_status;  // copy_from is not a public member function of Array
     }
 
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::HipExecutor> hip;
 
diff --git a/omp/multigrid/amgx_pgm_kernels.cpp b/omp/multigrid/amgx_pgm_kernels.cpp
index 9c4efefebe6..3ce1b087fd8 100644
--- a/omp/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/multigrid/amgx_pgm_kernels.cpp
@@ -75,12 +75,12 @@ void match_edge(std::shared_ptr<const OmpExecutor> exec,
 #pragma omp parallel for
     for (size_type i = 0; i < agg.get_num_elems(); i++) {
         if (agg_vals[i] == -1) {
-            size_type neighbor = strongest_neighbor_vals[i];
-            if (neighbor != -1 && strongest_neighbor_vals[neighbor] == i) {
+            auto neighbor = strongest_neighbor_vals[i];
+            if (neighbor != -1 && strongest_neighbor_vals[neighbor] == i &&
+                i < neighbor) {
                 // Use the smaller index as agg point
-                auto group = min(i, neighbor);
-                agg_vals[i] = group;
-                agg_vals[neighbor] = group;
+                agg_vals[i] = i;
+                agg_vals[neighbor] = i;
             }
         }
     }
@@ -152,7 +152,7 @@ void find_strongest_neighbor(
                     continue;
                 }
                 auto weight =
-                    vals[idx] / max(abs(diag_vals[col]), abs(diag_vals[col]));
+                    vals[idx] / max(abs(diag_vals[row]), abs(diag_vals[col]));
                 if (agg.get_const_data()[col] == -1 &&
                     (weight > max_weight_unagg ||
                      (weight == max_weight_unagg && col > strongest_unagg))) {
@@ -200,7 +200,6 @@ void assign_to_exist_agg(std::shared_ptr<const OmpExecutor> exec,
                        ? intermediate_agg.get_data()
                        : agg.get_data();
     const auto diag_vals = diag->get_const_values();
-#pragma omp parallel for
     for (IndexType row = 0; row < agg.get_num_elems(); row++) {
         if (agg_const_val[row] != -1) {
             continue;
diff --git a/omp/test/multigrid/amgx_pgm_kernels.cpp b/omp/test/multigrid/amgx_pgm_kernels.cpp
index 9a27ef3f4fc..e0b063de0bd 100644
--- a/omp/test/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/test/multigrid/amgx_pgm_kernels.cpp
@@ -57,24 +57,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace {
 
 
-template <typename Array, typename ValueDistribution, typename Engine>
-Array generate_random_array(gko::size_type num, ValueDistribution &&value_dist,
-                            Engine &&engine,
-                            std::shared_ptr<const gko::Executor> exec)
-{
-    using value_type = typename Array::value_type;
-    Array array_host(exec->get_master(), num);
-    auto val = array_host.get_data();
-    for (int i = 0; i < num; i++) {
-        val[i] =
-            gko::test::detail::get_rand_value<value_type>(value_dist, engine);
-    }
-    Array array(exec);
-    array = array_host;
-    return array;
-}
-
-
 class AmgxPgm : public ::testing::Test {
 protected:
     using value_type = gko::default_precision;
@@ -101,7 +83,7 @@ class AmgxPgm : public ::testing::Test {
     gko::Array<index_type> gen_array(gko::size_type num, index_type min_val,
                                      index_type max_val)
     {
-        return generate_random_array<gko::Array<index_type>>(
+        return gko::test::generate_random_array<index_type>(
             num, std::uniform_int_distribution<>(min_val, max_val), rand_engine,
             ref);
     }
@@ -122,6 +104,10 @@ class AmgxPgm : public ::testing::Test {
         weight_csr = Csr::create(ref);
         weight->convert_to(weight_csr.get());
         weight_diag = weight_csr->extract_diagonal();
+        auto system_dense = gen_mtx(m, m);
+        gko::test::make_spd(system_dense.get());
+        system_mtx = Csr::create(ref);
+        system_dense->convert_to(system_mtx.get());
 
         d_agg.set_executor(omp);
         d_unfinished_agg.set_executor(omp);
@@ -130,6 +116,7 @@ class AmgxPgm : public ::testing::Test {
         d_fine_vector = Mtx::create(omp);
         d_weight_csr = Csr::create(omp);
         d_weight_diag = Diag::create(omp);
+        d_system_mtx = Csr::create(omp);
         d_agg = agg;
         d_unfinished_agg = unfinished_agg;
         d_strongest_neighbor = strongest_neighbor;
@@ -137,50 +124,15 @@ class AmgxPgm : public ::testing::Test {
         d_fine_vector->copy_from(fine_vector.get());
         d_weight_csr->copy_from(weight_csr.get());
         d_weight_diag->copy_from(weight_diag.get());
-    }
-
-    void make_symetric(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = mtx->at(j, i);
-            }
-        }
-    }
-
-    // only for real value
-    void make_absoulte(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = abs(mtx->at(i, j));
-            }
-        }
-    }
-
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
-    void make_spd(Mtx *mtx)
-    {
-        make_symetric(mtx);
-        make_diag_dominant(mtx);
+        d_system_mtx->copy_from(system_mtx.get());
     }
 
     void make_weight(Mtx *mtx)
     {
-        make_symetric(mtx);
-        make_absoulte(mtx);
-        make_diag_dominant(mtx);
+        gko::test::make_symmetric(mtx);
+        // this only works for real-valued matrices
+        mtx->compute_absolute_inplace();
+        gko::test::make_diag_dominant(mtx);
     }
 
     std::shared_ptr<gko::ReferenceExecutor> ref;
@@ -200,11 +152,13 @@ class AmgxPgm : public ::testing::Test {
     std::unique_ptr<Mtx> fine_vector;
     std::unique_ptr<Diag> weight_diag;
     std::unique_ptr<Csr> weight_csr;
+    std::shared_ptr<Csr> system_mtx;
 
     std::unique_ptr<Mtx> d_coarse_vector;
     std::unique_ptr<Mtx> d_fine_vector;
     std::unique_ptr<Diag> d_weight_diag;
     std::unique_ptr<Csr> d_weight_csr;
+    std::shared_ptr<Csr> d_system_mtx;
 
     gko::size_type n;
 };
@@ -320,4 +274,26 @@ TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
 }
 
 
+TEST_F(AmgxPgm, GenerateMgLevelIsEquivalentToRef)
+{
+    initialize_data();
+    auto mg_level_factory = gko::multigrid::AmgxPgm<double, int>::build()
+                                .with_deterministic(true)
+                                .on(ref);
+    auto d_mg_level_factory = gko::multigrid::AmgxPgm<double, int>::build()
+                                  .with_deterministic(true)
+                                  .on(omp);
+
+    auto mg_level = mg_level_factory->generate(system_mtx);
+    auto d_mg_level = d_mg_level_factory->generate(d_system_mtx);
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Csr>(d_mg_level->get_restrict_op()),
+                        gko::as<Csr>(mg_level->get_restrict_op()), 1e-14);
+    GKO_ASSERT_MTX_NEAR(gko::as<Csr>(d_mg_level->get_coarse_op()),
+                        gko::as<Csr>(mg_level->get_coarse_op()), 1e-14);
+    GKO_ASSERT_MTX_NEAR(gko::as<Csr>(d_mg_level->get_prolong_op()),
+                        gko::as<Csr>(mg_level->get_prolong_op()), 1e-14);
+}
+
+
 }  // namespace
diff --git a/omp/test/solver/bicg_kernels.cpp b/omp/test/solver/bicg_kernels.cpp
index fc51d1847ae..2018fa5f788 100644
--- a/omp/test/solver/bicg_kernels.cpp
+++ b/omp/test/solver/bicg_kernels.cpp
@@ -134,33 +134,6 @@ class Bicg : public ::testing::Test {
         *d_stop_status = *stop_status;
     }
 
-    void make_symetric(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = mtx->at(j, i);
-            }
-        }
-    }
-
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
-    void make_spd(Mtx *mtx)
-    {
-        make_symetric(mtx);
-        make_diag_dominant(mtx);
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::OmpExecutor> omp;
 
@@ -265,7 +238,7 @@ TEST_F(Bicg, OmpBicgStep2IsEquivalentToRef)
 TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    make_spd(mtx.get());
+    gko::test::make_spd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(omp);
diff --git a/omp/test/solver/bicgstab_kernels.cpp b/omp/test/solver/bicgstab_kernels.cpp
index 74085b6e7a7..9e51d841e64 100644
--- a/omp/test/solver/bicgstab_kernels.cpp
+++ b/omp/test/solver/bicgstab_kernels.cpp
@@ -68,7 +68,7 @@ class Bicgstab : public ::testing::Test {
         omp = gko::OmpExecutor::create();
 
         mtx = gen_mtx(123, 123);
-        make_diag_dominant(mtx.get());
+        gko::test::make_diag_dominant(mtx.get());
         d_mtx = Mtx::create(omp);
         d_mtx->copy_from(mtx.get());
         omp_bicgstab_factory =
@@ -170,18 +170,6 @@ class Bicgstab : public ::testing::Test {
             *stop_status;  // copy_from is not a public member function of Array
     }
 
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::OmpExecutor> omp;
 
diff --git a/omp/test/solver/cg_kernels.cpp b/omp/test/solver/cg_kernels.cpp
index e71fad019b5..1e86956602a 100644
--- a/omp/test/solver/cg_kernels.cpp
+++ b/omp/test/solver/cg_kernels.cpp
@@ -122,33 +122,6 @@ class Cg : public ::testing::Test {
         *d_stop_status = *stop_status;
     }
 
-    void make_symetric(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = mtx->at(j, i);
-            }
-        }
-    }
-
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
-    void make_spd(Mtx *mtx)
-    {
-        make_symetric(mtx);
-        make_diag_dominant(mtx);
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::OmpExecutor> omp;
 
@@ -233,7 +206,7 @@ TEST_F(Cg, OmpCgStep2IsEquivalentToRef)
 TEST_F(Cg, ApplyIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    make_spd(mtx.get());
+    gko::test::make_spd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(omp);
diff --git a/omp/test/solver/cgs_kernels.cpp b/omp/test/solver/cgs_kernels.cpp
index f05d0604919..d473f830bdc 100644
--- a/omp/test/solver/cgs_kernels.cpp
+++ b/omp/test/solver/cgs_kernels.cpp
@@ -67,7 +67,7 @@ class Cgs : public ::testing::Test {
         omp = gko::OmpExecutor::create();
 
         mtx = gen_mtx(123, 123);
-        make_diag_dominant(mtx.get());
+        gko::test::make_diag_dominant(mtx.get());
         d_mtx = Mtx::create(omp);
         d_mtx->copy_from(mtx.get());
         omp_cgs_factory =
@@ -165,18 +165,6 @@ class Cgs : public ::testing::Test {
         *d_stop_status = *stop_status;
     }
 
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::OmpExecutor> omp;
 
diff --git a/omp/test/solver/fcg_kernels.cpp b/omp/test/solver/fcg_kernels.cpp
index bcf6c57d288..632a2c80808 100644
--- a/omp/test/solver/fcg_kernels.cpp
+++ b/omp/test/solver/fcg_kernels.cpp
@@ -130,33 +130,6 @@ class Fcg : public ::testing::Test {
         *d_stop_status = *stop_status;
     }
 
-    void make_symetric(Mtx *mtx)
-    {
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            for (int j = i + 1; j < mtx->get_size()[1]; ++j) {
-                mtx->at(i, j) = mtx->at(j, i);
-            }
-        }
-    }
-
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
-    void make_spd(Mtx *mtx)
-    {
-        make_symetric(mtx);
-        make_diag_dominant(mtx);
-    }
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::OmpExecutor> omp;
 
@@ -246,7 +219,7 @@ TEST_F(Fcg, OmpFcgStep2IsEquivalentToRef)
 TEST_F(Fcg, ApplyIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    make_spd(mtx.get());
+    gko::test::make_spd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(omp);
diff --git a/omp/test/solver/gmres_kernels.cpp b/omp/test/solver/gmres_kernels.cpp
index 58b369e748e..66ebe091ce8 100644
--- a/omp/test/solver/gmres_kernels.cpp
+++ b/omp/test/solver/gmres_kernels.cpp
@@ -111,18 +111,6 @@ class Gmres : public ::testing::Test {
             std::normal_distribution<ValueType>(-1.0, 1.0), rand_engine, ref);
     }
 
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum;
-        }
-    }
-
     void initialize_data()
     {
 #ifdef GINKGO_FAST_TESTS
diff --git a/omp/test/solver/idr_kernels.cpp b/omp/test/solver/idr_kernels.cpp
index f26bf4dcf4b..794d68cf3f2 100644
--- a/omp/test/solver/idr_kernels.cpp
+++ b/omp/test/solver/idr_kernels.cpp
@@ -166,18 +166,6 @@ class Idr : public ::testing::Test {
             *stop_status;  // copy_from is not a public member function of Array
     }
 
-    void make_diag_dominant(Mtx *mtx)
-    {
-        using std::abs;
-        for (int i = 0; i < mtx->get_size()[0]; ++i) {
-            auto sum = gko::zero<Mtx::value_type>();
-            for (int j = 0; j < mtx->get_size()[1]; ++j) {
-                sum += abs(mtx->at(i, j));
-            }
-            mtx->at(i, i) = sum / 4;
-        }
-    }
-
     std::shared_ptr<const gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::OmpExecutor> omp;
 
diff --git a/reference/multigrid/amgx_pgm_kernels.cpp b/reference/multigrid/amgx_pgm_kernels.cpp
index 64788db9805..d5b1ddbebb1 100644
--- a/reference/multigrid/amgx_pgm_kernels.cpp
+++ b/reference/multigrid/amgx_pgm_kernels.cpp
@@ -71,10 +71,12 @@ void match_edge(std::shared_ptr<const ReferenceExecutor> exec,
     for (size_type i = 0; i < agg.get_num_elems(); i++) {
         if (agg_vals[i] == -1) {
             auto neighbor = strongest_neighbor_vals[i];
-            if (neighbor != -1 && strongest_neighbor_vals[neighbor] == i) {
+            // i < neighbor always holds when neighbor is not -1
+            if (neighbor != -1 && strongest_neighbor_vals[neighbor] == i &&
+                i < neighbor) {
+                // Use the smaller index as agg point
                 agg_vals[i] = i;
                 agg_vals[neighbor] = i;
-                // Use the smaller index as agg point
             }
         }
     }

From c93fdfddb21ea29f245dba58e3a19b82d425821f Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 30 Mar 2021 23:50:17 +0800
Subject: [PATCH 13/16] collect mtx/array utils and add doc and typed_test -
 symmetric/hermitian/diag_dominant/hpd - array generator - add type test and
 documents for all in core/test/utils

Reminder to myself: class members are initialized in declaration order
---
 core/test/utils.hpp                       |   1 -
 core/test/utils/CMakeLists.txt            |   3 +
 core/test/utils/array_generator.hpp       |  14 ++
 core/test/utils/array_generator_test.cpp  | 125 +++++++++++++
 core/test/utils/matrix_generator.hpp      |  13 +-
 core/test/utils/matrix_generator_test.cpp | 207 +++++++++++++++-------
 core/test/utils/matrix_utils.hpp          | 100 +++++++++--
 core/test/utils/matrix_utils_test.cpp     | 201 +++++++++++++++++++++
 core/test/utils/unsort_matrix_test.cpp    |  53 +++---
 core/test/utils/value_generator.hpp       |  30 +++-
 core/test/utils/value_generator_test.cpp  | 118 ++++++++++++
 cuda/test/multigrid/amgx_pgm_kernels.cpp  |   2 +-
 cuda/test/solver/bicg_kernels.cpp         |   2 +-
 cuda/test/solver/cg_kernels.cpp           |   2 +-
 cuda/test/solver/fcg_kernels.cpp          |   2 +-
 hip/test/multigrid/amgx_pgm_kernels.cpp   |   2 +-
 hip/test/solver/bicg_kernels.cpp          |   2 +-
 hip/test/solver/cg_kernels.cpp            |   2 +-
 hip/test/solver/fcg_kernels.cpp           |   2 +-
 omp/multigrid/amgx_pgm_kernels.cpp        |   1 +
 omp/test/multigrid/amgx_pgm_kernels.cpp   |   2 +-
 omp/test/solver/bicg_kernels.cpp          |   2 +-
 omp/test/solver/cg_kernels.cpp            |   2 +-
 omp/test/solver/fcg_kernels.cpp           |   2 +-
 24 files changed, 766 insertions(+), 124 deletions(-)
 create mode 100644 core/test/utils/array_generator_test.cpp
 create mode 100644 core/test/utils/matrix_utils_test.cpp
 create mode 100644 core/test/utils/value_generator_test.cpp

diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index 05bf216c8bb..f493b375831 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -66,7 +66,6 @@ using ValueTypes =
     ::testing::Types<float, double, std::complex<float>, std::complex<double>>;
 #endif
 
-
 using ComplexValueTypes =
 #if GINKGO_DPCPP_SINGLE_MODE
     ::testing::Types<std::complex<float>>;
diff --git a/core/test/utils/CMakeLists.txt b/core/test/utils/CMakeLists.txt
index 84e3d46958d..9b3e0e5e349 100644
--- a/core/test/utils/CMakeLists.txt
+++ b/core/test/utils/CMakeLists.txt
@@ -1,3 +1,6 @@
+ginkgo_create_test(array_generator_test)
 ginkgo_create_test(assertions_test)
 ginkgo_create_test(matrix_generator_test)
+ginkgo_create_test(matrix_utils_test)
 ginkgo_create_test(unsort_matrix_test)
+ginkgo_create_test(value_generator_test)
diff --git a/core/test/utils/array_generator.hpp b/core/test/utils/array_generator.hpp
index 1ad52c18e26..8dbdeacc9a7 100644
--- a/core/test/utils/array_generator.hpp
+++ b/core/test/utils/array_generator.hpp
@@ -46,6 +46,20 @@ namespace gko {
 namespace test {
 
 
+/**
+ * Generate a random array
+ *
+ * @tparam ValueType  value type of the array to generate
+ * @tparam ValueDistribution  type of value distribution
+ * @tparam Engine  type of random engine
+ *
+ * @param num  the number of elements of array
+ * @param value_dist  distribution of array values
+ * @param engine  a random engine
+ * @param exec  executor where the array should be allocated
+ *
+ * @return the generated random Array<ValueType>
+ */
 template <typename ValueType, typename ValueDistribution, typename Engine>
 Array<ValueType> generate_random_array(size_type num,
                                        ValueDistribution &&value_dist,
diff --git a/core/test/utils/array_generator_test.cpp b/core/test/utils/array_generator_test.cpp
new file mode 100644
index 00000000000..ba8c6651be9
--- /dev/null
+++ b/core/test/utils/array_generator_test.cpp
@@ -0,0 +1,125 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/test/utils/array_generator.hpp"
+
+
+#include <cmath>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include "core/test/utils.hpp"
+
+namespace {
+
+
+template <typename T>
+class ArrayGenerator : public ::testing::Test {
+protected:
+    using value_type = T;
+
+    ArrayGenerator() : exec(gko::ReferenceExecutor::create())
+    {
+        array = gko::test::generate_random_array<T>(
+            500, std::normal_distribution<gko::remove_complex<T>>(20.0, 5.0),
+            std::ranlux48(42), exec);
+    }
+
+    std::shared_ptr<const gko::Executor> exec;
+    gko::Array<T> array;
+
+    template <typename InputIterator, typename ValueType, typename Closure>
+    ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start,
+                             InputIterator sample_end, Closure closure_op)
+    {
+        using std::pow;
+        ValueType res = 0;
+        ValueType num_elems = 0;
+        while (sample_start != sample_end) {
+            auto tmp = *(sample_start++);
+            res += pow(closure_op(tmp) - c, n);
+            num_elems += 1;
+        }
+        return res / num_elems;
+    }
+
+    template <typename ValueType, typename InputIterator, typename Closure>
+    void check_average_and_deviation(
+        InputIterator sample_start, InputIterator sample_end,
+        gko::remove_complex<ValueType> average_ans,
+        gko::remove_complex<ValueType> deviation_ans, Closure closure_op)
+    {
+        auto average =
+            this->get_nth_moment(1, gko::zero<gko::remove_complex<ValueType>>(),
+                                 sample_start, sample_end, closure_op);
+        auto deviation = sqrt(this->get_nth_moment(2, average, sample_start,
+                                                   sample_end, closure_op));
+
+        // check that average & deviation are within 10% of the expected values
+        ASSERT_NEAR(average, average_ans, average_ans * 0.1);
+        ASSERT_NEAR(deviation, deviation_ans, deviation_ans * 0.1);
+    }
+};
+
+TYPED_TEST_SUITE(ArrayGenerator, gko::test::ValueTypes);
+
+
+TYPED_TEST(ArrayGenerator, OutputHasCorrectSize)
+{
+    ASSERT_EQ(this->array.get_num_elems(), 500);
+}
+
+
+TYPED_TEST(ArrayGenerator, OutputHasCorrectAverageAndDeviation)
+{
+    using std::sqrt;
+    using T = typename TestFixture::value_type;
+
+    // check the real part
+    this->template check_average_and_deviation<T>(
+        this->array.get_const_data(),
+        this->array.get_const_data() + this->array.get_num_elems(), 20.0, 5.0,
+        [](T &val) { return gko::real(val); });
+    // check the imag part when the type is complex
+    if (!std::is_same<T, gko::remove_complex<T>>::value) {
+        this->template check_average_and_deviation<T>(
+            this->array.get_const_data(),
+            this->array.get_const_data() + this->array.get_num_elems(), 20.0,
+            5.0, [](T &val) { return gko::imag(val); });
+    }
+}
+
+
+}  // namespace
diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp
index 04f98126c41..9a101f4b043 100644
--- a/core/test/utils/matrix_generator.hpp
+++ b/core/test/utils/matrix_generator.hpp
@@ -70,6 +70,8 @@ namespace test {
  * @param engine  a random engine
  * @param exec  executor where the matrix should be allocated
  * @param args  additional arguments for the matrix constructor
+ *
+ * @return the unique pointer of MatrixType
  */
 template <typename MatrixType = matrix::Dense<>, typename NonzeroDistribution,
           typename ValueDistribution, typename Engine, typename... MatrixArgs>
@@ -128,6 +130,8 @@ std::unique_ptr<MatrixType> generate_random_matrix(
  * @param engine  a random engine
  * @param exec  executor where the matrix should be allocated
  * @param args  additional arguments for the matrix constructor
+ *
+ * @return the unique pointer of MatrixType
  */
 template <typename MatrixType = matrix::Dense<>, typename NonzeroDistribution,
           typename Engine, typename... MatrixArgs>
@@ -189,6 +193,8 @@ std::unique_ptr<MatrixType> generate_random_sparsity_matrix(
  * @param engine  a random engine
  * @param exec  executor where the matrix should be allocated
  * @param args  additional arguments for the matrix constructor
+ *
+ * @return the unique pointer of MatrixType
  */
 template <typename MatrixType = matrix::Dense<>, typename NonzeroDistribution,
           typename ValueDistribution, typename Engine, typename... MatrixArgs>
@@ -274,6 +280,8 @@ std::unique_ptr<MatrixType> generate_random_triangular_matrix(
  * @param engine  a random engine
  * @param exec  executor where the matrix should be allocated
  * @param args  additional arguments for the matrix constructor
+ *
+ * @return the unique pointer of MatrixType
  */
 template <typename MatrixType = matrix::Dense<>, typename NonzeroDistribution,
           typename ValueDistribution, typename Engine, typename... MatrixArgs>
@@ -308,6 +316,8 @@ std::unique_ptr<MatrixType> generate_random_lower_triangular_matrix(
  * @param engine  a random engine
  * @param exec  executor where the matrix should be allocated
  * @param args  additional arguments for the matrix constructor
+ *
+ * @return the unique pointer of MatrixType
  */
 template <typename MatrixType = matrix::Dense<>, typename NonzeroDistribution,
           typename ValueDistribution, typename Engine, typename... MatrixArgs>
@@ -341,6 +351,8 @@ std::unique_ptr<MatrixType> generate_random_upper_triangular_matrix(
  * @param engine  a random engine
  * @param exec  executor where the matrix should be allocated
  * @param args  additional arguments for the matrix constructor
+ *
+ * @return the unique pointer of MatrixType
  */
 template <typename MatrixType = matrix::Dense<>, typename ValueDistribution,
           typename Engine, typename... MatrixArgs>
@@ -353,7 +365,6 @@ std::unique_ptr<MatrixType> generate_random_band_matrix(
     using index_type = typename MatrixType::index_type;
 
     matrix_data<value_type, index_type> data{gko::dim<2>{size, size}, {}};
-
     for (size_type row = 0; row < size; ++row) {
         for (size_type col = row < lower_bandwidth ? 0 : row - lower_bandwidth;
              col <= std::min(row + upper_bandwidth, size - 1); col++) {
diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp
index 1a85604a788..687560a202b 100644
--- a/core/test/utils/matrix_generator_test.cpp
+++ b/core/test/utils/matrix_generator_test.cpp
@@ -40,129 +40,204 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
+template <typename T>
 class MatrixGenerator : public ::testing::Test {
 protected:
+    using value_type = T;
+    using real_type = gko::remove_complex<T>;
+    using mtx_type = gko::matrix::Dense<T>;
+
     MatrixGenerator()
         : exec(gko::ReferenceExecutor::create()),
-          mtx(gko::test::generate_random_matrix(
-              500, 100, std::normal_distribution<double>(50, 5),
-              std::normal_distribution<double>(20.0, 5.0), std::ranlux48(42),
+          mtx(gko::test::generate_random_matrix<mtx_type>(
+              500, 100, std::normal_distribution<real_type>(50, 5),
+              std::normal_distribution<real_type>(20.0, 5.0), std::ranlux48(42),
               exec)),
-          l_mtx(gko::test::generate_random_lower_triangular_matrix(
-              4, 3, true, std::normal_distribution<double>(50, 5),
-              std::normal_distribution<double>(20.0, 5.0), std::ranlux48(42),
+          l_mtx(gko::test::generate_random_lower_triangular_matrix<mtx_type>(
+              4, 3, true, std::normal_distribution<real_type>(50, 5),
+              std::normal_distribution<real_type>(20.0, 5.0), std::ranlux48(42),
               exec)),
-          u_mtx(gko::test::generate_random_upper_triangular_matrix(
-              3, 4, true, std::normal_distribution<double>(50, 5),
-              std::normal_distribution<double>(20.0, 5.0), std::ranlux48(42),
+          u_mtx(gko::test::generate_random_upper_triangular_matrix<mtx_type>(
+              3, 4, true, std::normal_distribution<real_type>(50, 5),
+              std::normal_distribution<real_type>(20.0, 5.0), std::ranlux48(42),
+              exec)),
+          lower_bandwidth(2),
+          upper_bandwidth(3),
+          band_mtx(gko::test::generate_random_band_matrix<mtx_type>(
+              100, lower_bandwidth, upper_bandwidth,
+              std::normal_distribution<real_type>(20.0, 5.0), std::ranlux48(42),
               exec)),
           nnz_per_row_sample(500, 0),
-          values_sample(0)
+          values_sample(0),
+          band_values_sample(0)
     {
         // collect samples of nnz/row and values from the matrix
         for (int row = 0; row < mtx->get_size()[0]; ++row) {
             for (int col = 0; col < mtx->get_size()[1]; ++col) {
                 auto val = mtx->at(row, col);
-                if (val != 0.0) {
+                if (val != gko::zero<T>()) {
                     ++nnz_per_row_sample[row];
                     values_sample.push_back(val);
                 }
             }
         }
+
+        // collect samples of values from the band matrix
+        for (int row = 0; row < band_mtx->get_size()[0]; ++row) {
+            for (int col = 0; col < band_mtx->get_size()[1]; ++col) {
+                auto val = band_mtx->at(row, col);
+                if ((col - row <= upper_bandwidth) &&
+                    (row - col <= lower_bandwidth)) {
+                    band_values_sample.push_back(val);
+                }
+            }
+        }
     }
 
     std::shared_ptr<const gko::Executor> exec;
-    std::unique_ptr<gko::matrix::Dense<>> mtx;
-    std::unique_ptr<gko::matrix::Dense<>> l_mtx;
-    std::unique_ptr<gko::matrix::Dense<>> u_mtx;
+    int lower_bandwidth;
+    int upper_bandwidth;
+    std::unique_ptr<mtx_type> mtx;
+    std::unique_ptr<mtx_type> l_mtx;
+    std::unique_ptr<mtx_type> u_mtx;
+    std::unique_ptr<mtx_type> band_mtx;
     std::vector<int> nnz_per_row_sample;
-    std::vector<double> values_sample;
+    std::vector<T> values_sample;
+    std::vector<T> band_values_sample;
+
 
-    template <typename InputIterator, typename ValueType>
+    template <typename InputIterator, typename ValueType, typename Closure>
     ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start,
-                             InputIterator sample_end)
+                             InputIterator sample_end, Closure closure_op)
     {
         using std::pow;
         ValueType res = 0;
         ValueType num_elems = 0;
         while (sample_start != sample_end) {
             auto tmp = *(sample_start++);
-            res += pow(tmp - c, n);
+            res += pow(closure_op(tmp) - c, n);
             num_elems += 1;
         }
         return res / num_elems;
     }
+
+    // Assert mean and std deviation of closure_op(sample) are within 10% of ans
+    template <typename ValueType, typename InputIterator, typename Closure>
+    void check_average_and_deviation(
+        InputIterator sample_start, InputIterator sample_end,
+        gko::remove_complex<ValueType> average_ans,
+        gko::remove_complex<ValueType> deviation_ans, Closure closure_op)
+    {
+        auto average =
+            this->get_nth_moment(1, gko::zero<gko::remove_complex<ValueType>>(),
+                                 sample_start, sample_end, closure_op);
+        // qualify sqrt: <cmath> only guarantees the std:: overloads
+        auto deviation = std::sqrt(this->get_nth_moment(
+            2, average, sample_start, sample_end, closure_op));
+        ASSERT_NEAR(average, average_ans, average_ans * 0.1);
+        ASSERT_NEAR(deviation, deviation_ans, deviation_ans * 0.1);
+    }
 };
 
+TYPED_TEST_SUITE(MatrixGenerator, gko::test::ValueTypes);
+
 
-TEST_F(MatrixGenerator, OutputHasCorrectSize)
+TYPED_TEST(MatrixGenerator, OutputHasCorrectSize)
 {
-    ASSERT_EQ(mtx->get_size(), gko::dim<2>(500, 100));
+    ASSERT_EQ(this->mtx->get_size(), gko::dim<2>(500, 100));
 }
 
 
-TEST_F(MatrixGenerator, OutputHasCorrectNonzeroAverageAndDeviation)
+TYPED_TEST(MatrixGenerator, OutputHasCorrectNonzeroAverageAndDeviation)
 {
-    using std::sqrt;
-    auto average = get_nth_moment(1, 0.0, begin(nnz_per_row_sample),
-                                  end(nnz_per_row_sample));
-    auto deviation = sqrt(get_nth_moment(2, average, begin(nnz_per_row_sample),
-                                         end(nnz_per_row_sample)));
-
-    // check that average & deviation is within 10% of the required amount
-    ASSERT_NEAR(average, 50.0, 5);
-    ASSERT_NEAR(deviation, 5.0, 0.5);
+    using T = typename TestFixture::value_type;
+    // the nonzeros only needs to check the real part
+    this->template check_average_and_deviation<T>(
+        begin(this->nnz_per_row_sample), end(this->nnz_per_row_sample), 50.0,
+        5.0, [](T val) { return gko::real(val); });
 }
 
 
-TEST_F(MatrixGenerator, OutputHasCorrectValuesAverageAndDeviation)
+TYPED_TEST(MatrixGenerator, OutputHasCorrectValuesAverageAndDeviation)
 {
-    using std::sqrt;
-    auto average =
-        get_nth_moment(1, 0.0, begin(values_sample), end(values_sample));
-    auto deviation = sqrt(
-        get_nth_moment(2, average, begin(values_sample), end(values_sample)));
-
-    // check that average and deviation is within 10% of the required amount
-    ASSERT_NEAR(average, 20.0, 2.0);
-    ASSERT_NEAR(deviation, 5.0, 0.5);
+    using T = typename TestFixture::value_type;
+    // check the real part
+    this->template check_average_and_deviation<T>(
+        begin(this->values_sample), end(this->values_sample), 20.0, 5.0,
+        [](T &val) { return gko::real(val); });
+    // check the imag part when the type is complex
+    if (!std::is_same<T, gko::remove_complex<T>>::value) {
+        this->template check_average_and_deviation<T>(
+            begin(this->values_sample), end(this->values_sample), 20.0, 5.0,
+            [](T &val) { return gko::imag(val); });
+    }
 }
 
 
-TEST_F(MatrixGenerator, CanGenerateLowerTriangularMatrixWithDiagonalOnes)
+TYPED_TEST(MatrixGenerator, CanGenerateLowerTriangularMatrixWithDiagonalOnes)
 {
-    ASSERT_EQ(l_mtx->at(0, 0), 1.0);
-    ASSERT_EQ(l_mtx->at(0, 1), 0.0);
-    ASSERT_EQ(l_mtx->at(0, 2), 0.0);
-    ASSERT_NE(l_mtx->at(1, 0), 0.0);
-    ASSERT_EQ(l_mtx->at(1, 1), 1.0);
-    ASSERT_EQ(l_mtx->at(1, 2), 0.0);
-    ASSERT_NE(l_mtx->at(2, 0), 0.0);
-    ASSERT_NE(l_mtx->at(2, 1), 0.0);
-    ASSERT_EQ(l_mtx->at(2, 2), 1.0);
-    ASSERT_NE(l_mtx->at(3, 0), 0.0);
-    ASSERT_NE(l_mtx->at(3, 1), 0.0);
-    ASSERT_NE(l_mtx->at(3, 2), 0.0);
+    using T = typename TestFixture::value_type;
+    ASSERT_EQ(this->l_mtx->at(0, 0), T{1.0});
+    ASSERT_EQ(this->l_mtx->at(0, 1), T{0.0});
+    ASSERT_EQ(this->l_mtx->at(0, 2), T{0.0});
+    ASSERT_NE(this->l_mtx->at(1, 0), T{0.0});
+    ASSERT_EQ(this->l_mtx->at(1, 1), T{1.0});
+    ASSERT_EQ(this->l_mtx->at(1, 2), T{0.0});
+    ASSERT_NE(this->l_mtx->at(2, 0), T{0.0});
+    ASSERT_NE(this->l_mtx->at(2, 1), T{0.0});
+    ASSERT_EQ(this->l_mtx->at(2, 2), T{1.0});
+    ASSERT_NE(this->l_mtx->at(3, 0), T{0.0});
+    ASSERT_NE(this->l_mtx->at(3, 1), T{0.0});
+    ASSERT_NE(this->l_mtx->at(3, 2), T{0.0});
 }
 
 
-TEST_F(MatrixGenerator, CanGenerateUpperTriangularMatrixWithDiagonalOnes)
+TYPED_TEST(MatrixGenerator, CanGenerateUpperTriangularMatrixWithDiagonalOnes)
 {
-    ASSERT_EQ(u_mtx->at(0, 0), 1.0);
-    ASSERT_NE(u_mtx->at(0, 1), 0.0);
-    ASSERT_NE(u_mtx->at(0, 2), 0.0);
-    ASSERT_NE(u_mtx->at(0, 3), 0.0);
-    ASSERT_EQ(u_mtx->at(1, 0), 0.0);
-    ASSERT_EQ(u_mtx->at(1, 1), 1.0);
-    ASSERT_NE(u_mtx->at(1, 2), 0.0);
-    ASSERT_NE(u_mtx->at(1, 3), 0.0);
-    ASSERT_EQ(u_mtx->at(2, 0), 0.0);
-    ASSERT_EQ(u_mtx->at(2, 1), 0.0);
-    ASSERT_EQ(u_mtx->at(2, 2), 1.0);
-    ASSERT_NE(u_mtx->at(2, 3), 0.0);
+    using T = typename TestFixture::value_type;
+    ASSERT_EQ(this->u_mtx->at(0, 0), T{1.0});
+    ASSERT_NE(this->u_mtx->at(0, 1), T{0.0});
+    ASSERT_NE(this->u_mtx->at(0, 2), T{0.0});
+    ASSERT_NE(this->u_mtx->at(0, 3), T{0.0});
+    ASSERT_EQ(this->u_mtx->at(1, 0), T{0.0});
+    ASSERT_EQ(this->u_mtx->at(1, 1), T{1.0});
+    ASSERT_NE(this->u_mtx->at(1, 2), T{0.0});
+    ASSERT_NE(this->u_mtx->at(1, 3), T{0.0});
+    ASSERT_EQ(this->u_mtx->at(2, 0), T{0.0});
+    ASSERT_EQ(this->u_mtx->at(2, 1), T{0.0});
+    ASSERT_EQ(this->u_mtx->at(2, 2), T{1.0});
+    ASSERT_NE(this->u_mtx->at(2, 3), T{0.0});
+}
+
+
+TYPED_TEST(MatrixGenerator, CanGenerateBandMatrix)
+{
+    using T = typename TestFixture::value_type;
+    // the elements out of band are zero
+    for (int row = 0; row < this->band_mtx->get_size()[0]; row++) {
+        for (int col = 0; col < this->band_mtx->get_size()[1]; col++) {
+            if ((col - row > this->upper_bandwidth) ||
+                (row - col > this->lower_bandwidth)) {
+                ASSERT_EQ(this->band_mtx->at(row, col), T{0.0});
+            }
+        }
+    }
+    // check the real part of elements in band
+    this->template check_average_and_deviation<T>(
+        begin(this->band_values_sample), end(this->band_values_sample), 20.0,
+        5.0, [](T &val) { return gko::real(val); });
+    // check the imag part when the type is complex
+    if (!std::is_same<T, gko::remove_complex<T>>::value) {
+        this->template check_average_and_deviation<T>(
+            begin(this->band_values_sample), end(this->band_values_sample),
+            20.0, 5.0, [](T &val) { return gko::imag(val); });
+    }
 }
 
 
diff --git a/core/test/utils/matrix_utils.hpp b/core/test/utils/matrix_utils.hpp
index c8b269d493f..90aa3ea1d6e 100644
--- a/core/test/utils/matrix_utils.hpp
+++ b/core/test/utils/matrix_utils.hpp
@@ -34,8 +34,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_CORE_TEST_UTILS_MATRIX_UTILS_HPP_
 
 
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/temporary_clone.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
+
 #include "core/test/utils/value_generator.hpp"
 
 
@@ -43,38 +46,109 @@ namespace gko {
 namespace test {
 
 
+/**
+ * Make a symmetric matrix
+ *
+ * @tparam ValueType  valuetype of Dense matrix to process
+ *
+ * @param mtx  the dense matrix
+ */
 template <typename ValueType>
 void make_symmetric(matrix::Dense<ValueType> *mtx)
 {
-    assert(mtx->get_executor() == mtx->get_executor()->get_master());
-    for (size_type i = 0; i < mtx->get_size()[0]; ++i) {
-        for (size_type j = i + 1; j < mtx->get_size()[1]; ++j) {
-            mtx->at(i, j) = mtx->at(j, i);
+    GKO_ASSERT_IS_SQUARE_MATRIX(mtx);
+    auto mtx_host =
+        make_temporary_clone(mtx->get_executor()->get_master(), mtx);
+
+    for (size_type i = 0; i < mtx_host->get_size()[0]; ++i) {
+        for (size_type j = i + 1; j < mtx_host->get_size()[1]; ++j) {
+            mtx_host->at(i, j) = mtx_host->at(j, i);
         }
     }
 }
 
 
+/**
+ * Make a hermitian matrix
+ *
+ * @tparam ValueType  valuetype of Dense matrix to process
+ *
+ * @param mtx  the dense matrix
+ */
 template <typename ValueType>
-void make_diag_dominant(matrix::Dense<ValueType> *mtx)
+void make_hermitian(matrix::Dense<ValueType> *mtx)
 {
-    assert(mtx->get_executor() == mtx->get_executor()->get_master());
+    GKO_ASSERT_IS_SQUARE_MATRIX(mtx);
+    auto mtx_host =
+        make_temporary_clone(mtx->get_executor()->get_master(), mtx);
+
+    for (size_type i = 0; i < mtx_host->get_size()[0]; ++i) {
+        for (size_type j = i + 1; j < mtx_host->get_size()[1]; ++j) {
+            mtx_host->at(i, j) = conj(mtx_host->at(j, i));
+        }
+        mtx_host->at(i, i) = gko::real(mtx_host->at(i, i));
+    }
+}
+
+
+/**
+ * Make a (strictly) diagonally dominant matrix. Each diagonal entry is set to
+ * ratio times the sum of the absolute values of all entries of its row (the
+ * old diagonal entry included). When ratio is larger than 1, the result is
+ * strictly diagonally dominant except for all-zero rows. When ratio is 1, the
+ * result is diagonally dominant.
+ *
+ * @tparam ValueType  valuetype of Dense matrix to process
+ *
+ * @param mtx  the dense matrix
+ * @param ratio  the scale to set the diagonal value. default is 1 and it must
+ *               be larger than or equal to 1.
+ */
+template <typename ValueType>
+void make_diag_dominant(matrix::Dense<ValueType> *mtx,
+                        remove_complex<ValueType> ratio = 1.0)
+{
+    // To keep the diag dominant, the ratio should be larger than or equal to 1
+    GKO_ASSERT_EQ(ratio >= 1.0, true);
+    auto mtx_host =
+        make_temporary_clone(mtx->get_executor()->get_master(), mtx);
+
     using std::abs;
-    for (int i = 0; i < mtx->get_size()[0]; ++i) {
+    for (size_type i = 0; i < mtx_host->get_size()[0]; ++i) {
         auto sum = gko::zero<ValueType>();
-        for (int j = 0; j < mtx->get_size()[1]; ++j) {
-            sum += abs(mtx->at(i, j));
+        for (size_type j = 0; j < mtx_host->get_size()[1]; ++j) {
+            sum += abs(mtx_host->at(i, j));
         }
-        mtx->at(i, i) = sum;
+        mtx_host->at(i, i) = sum * ratio;
     }
 }
 
 
+/**
+ * Make a Hermitian positive definite matrix.
+ *
+ * @tparam ValueType  valuetype of Dense matrix to process
+ *
+ * @param mtx  the dense matrix
+ * @param ratio  the ratio for make_diag_dominant. default is 1.001 and it must
+ *               be larger than 1.
+ */
 template <typename ValueType>
-void make_spd(matrix::Dense<ValueType> *mtx)
+void make_hpd(matrix::Dense<ValueType> *mtx,
+              remove_complex<ValueType> ratio = 1.001)
 {
-    make_symmetric(mtx);
-    make_diag_dominant(mtx);
+    GKO_ASSERT_IS_SQUARE_MATRIX(mtx);
+    // To get strictly diagonally dominant matrix, the ratio should be larger
+    // than 1.
+    GKO_ASSERT_EQ(ratio > 1.0, true);
+
+    auto mtx_host =
+        make_temporary_clone(mtx->get_executor()->get_master(), mtx);
+    make_hermitian(mtx_host.get());
+    // Construct strictly diagonally dominant matrix to ensure positive
+    // definite. In complex case, the diagonal is set as absolute value and is
+    // larger than 0, so it still gives positive definite.
+    make_diag_dominant(mtx_host.get(), ratio);
 }
 
 
diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp
new file mode 100644
index 00000000000..dea1b4a55d6
--- /dev/null
+++ b/core/test/utils/matrix_utils_test.cpp
@@ -0,0 +1,201 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/test/utils/matrix_utils.hpp"
+
+
+#include <cmath>
+#include <random>
+#include <type_traits>
+
+
+#include <gtest/gtest.h>
+
+
+#include "core/test/utils.hpp"
+#include "core/test/utils/matrix_generator.hpp"
+
+
+namespace {
+
+
+template <typename T>
+class MatrixUtils : public ::testing::Test {
+protected:
+    using value_type = T;
+    using real_type = gko::remove_complex<T>;
+    using mtx_type = gko::matrix::Dense<T>;
+
+    MatrixUtils()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(gko::test::generate_random_matrix<mtx_type>(
+              500, 500, std::normal_distribution<real_type>(50, 5),
+              std::normal_distribution<real_type>(20.0, 5.0), std::ranlux48(42),
+              exec)),
+          unsquare_mtx(mtx_type::create(exec, gko::dim<2>(500, 100)))
+    {}
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::unique_ptr<mtx_type> mtx;
+    std::unique_ptr<mtx_type> unsquare_mtx;
+};
+
+TYPED_TEST_SUITE(MatrixUtils, gko::test::ValueTypes);
+
+
+TYPED_TEST(MatrixUtils, MakeSymmetricThrowsError)
+{
+    ASSERT_THROW(gko::test::make_symmetric(gko::lend(this->unsquare_mtx)),
+                 gko::DimensionMismatch);
+}
+
+TYPED_TEST(MatrixUtils, MakeHermitianThrowsError)
+{
+    ASSERT_THROW(gko::test::make_hermitian(gko::lend(this->unsquare_mtx)),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(MatrixUtils, MakeDiagDominantThrowsError)
+{
+    ASSERT_THROW(gko::test::make_diag_dominant(gko::lend(this->mtx), 0.9),
+                 gko::ValueMismatch);
+}
+
+
+TYPED_TEST(MatrixUtils, MakeHpdMatrixThrowsError)
+{
+    ASSERT_THROW(gko::test::make_hpd(gko::lend(this->mtx), 1.0),
+                 gko::ValueMismatch);
+}
+
+
+TYPED_TEST(MatrixUtils, MakeSymmetricCorrectly)
+{
+    gko::test::make_symmetric(gko::lend(this->mtx));
+
+    for (gko::size_type i = 0; i < this->mtx->get_size()[0]; i++) {
+        for (gko::size_type j = 0; j <= i; j++) {
+            ASSERT_EQ(this->mtx->at(i, j), this->mtx->at(j, i));
+        }
+    }
+}
+
+
+TYPED_TEST(MatrixUtils, MakeHermitianCorrectly)
+{
+    gko::test::make_hermitian(gko::lend(this->mtx));
+
+    for (gko::size_type i = 0; i < this->mtx->get_size()[0]; i++) {
+        for (gko::size_type j = 0; j <= i; j++) {
+            ASSERT_EQ(this->mtx->at(i, j), gko::conj(this->mtx->at(j, i)));
+        }
+    }
+}
+
+
+TYPED_TEST(MatrixUtils, MakeDiagDominantCorrectly)
+{
+    using T = typename TestFixture::value_type;
+    // make_diag_dominant also consider diag value.
+    // To check the ratio easily, set the diag zeros
+    for (gko::size_type i = 0; i < this->mtx->get_size()[0]; i++) {
+        this->mtx->at(i, i) = 0;
+    }
+
+    gko::test::make_diag_dominant(gko::lend(this->mtx));
+
+    for (gko::size_type i = 0; i < this->mtx->get_size()[0]; i++) {
+        gko::remove_complex<T> off_diag_abs = 0;
+        for (gko::size_type j = 0; j < this->mtx->get_size()[1]; j++) {
+            if (j != i) {
+                off_diag_abs += std::abs(this->mtx->at(i, j));
+            }
+        }
+        ASSERT_NEAR(gko::real(this->mtx->at(i, i)), off_diag_abs, r<T>::value);
+    }
+}
+
+
+TYPED_TEST(MatrixUtils, MakeDiagDominantWithRatioCorrectly)
+{
+    using T = typename TestFixture::value_type;
+    gko::remove_complex<T> ratio = 1.001;
+    // make_diag_dominant also consider diag value.
+    // To check the ratio easily, set the diag zeros
+    for (gko::size_type i = 0; i < this->mtx->get_size()[0]; i++) {
+        this->mtx->at(i, i) = 0;
+    }
+
+    gko::test::make_diag_dominant(gko::lend(this->mtx), ratio);
+
+    for (gko::size_type i = 0; i < this->mtx->get_size()[0]; i++) {
+        gko::remove_complex<T> off_diag_abs = 0;
+        for (gko::size_type j = 0; j < this->mtx->get_size()[1]; j++) {
+            if (j != i) {
+                off_diag_abs += std::abs(this->mtx->at(i, j));
+            }
+        }
+        ASSERT_NEAR(gko::real(this->mtx->at(i, i)), off_diag_abs * ratio,
+                    r<T>::value);
+    }
+}
+
+
+TYPED_TEST(MatrixUtils, MakeHpdMatrixCorrectly)
+{
+    using T = typename TestFixture::value_type;
+    auto cpy_mtx = this->mtx->clone();
+
+    gko::test::make_hpd(gko::lend(this->mtx));
+    gko::test::make_hermitian(gko::lend(cpy_mtx));
+    gko::test::make_diag_dominant(gko::lend(cpy_mtx), 1.001);
+
+    GKO_ASSERT_MTX_NEAR(this->mtx, cpy_mtx, r<T>::value);
+}
+
+
+TYPED_TEST(MatrixUtils, MakeHpdMatrixWithRatioCorrectly)
+{
+    using T = typename TestFixture::value_type;
+    gko::remove_complex<T> ratio = 1.00001;
+    auto cpy_mtx = this->mtx->clone();
+
+    gko::test::make_hpd(gko::lend(this->mtx), ratio);
+    gko::test::make_hermitian(gko::lend(cpy_mtx));
+    gko::test::make_diag_dominant(gko::lend(cpy_mtx), ratio);
+
+    GKO_ASSERT_MTX_NEAR(this->mtx, cpy_mtx, r<T>::value);
+}
+
+
+}  // namespace
diff --git a/core/test/utils/unsort_matrix_test.cpp b/core/test/utils/unsort_matrix_test.cpp
index 0a242c7dc2c..90fbbc2ccf2 100644
--- a/core/test/utils/unsort_matrix_test.cpp
+++ b/core/test/utils/unsort_matrix_test.cpp
@@ -55,18 +55,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace {
 
 
+template <typename ValueIndexType>
 class UnsortMatrix : public ::testing::Test {
 protected:
-    using value_type = double;
-    using index_type = gko::int32;
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
     using Csr = gko::matrix::Csr<value_type, index_type>;
     using Coo = gko::matrix::Coo<value_type, index_type>;
     using Dense = gko::matrix::Dense<value_type>;
     UnsortMatrix()
         : exec(gko::ReferenceExecutor::create()),
           rand_engine(42),
-          /*
-           */
           csr_empty(Csr::create(exec, gko::dim<2>(0, 0))),
           coo_empty(Coo::create(exec, gko::dim<2>(0, 0)))
     {}
@@ -148,54 +149,58 @@ class UnsortMatrix : public ::testing::Test {
     std::unique_ptr<Coo> coo_empty;
 };
 
+TYPED_TEST_SUITE(UnsortMatrix, gko::test::ValueIndexTypes);
 
-TEST_F(UnsortMatrix, CsrWorks)
+
+TYPED_TEST(UnsortMatrix, CsrWorks)
 {
-    auto csr = get_sorted_csr();
-    const auto ref_mtx = get_sorted_csr();
-    bool was_sorted = is_csr_matrix_sorted(gko::lend(csr));
+    auto csr = this->get_sorted_csr();
+    const auto ref_mtx = this->get_sorted_csr();
+    bool was_sorted = this->is_csr_matrix_sorted(gko::lend(csr));
 
-    gko::test::unsort_matrix(gko::lend(csr), rand_engine);
+    gko::test::unsort_matrix(gko::lend(csr), this->rand_engine);
 
-    ASSERT_FALSE(is_csr_matrix_sorted(gko::lend(csr)));
+    ASSERT_FALSE(this->is_csr_matrix_sorted(gko::lend(csr)));
     ASSERT_TRUE(was_sorted);
     GKO_ASSERT_MTX_NEAR(csr, ref_mtx, 0.);
 }
 
 
-TEST_F(UnsortMatrix, CsrWorksWithEmpty)
+TYPED_TEST(UnsortMatrix, CsrWorksWithEmpty)
 {
-    const bool was_sorted = is_csr_matrix_sorted(gko::lend(csr_empty));
+    const bool was_sorted =
+        this->is_csr_matrix_sorted(gko::lend(this->csr_empty));
 
-    gko::test::unsort_matrix(gko::lend(csr_empty), rand_engine);
+    gko::test::unsort_matrix(gko::lend(this->csr_empty), this->rand_engine);
 
     ASSERT_TRUE(was_sorted);
-    ASSERT_EQ(csr_empty->get_num_stored_elements(), 0);
+    ASSERT_EQ(this->csr_empty->get_num_stored_elements(), 0);
 }
 
 
-TEST_F(UnsortMatrix, CooWorks)
+TYPED_TEST(UnsortMatrix, CooWorks)
 {
-    auto coo = get_sorted_coo();
-    const auto ref_mtx = get_sorted_coo();
-    const bool was_sorted = is_coo_matrix_sorted(gko::lend(coo));
+    auto coo = this->get_sorted_coo();
+    const auto ref_mtx = this->get_sorted_coo();
+    const bool was_sorted = this->is_coo_matrix_sorted(gko::lend(coo));
 
-    gko::test::unsort_matrix(gko::lend(coo), rand_engine);
+    gko::test::unsort_matrix(gko::lend(coo), this->rand_engine);
 
-    ASSERT_FALSE(is_coo_matrix_sorted(gko::lend(coo)));
+    ASSERT_FALSE(this->is_coo_matrix_sorted(gko::lend(coo)));
     ASSERT_TRUE(was_sorted);
     GKO_ASSERT_MTX_NEAR(coo, ref_mtx, 0.);
 }
 
 
-TEST_F(UnsortMatrix, CooWorksWithEmpty)
+TYPED_TEST(UnsortMatrix, CooWorksWithEmpty)
 {
-    const bool was_sorted = is_coo_matrix_sorted(gko::lend(coo_empty));
+    const bool was_sorted =
+        this->is_coo_matrix_sorted(gko::lend(this->coo_empty));
 
-    gko::test::unsort_matrix(gko::lend(coo_empty), rand_engine);
+    gko::test::unsort_matrix(gko::lend(this->coo_empty), this->rand_engine);
 
     ASSERT_TRUE(was_sorted);
-    ASSERT_EQ(coo_empty->get_num_stored_elements(), 0);
+    ASSERT_EQ(this->coo_empty->get_num_stored_elements(), 0);
 }
 
 
diff --git a/core/test/utils/value_generator.hpp b/core/test/utils/value_generator.hpp
index 8b82ea63d0a..8791bf6ce01 100644
--- a/core/test/utils/value_generator.hpp
+++ b/core/test/utils/value_generator.hpp
@@ -46,19 +46,35 @@ namespace test {
 namespace detail {
 
 
-template <typename ValueType, typename Distribution, typename Generator>
+/**
+ * Generate a random value.
+ *
+ * @tparam ValueType  valuetype of the value
+ * @tparam ValueDistribution  type of value distribution
+ * @tparam Engine  type of random engine
+ *
+ * @param value_dist  distribution of the generated value
+ * @param gen  a random engine
+ *
+ * @return a random value of type ValueType drawn from value_dist
+ */
+template <typename ValueType, typename ValueDistribution, typename Engine>
 typename std::enable_if<!is_complex_s<ValueType>::value, ValueType>::type
-get_rand_value(Distribution &&dist, Generator &&gen)
+get_rand_value(ValueDistribution &&value_dist, Engine &&gen)
 {
-    return dist(gen);
+    return value_dist(gen);
 }
 
-
-template <typename ValueType, typename Distribution, typename Generator>
+/**
+ * Specialization for complex types.
+ *
+ * @copydoc get_rand_value
+ */
+template <typename ValueType, typename ValueDistribution, typename Engine>
 typename std::enable_if<is_complex_s<ValueType>::value, ValueType>::type
-get_rand_value(Distribution &&dist, Generator &&gen)
+get_rand_value(ValueDistribution &&value_dist, Engine &&gen)
 {
-    return ValueType(dist(gen), dist(gen));
+    return ValueType(value_dist(gen), value_dist(gen));
 }
 
 
diff --git a/core/test/utils/value_generator_test.cpp b/core/test/utils/value_generator_test.cpp
new file mode 100644
index 00000000000..58f033404a9
--- /dev/null
+++ b/core/test/utils/value_generator_test.cpp
@@ -0,0 +1,118 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/test/utils/value_generator.hpp"
+
+
+#include <cmath>
+#include <random>
+#include <type_traits>
+
+
+#include <gtest/gtest.h>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename T>
+class ValueGenerator : public ::testing::Test {
+protected:
+    using value_type = T;
+
+    ValueGenerator() {}
+
+    template <typename InputIterator, typename ValueType, typename Closure>
+    ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start,
+                             InputIterator sample_end, Closure closure_op)
+    {
+        using std::pow;
+        ValueType res = 0;
+        ValueType num_elems = 0;
+        while (sample_start != sample_end) {
+            auto tmp = *(sample_start++);
+            res += pow(closure_op(tmp) - c, n);
+            num_elems += 1;
+        }
+        return res / num_elems;
+    }
+
+    template <typename ValueType, typename InputIterator, typename Closure>
+    void check_average_and_deviation(
+        InputIterator sample_start, InputIterator sample_end,
+        gko::remove_complex<ValueType> average_ans,
+        gko::remove_complex<ValueType> deviation_ans, Closure closure_op)
+    {
+        auto average =
+            this->get_nth_moment(1, gko::zero<gko::remove_complex<ValueType>>(),
+                                 sample_start, sample_end, closure_op);
+        auto deviation = sqrt(this->get_nth_moment(2, average, sample_start,
+                                                   sample_end, closure_op));
+
+        // check that average & deviation are within 10% of the expected values
+        ASSERT_NEAR(average, average_ans, average_ans * 0.1);
+        ASSERT_NEAR(deviation, deviation_ans, deviation_ans * 0.1);
+    }
+};
+
+TYPED_TEST_SUITE(ValueGenerator, gko::test::ValueTypes);
+
+
+TYPED_TEST(ValueGenerator, OutputHasCorrectAverageAndDeviation)
+{
+    using T = typename TestFixture::value_type;
+    int num = 500;
+    std::vector<T> values(num);
+    auto dist = std::normal_distribution<double>(20.0, 5.0);
+    auto engine = std::ranlux48(42);
+
+    for (int i = 0; i < num; i++) {
+        values.at(i) = gko::test::detail::get_rand_value<T>(dist, engine);
+    }
+
+    // check the real part
+    this->template check_average_and_deviation<T>(
+        begin(values), end(values), 20.0, 5.0,
+        [](T &val) { return gko::real(val); });
+    // check the imag part when the type is complex
+    if (!std::is_same<T, gko::remove_complex<T>>::value) {
+        this->template check_average_and_deviation<T>(
+            begin(values), end(values), 20.0, 5.0,
+            [](T &val) { return gko::imag(val); });
+    }
+}
+
+
+}  // namespace
diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
index 2814f9177c0..524f4a09f3d 100644
--- a/cuda/test/multigrid/amgx_pgm_kernels.cpp
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -115,7 +115,7 @@ class AmgxPgm : public ::testing::Test {
         weight->convert_to(weight_csr.get());
         weight_diag = weight_csr->extract_diagonal();
         auto system_dense = gen_mtx(m, m);
-        gko::test::make_spd(system_dense.get());
+        gko::test::make_hpd(system_dense.get());
         system_mtx = Csr::create(ref);
         system_dense->convert_to(system_mtx.get());
 
diff --git a/cuda/test/solver/bicg_kernels.cpp b/cuda/test/solver/bicg_kernels.cpp
index d82cc0a28a0..fb62f702ccf 100644
--- a/cuda/test/solver/bicg_kernels.cpp
+++ b/cuda/test/solver/bicg_kernels.cpp
@@ -258,7 +258,7 @@ TEST_F(Bicg, CudaBicgStep2IsEquivalentToRef)
 TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    gko::test::make_spd(mtx.get());
+    gko::test::make_hpd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(cuda);
diff --git a/cuda/test/solver/cg_kernels.cpp b/cuda/test/solver/cg_kernels.cpp
index 8333fc7d0a8..e6443543efb 100644
--- a/cuda/test/solver/cg_kernels.cpp
+++ b/cuda/test/solver/cg_kernels.cpp
@@ -207,7 +207,7 @@ TEST_F(Cg, CudaCgStep2IsEquivalentToRef)
 TEST_F(Cg, ApplyIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    gko::test::make_spd(mtx.get());
+    gko::test::make_hpd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(cuda);
diff --git a/cuda/test/solver/fcg_kernels.cpp b/cuda/test/solver/fcg_kernels.cpp
index b2832945666..c18444ee850 100644
--- a/cuda/test/solver/fcg_kernels.cpp
+++ b/cuda/test/solver/fcg_kernels.cpp
@@ -220,7 +220,7 @@ TEST_F(Fcg, CudaFcgStep2IsEquivalentToRef)
 TEST_F(Fcg, ApplyIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    gko::test::make_spd(mtx.get());
+    gko::test::make_hpd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(cuda);
diff --git a/hip/test/multigrid/amgx_pgm_kernels.cpp b/hip/test/multigrid/amgx_pgm_kernels.cpp
index 39546df6cff..ce63db8a8dd 100644
--- a/hip/test/multigrid/amgx_pgm_kernels.cpp
+++ b/hip/test/multigrid/amgx_pgm_kernels.cpp
@@ -114,7 +114,7 @@ class AmgxPgm : public ::testing::Test {
         weight->convert_to(weight_csr.get());
         weight_diag = weight_csr->extract_diagonal();
         auto system_dense = gen_mtx(m, m);
-        gko::test::make_spd(system_dense.get());
+        gko::test::make_hpd(system_dense.get());
         system_mtx = Csr::create(ref);
         system_dense->convert_to(system_mtx.get());
 
diff --git a/hip/test/solver/bicg_kernels.cpp b/hip/test/solver/bicg_kernels.cpp
index c29bd74374c..d8510e46145 100644
--- a/hip/test/solver/bicg_kernels.cpp
+++ b/hip/test/solver/bicg_kernels.cpp
@@ -258,7 +258,7 @@ TEST_F(Bicg, HipBicgStep2IsEquivalentToRef)
 TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    gko::test::make_spd(mtx.get());
+    gko::test::make_hpd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(hip);
diff --git a/hip/test/solver/cg_kernels.cpp b/hip/test/solver/cg_kernels.cpp
index cd4000171c0..a93c20a3287 100644
--- a/hip/test/solver/cg_kernels.cpp
+++ b/hip/test/solver/cg_kernels.cpp
@@ -207,7 +207,7 @@ TEST_F(Cg, HipCgStep2IsEquivalentToRef)
 TEST_F(Cg, ApplyIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    gko::test::make_spd(mtx.get());
+    gko::test::make_hpd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(hip);
diff --git a/hip/test/solver/fcg_kernels.cpp b/hip/test/solver/fcg_kernels.cpp
index ca86775ee94..7a97115a998 100644
--- a/hip/test/solver/fcg_kernels.cpp
+++ b/hip/test/solver/fcg_kernels.cpp
@@ -220,7 +220,7 @@ TEST_F(Fcg, HipFcgStep2IsEquivalentToRef)
 TEST_F(Fcg, ApplyIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    gko::test::make_spd(mtx.get());
+    gko::test::make_hpd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(hip);
diff --git a/omp/multigrid/amgx_pgm_kernels.cpp b/omp/multigrid/amgx_pgm_kernels.cpp
index 3ce1b087fd8..5729d01785f 100644
--- a/omp/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/multigrid/amgx_pgm_kernels.cpp
@@ -200,6 +200,7 @@ void assign_to_exist_agg(std::shared_ptr<const OmpExecutor> exec,
                        ? intermediate_agg.get_data()
                        : agg.get_data();
     const auto diag_vals = diag->get_const_values();
+#pragma omp parallel for
     for (IndexType row = 0; row < agg.get_num_elems(); row++) {
         if (agg_const_val[row] != -1) {
             continue;
diff --git a/omp/test/multigrid/amgx_pgm_kernels.cpp b/omp/test/multigrid/amgx_pgm_kernels.cpp
index e0b063de0bd..a452c9783ce 100644
--- a/omp/test/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/test/multigrid/amgx_pgm_kernels.cpp
@@ -105,7 +105,7 @@ class AmgxPgm : public ::testing::Test {
         weight->convert_to(weight_csr.get());
         weight_diag = weight_csr->extract_diagonal();
         auto system_dense = gen_mtx(m, m);
-        gko::test::make_spd(system_dense.get());
+        gko::test::make_hpd(system_dense.get());
         system_mtx = Csr::create(ref);
         system_dense->convert_to(system_mtx.get());
 
diff --git a/omp/test/solver/bicg_kernels.cpp b/omp/test/solver/bicg_kernels.cpp
index 2018fa5f788..5049ab9ec10 100644
--- a/omp/test/solver/bicg_kernels.cpp
+++ b/omp/test/solver/bicg_kernels.cpp
@@ -238,7 +238,7 @@ TEST_F(Bicg, OmpBicgStep2IsEquivalentToRef)
 TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    gko::test::make_spd(mtx.get());
+    gko::test::make_hpd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(omp);
diff --git a/omp/test/solver/cg_kernels.cpp b/omp/test/solver/cg_kernels.cpp
index 1e86956602a..0a5014aad64 100644
--- a/omp/test/solver/cg_kernels.cpp
+++ b/omp/test/solver/cg_kernels.cpp
@@ -206,7 +206,7 @@ TEST_F(Cg, OmpCgStep2IsEquivalentToRef)
 TEST_F(Cg, ApplyIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    gko::test::make_spd(mtx.get());
+    gko::test::make_hpd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(omp);
diff --git a/omp/test/solver/fcg_kernels.cpp b/omp/test/solver/fcg_kernels.cpp
index 632a2c80808..85fd4cfebfd 100644
--- a/omp/test/solver/fcg_kernels.cpp
+++ b/omp/test/solver/fcg_kernels.cpp
@@ -219,7 +219,7 @@ TEST_F(Fcg, OmpFcgStep2IsEquivalentToRef)
 TEST_F(Fcg, ApplyIsEquivalentToRef)
 {
     auto mtx = gen_mtx(50, 50);
-    gko::test::make_spd(mtx.get());
+    gko::test::make_hpd(mtx.get());
     auto x = gen_mtx(50, 3);
     auto b = gen_mtx(50, 3);
     auto d_mtx = Mtx::create(omp);

From b5145bb879b0f5527a05c5ce7962d2edc122b970 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 9 Apr 2021 02:34:18 +0800
Subject: [PATCH 14/16] Review update

Co-authored-by: Pratik Nayak <pratikvn@protonmail.com>
Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
---
 common/multigrid/amgx_pgm_kernels.hpp.inc | 21 +++++++++++----------
 core/test/utils/matrix_utils.hpp          |  2 +-
 cuda/multigrid/amgx_pgm_kernels.cu        |  1 +
 cuda/test/multigrid/amgx_pgm_kernels.cpp  |  4 ++--
 hip/multigrid/amgx_pgm_kernels.hip.cpp    |  1 +
 hip/test/multigrid/amgx_pgm_kernels.cpp   |  4 ++--
 omp/multigrid/amgx_pgm_kernels.cpp        | 14 +++++++-------
 omp/test/multigrid/amgx_pgm_kernels.cpp   |  4 ++--
 reference/multigrid/amgx_pgm_kernels.cpp  | 14 +++++++-------
 9 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/common/multigrid/amgx_pgm_kernels.hpp.inc b/common/multigrid/amgx_pgm_kernels.hpp.inc
index 9c2d434f508..623a8945a44 100644
--- a/common/multigrid/amgx_pgm_kernels.hpp.inc
+++ b/common/multigrid/amgx_pgm_kernels.hpp.inc
@@ -122,13 +122,13 @@ __global__
         }
         auto weight = weight_vals[idx] / max(abs(diag[row]), abs(diag[col]));
         if (agg[col] == -1 &&
-            (weight > max_weight_unagg ||
-             (weight == max_weight_unagg && col > strongest_unagg))) {
+            thrust::tie(weight, col) >
+                thrust::tie(max_weight_unagg, strongest_unagg)) {
             max_weight_unagg = weight;
             strongest_unagg = col;
         } else if (agg[col] != -1 &&
-                   (weight > max_weight_agg ||
-                    (weight == max_weight_agg && col > strongest_agg))) {
+                   thrust::tie(weight, col) >
+                       thrust::tie(max_weight_agg, strongest_agg)) {
             max_weight_agg = weight;
             strongest_agg = col;
         }
@@ -173,8 +173,8 @@ __global__
         }
         auto weight = weight_vals[idx] / max(abs(diag[row]), abs(diag[col]));
         if (agg_const_val[col] != -1 &&
-            (weight > max_weight_agg ||
-             (weight == max_weight_agg && col > strongest_agg))) {
+            thrust::tie(weight, col) >
+                thrust::tie(max_weight_agg, strongest_agg)) {
             max_weight_agg = weight;
             strongest_agg = col;
         }
@@ -209,8 +209,8 @@ __global__
         }
         auto weight = weight_vals[idx] / max(abs(diag[row]), abs(diag[col]));
         if (agg_val[col] != -1 &&
-            (weight > max_weight_agg ||
-             (weight == max_weight_agg && col > strongest_agg))) {
+            thrust::tie(weight, col) >
+                thrust::tie(max_weight_agg, strongest_agg)) {
             max_weight_agg = weight;
             strongest_agg = col;
         }
@@ -239,6 +239,7 @@ __global__ __launch_bounds__(default_block_size) void get_source_row_map_kernel(
     row_map[row] = atomic_add(result_row_ptrs + result_idx, num_elems);
 }
 
+
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void move_row_kernel(
     const size_type source_nrows, const IndexType *__restrict__ agg_val,
@@ -316,10 +317,10 @@ __global__ __launch_bounds__(default_block_size) void copy_to_coarse_kernel(
         return;
     }
     auto temp_i = temp_row_ptrs[row];
-    for (auto i = coarse_row_ptrs[row]; i < coarse_row_ptrs[row + 1];
-         i++, temp_i++) {
+    for (auto i = coarse_row_ptrs[row]; i < coarse_row_ptrs[row + 1]; i++) {
         coarse_col_idxs[i] = temp_col_idxs[temp_i];
         coarse_values[i] = temp_values[temp_i];
+        temp_i++;
     }
 }
 
diff --git a/core/test/utils/matrix_utils.hpp b/core/test/utils/matrix_utils.hpp
index 90aa3ea1d6e..dc6586f07b7 100644
--- a/core/test/utils/matrix_utils.hpp
+++ b/core/test/utils/matrix_utils.hpp
@@ -96,7 +96,7 @@ void make_hermitian(matrix::Dense<ValueType> *mtx)
  * the summation among the absoulue value of the row's elements. When ratio is
  * larger than 1, the result will be strictly diagonal dominant matrix except
  * for the empty row. When ratio is 1, the result will be diagonal dominant
- * matirx.
+ * matrix.
  *
  * @tparam ValueType  valuetype of Dense matrix to process
  *
diff --git a/cuda/multigrid/amgx_pgm_kernels.cu b/cuda/multigrid/amgx_pgm_kernels.cu
index 1560d7bf7f7..834f8d17f31 100644
--- a/cuda/multigrid/amgx_pgm_kernels.cu
+++ b/cuda/multigrid/amgx_pgm_kernels.cu
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <cuda.h>
 #include <cusparse.h>
+#include <thrust/tuple.h>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
index 524f4a09f3d..6de292a8a7f 100644
--- a/cuda/test/multigrid/amgx_pgm_kernels.cpp
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -140,7 +140,7 @@ class AmgxPgm : public ::testing::Test {
     void make_weight(Mtx *mtx)
     {
         gko::test::make_symmetric(mtx);
-        // only works for realvalue cases
+        // only works for real value cases.
         mtx->compute_absolute_inplace();
         gko::test::make_diag_dominant(mtx);
     }
@@ -213,7 +213,7 @@ TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
 
     ASSERT_EQ(d_num_agg, num_agg);
     GKO_ASSERT_ARRAY_EQ(d_agg, agg);
-    ASSERT_LE(num_agg, 300);
+    ASSERT_LE(num_agg, n);
 }
 
 
diff --git a/hip/multigrid/amgx_pgm_kernels.hip.cpp b/hip/multigrid/amgx_pgm_kernels.hip.cpp
index 2199f53e5c3..57498bca7df 100644
--- a/hip/multigrid/amgx_pgm_kernels.hip.cpp
+++ b/hip/multigrid/amgx_pgm_kernels.hip.cpp
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <hip/hip_runtime.h>
 #include <hipsparse.h>
+#include <thrust/tuple.h>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/hip/test/multigrid/amgx_pgm_kernels.cpp b/hip/test/multigrid/amgx_pgm_kernels.cpp
index ce63db8a8dd..dd1ae6032c0 100644
--- a/hip/test/multigrid/amgx_pgm_kernels.cpp
+++ b/hip/test/multigrid/amgx_pgm_kernels.cpp
@@ -139,7 +139,7 @@ class AmgxPgm : public ::testing::Test {
     void make_weight(Mtx *mtx)
     {
         gko::test::make_symmetric(mtx);
-        // only works for realvalue cases
+        // only works for real value cases
         mtx->compute_absolute_inplace();
         gko::test::make_diag_dominant(mtx);
     }
@@ -212,7 +212,7 @@ TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
 
     ASSERT_EQ(d_num_agg, num_agg);
     GKO_ASSERT_ARRAY_EQ(d_agg, agg);
-    ASSERT_LE(num_agg, 300);
+    ASSERT_LE(num_agg, n);
 }
 
 
diff --git a/omp/multigrid/amgx_pgm_kernels.cpp b/omp/multigrid/amgx_pgm_kernels.cpp
index 5729d01785f..408478242dc 100644
--- a/omp/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/multigrid/amgx_pgm_kernels.cpp
@@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <memory>
+#include <tuple>
 
 
 #include <omp.h>
@@ -154,14 +155,13 @@ void find_strongest_neighbor(
                 auto weight =
                     vals[idx] / max(abs(diag_vals[row]), abs(diag_vals[col]));
                 if (agg.get_const_data()[col] == -1 &&
-                    (weight > max_weight_unagg ||
-                     (weight == max_weight_unagg && col > strongest_unagg))) {
+                    std::tie(weight, col) >
+                        std::tie(max_weight_unagg, strongest_unagg)) {
                     max_weight_unagg = weight;
                     strongest_unagg = col;
                 } else if (agg.get_const_data()[col] != -1 &&
-                           (weight > max_weight_agg ||
-                            (weight == max_weight_agg &&
-                             col > strongest_agg))) {
+                           std::tie(weight, col) >
+                               std::tie(max_weight_agg, strongest_agg)) {
                     max_weight_agg = weight;
                     strongest_agg = col;
                 }
@@ -215,8 +215,8 @@ void assign_to_exist_agg(std::shared_ptr<const OmpExecutor> exec,
             auto weight =
                 vals[idx] / max(abs(diag_vals[row]), abs(diag_vals[col]));
             if (agg_const_val[col] != -1 &&
-                (weight > max_weight_agg ||
-                 (weight == max_weight_agg && col > strongest_agg))) {
+                std::tie(weight, col) >
+                    std::tie(max_weight_agg, strongest_agg)) {
                 max_weight_agg = weight;
                 strongest_agg = col;
             }
diff --git a/omp/test/multigrid/amgx_pgm_kernels.cpp b/omp/test/multigrid/amgx_pgm_kernels.cpp
index a452c9783ce..bd20b3749b1 100644
--- a/omp/test/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/test/multigrid/amgx_pgm_kernels.cpp
@@ -130,7 +130,7 @@ class AmgxPgm : public ::testing::Test {
     void make_weight(Mtx *mtx)
     {
         gko::test::make_symmetric(mtx);
-        // it is only works for realvalue case.
+        // it only works for the real value case.
         mtx->compute_absolute_inplace();
         gko::test::make_diag_dominant(mtx);
     }
@@ -203,7 +203,7 @@ TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
 
     ASSERT_EQ(d_num_agg, num_agg);
     GKO_ASSERT_ARRAY_EQ(d_agg, agg);
-    ASSERT_LE(num_agg, 300);
+    ASSERT_LE(num_agg, n);
 }
 
 
diff --git a/reference/multigrid/amgx_pgm_kernels.cpp b/reference/multigrid/amgx_pgm_kernels.cpp
index d5b1ddbebb1..16002259fcf 100644
--- a/reference/multigrid/amgx_pgm_kernels.cpp
+++ b/reference/multigrid/amgx_pgm_kernels.cpp
@@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <memory>
+#include <tuple>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -148,14 +149,13 @@ void find_strongest_neighbor(
                 auto weight =
                     vals[idx] / max(abs(diag_vals[row]), abs(diag_vals[col]));
                 if (agg.get_const_data()[col] == -1 &&
-                    (weight > max_weight_unagg ||
-                     (weight == max_weight_unagg && col > strongest_unagg))) {
+                    std::tie(weight, col) >
+                        std::tie(max_weight_unagg, strongest_unagg)) {
                     max_weight_unagg = weight;
                     strongest_unagg = col;
                 } else if (agg.get_const_data()[col] != -1 &&
-                           (weight > max_weight_agg ||
-                            (weight == max_weight_agg &&
-                             col > strongest_agg))) {
+                           std::tie(weight, col) >
+                               std::tie(max_weight_agg, strongest_agg)) {
                     max_weight_agg = weight;
                     strongest_agg = col;
                 }
@@ -208,8 +208,8 @@ void assign_to_exist_agg(std::shared_ptr<const ReferenceExecutor> exec,
             auto weight =
                 vals[idx] / max(abs(diag_vals[row]), abs(diag_vals[col]));
             if (agg_const_val[col] != -1 &&
-                (weight > max_weight_agg ||
-                 (weight == max_weight_agg && col > strongest_agg))) {
+                std::tie(weight, col) >
+                    std::tie(max_weight_agg, strongest_agg)) {
                 max_weight_agg = weight;
                 strongest_agg = col;
             }

From bef43736140b14b568129a4a2e83330700369ed8 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Tue, 27 Apr 2021 02:46:32 +0800
Subject: [PATCH 15/16] improve renumber and pass rstr/prlg into generate

Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
---
 common/multigrid/amgx_pgm_kernels.hpp.inc     |  4 +-
 core/multigrid/amgx_pgm.cpp                   | 23 +++----
 core/multigrid/amgx_pgm_kernels.hpp           | 14 ++--
 cuda/multigrid/amgx_pgm_kernels.cu            | 10 +--
 cuda/test/multigrid/amgx_pgm_kernels.cpp      | 64 +++++++++++++++----
 dpcpp/multigrid/amgx_pgm_kernels.dp.cpp       | 12 ++--
 hip/multigrid/amgx_pgm_kernels.hip.cpp        | 10 +--
 hip/test/multigrid/amgx_pgm_kernels.cpp       | 64 +++++++++++++++----
 omp/multigrid/amgx_pgm_kernels.cpp            | 13 ++--
 omp/test/multigrid/amgx_pgm_kernels.cpp       | 64 +++++++++++++++----
 reference/multigrid/amgx_pgm_kernels.cpp      |  8 ++-
 reference/test/multigrid/amgx_pgm_kernels.cpp | 12 ++--
 12 files changed, 218 insertions(+), 80 deletions(-)

diff --git a/common/multigrid/amgx_pgm_kernels.hpp.inc b/common/multigrid/amgx_pgm_kernels.hpp.inc
index 623a8945a44..3ec6bf81a7b 100644
--- a/common/multigrid/amgx_pgm_kernels.hpp.inc
+++ b/common/multigrid/amgx_pgm_kernels.hpp.inc
@@ -77,7 +77,9 @@ __global__ __launch_bounds__(default_block_size) void fill_agg_kernel(
     if (tidx >= num) {
         return;
     }
-    result[index[tidx]] = 1;
+    // agg_vals[i] == i always holds in the aggregated group whose identifier is
+    // i because we use the index of element as the aggregated group identifier.
+    result[tidx] = (index[tidx] == tidx);
 }
 
 
diff --git a/core/multigrid/amgx_pgm.cpp b/core/multigrid/amgx_pgm.cpp
index 1803c4252d2..d9fac590691 100644
--- a/core/multigrid/amgx_pgm.cpp
+++ b/core/multigrid/amgx_pgm.cpp
@@ -74,15 +74,17 @@ namespace {
 template <typename ValueType, typename IndexType>
 std::unique_ptr<LinOp> amgx_pgm_generate(
     std::shared_ptr<const Executor> exec,
-    const matrix::Csr<ValueType, IndexType> *source, const size_type num_agg,
-    const Array<IndexType> &agg)
+    const matrix::Csr<ValueType, IndexType> *source,
+    const matrix::Csr<ValueType, IndexType> *prolong_op,
+    const matrix::Csr<ValueType, IndexType> *restrict_op)
 {
+    auto num_agg = prolong_op->get_size()[1];
     auto coarse = matrix::Csr<ValueType, IndexType>::create(
         exec, dim<2>{num_agg, num_agg}, 0, source->get_strategy());
     auto temp = matrix::Csr<ValueType, IndexType>::create(
         exec, dim<2>{num_agg, num_agg}, source->get_num_stored_elements());
-    exec->run(amgx_pgm::make_amgx_pgm_generate(source, agg, coarse.get(),
-                                               temp.get()));
+    exec->run(amgx_pgm::make_amgx_pgm_generate(source, prolong_op, restrict_op,
+                                               coarse.get(), temp.get()));
     return std::move(coarse);
 }
 
@@ -158,13 +160,8 @@ void AmgxPgm<ValueType, IndexType>::generate()
     // Renumber the index
     exec->run(amgx_pgm::make_renumber(agg_, &num_agg));
 
-    // Construct the coarse matrix
-    auto coarse_matrix =
-        share(amgx_pgm_generate(exec, amgxpgm_op, num_agg, agg_));
-    // this->set_multigrid_level(system_matrix_, coarse_matrix);
-    auto coarse_dim = coarse_matrix->get_size()[0];
+    auto coarse_dim = num_agg;
     auto fine_dim = system_matrix_->get_size()[0];
-
     // TODO: prolong_op can be done with lightway format
     auto prolong_op = share(
         matrix_type::create(exec, gko::dim<2>{fine_dim, coarse_dim}, fine_dim));
@@ -175,7 +172,11 @@ void AmgxPgm<ValueType, IndexType>::generate()
     exec->run(amgx_pgm::make_fill_array(prolong_op->get_values(), fine_dim,
                                         one<ValueType>()));
     // TODO: implement the restrict_op from aggregation.
-    auto restrict_op = share(prolong_op->transpose());
+    auto restrict_op = gko::as<matrix_type>(share(prolong_op->transpose()));
+
+    // Construct the coarse matrix
+    auto coarse_matrix = share(amgx_pgm_generate(
+        exec, amgxpgm_op, prolong_op.get(), restrict_op.get()));
     this->set_multigrid_level(prolong_op, coarse_matrix, restrict_op);
 }
 
diff --git a/core/multigrid/amgx_pgm_kernels.hpp b/core/multigrid/amgx_pgm_kernels.hpp
index e0006be63a7..5bbb48b9594 100644
--- a/core/multigrid/amgx_pgm_kernels.hpp
+++ b/core/multigrid/amgx_pgm_kernels.hpp
@@ -78,12 +78,14 @@ namespace amgx_pgm {
         const matrix::Diagonal<ValueType> *diag, Array<IndexType> &agg, \
         Array<IndexType> &intermediate_agg)
 
-#define GKO_DECLARE_AMGX_PGM_GENERATE(ValueType, IndexType)                 \
-    void amgx_pgm_generate(std::shared_ptr<const DefaultExecutor> exec,     \
-                           const matrix::Csr<ValueType, IndexType> *source, \
-                           const Array<IndexType> &agg,                     \
-                           matrix::Csr<ValueType, IndexType> *coarse,       \
-                           matrix::Csr<ValueType, IndexType> *temp)
+#define GKO_DECLARE_AMGX_PGM_GENERATE(ValueType, IndexType)   \
+    void amgx_pgm_generate(                                   \
+        std::shared_ptr<const DefaultExecutor> exec,          \
+        const matrix::Csr<ValueType, IndexType> *source,      \
+        const matrix::Csr<ValueType, IndexType> *prolong_op,  \
+        const matrix::Csr<ValueType, IndexType> *restrict_op, \
+        matrix::Csr<ValueType, IndexType> *coarse,            \
+        matrix::Csr<ValueType, IndexType> *temp)
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                    \
     template <typename IndexType>                                       \
diff --git a/cuda/multigrid/amgx_pgm_kernels.cu b/cuda/multigrid/amgx_pgm_kernels.cu
index 834f8d17f31..03f657573a7 100644
--- a/cuda/multigrid/amgx_pgm_kernels.cu
+++ b/cuda/multigrid/amgx_pgm_kernels.cu
@@ -111,8 +111,6 @@ void renumber(std::shared_ptr<const CudaExecutor> exec, Array<IndexType> &agg,
 {
     const auto num = agg.get_num_elems();
     Array<IndexType> agg_map(exec, num + 1);
-    components::fill_array(exec, agg_map.get_data(), agg_map.get_num_elems(),
-                           zero<IndexType>());
     const dim3 grid(ceildiv(num, default_block_size));
     kernel::fill_agg_kernel<<<grid, default_block_size>>>(
         num, agg.get_const_data(), agg_map.get_data());
@@ -177,10 +175,12 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void amgx_pgm_generate(std::shared_ptr<const CudaExecutor> exec,
                        const matrix::Csr<ValueType, IndexType> *source,
-                       const Array<IndexType> &agg,
+                       const matrix::Csr<ValueType, IndexType> *prolong_op,
+                       const matrix::Csr<ValueType, IndexType> *restrict_op,
                        matrix::Csr<ValueType, IndexType> *coarse,
                        matrix::Csr<ValueType, IndexType> *temp)
 {
+    const auto agg_const_val = prolong_op->get_const_col_idxs();
     const auto source_nrows = source->get_size()[0];
     const auto source_nnz = source->get_num_stored_elements();
     const auto coarse_nrows = coarse->get_size()[0];
@@ -192,13 +192,13 @@ void amgx_pgm_generate(std::shared_ptr<const CudaExecutor> exec,
     dim3 grid(ceildiv(source_nrows, default_block_size));
     // agg source_row (for row size) coarse row source map
     kernel::get_source_row_map_kernel<<<grid, default_block_size>>>(
-        source_nrows, agg.get_const_data(), source->get_const_row_ptrs(),
+        source_nrows, agg_const_val, source->get_const_row_ptrs(),
         temp->get_row_ptrs(), row_map.get_data());
     // prefix sum of temp_row_ptrs
     components::prefix_sum(exec, temp->get_row_ptrs(), coarse_nrows + 1);
     // copy source -> to coarse and change column index
     kernel::move_row_kernel<<<grid, default_block_size>>>(
-        source_nrows, agg.get_const_data(), row_map.get_const_data(),
+        source_nrows, agg_const_val, row_map.get_const_data(),
         source->get_const_row_ptrs(), source->get_const_col_idxs(),
         as_cuda_type(source->get_const_values()), temp->get_const_row_ptrs(),
         temp->get_col_idxs(), as_cuda_type(temp->get_values()));
diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
index 6de292a8a7f..b92934ce089 100644
--- a/cuda/test/multigrid/amgx_pgm_kernels.cpp
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -98,13 +98,35 @@ class AmgxPgm : public ::testing::Test {
             ref);
     }
 
+    gko::Array<index_type> gen_agg_array(gko::size_type num,
+                                         gko::size_type num_agg)
+    {
+        auto agg_array = gen_array(num, 0, num_agg - 1);
+        auto agg_array_val = agg_array.get_data();
+        std::vector<index_type> select_agg(num);
+        std::iota(select_agg.begin(), select_agg.end(), 0);
+        // the first num_agg shuffled entries become the group identifiers.
+        std::shuffle(select_agg.begin(), select_agg.end(), rand_engine);
+        // replace each entry (a group position) by that group's identifier
+        for (gko::size_type i = 0; i < num; i++) {
+            agg_array_val[i] = select_agg[agg_array_val[i]];
+        }
+        // each group must contain the element whose index is its identifier,
+        // i.e. agg_val[i] == i has to hold for every group identifier i
+        for (gko::size_type i = 0; i < num_agg; i++) {
+            auto agg_idx = select_agg[i];
+            agg_array_val[agg_idx] = agg_idx;
+        }
+        return agg_array;
+    }
+
     void initialize_data()
     {
-        int m = 597;
+        m = 597;
         n = 300;
         int nrhs = 3;
 
-        agg = gen_array(m, 0, n - 1);
+        agg = gen_agg_array(m, n);
         unfinished_agg = gen_array(m, -1, n - 1);
         strongest_neighbor = gen_array(m, 0, n - 1);
         coarse_vector = gen_mtx(n, nrhs);
@@ -171,6 +193,7 @@ class AmgxPgm : public ::testing::Test {
     std::shared_ptr<Csr> d_system_mtx;
 
     gko::size_type n;
+    gko::size_type m;
 };
 
 
@@ -193,8 +216,10 @@ TEST_F(AmgxPgm, CountUnaggIsEquivalentToRef)
     index_type num_unagg;
     index_type d_num_unagg;
 
-    gko::kernels::reference::amgx_pgm::count_unagg(ref, agg, &num_unagg);
-    gko::kernels::cuda::amgx_pgm::count_unagg(cuda, d_agg, &d_num_unagg);
+    gko::kernels::reference::amgx_pgm::count_unagg(ref, unfinished_agg,
+                                                   &num_unagg);
+    gko::kernels::cuda::amgx_pgm::count_unagg(cuda, d_unfinished_agg,
+                                              &d_num_unagg);
 
     ASSERT_EQ(d_num_unagg, num_unagg);
 }
@@ -203,8 +228,6 @@ TEST_F(AmgxPgm, CountUnaggIsEquivalentToRef)
 TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
 {
     initialize_data();
-    auto x = unfinished_agg;
-    auto d_x = d_unfinished_agg;
     index_type num_agg;
     index_type d_num_agg;
 
@@ -213,7 +236,7 @@ TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
 
     ASSERT_EQ(d_num_agg, num_agg);
     GKO_ASSERT_ARRAY_EQ(d_agg, agg);
-    ASSERT_LE(num_agg, n);
+    ASSERT_EQ(num_agg, n);
 }
 
 
@@ -271,15 +294,34 @@ TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
     auto csr_coarse = Csr::create(ref, gko::dim<2>{n, n}, 0);
     auto d_csr_coarse = Csr::create(cuda, gko::dim<2>{n, n}, 0);
     auto csr_temp = Csr::create(ref, gko::dim<2>{n, n},
-                                weight_csr->get_num_stored_elements());
+                                system_mtx->get_num_stored_elements());
     auto d_csr_temp = Csr::create(cuda, gko::dim<2>{n, n},
-                                  d_weight_csr->get_num_stored_elements());
+                                  d_system_mtx->get_num_stored_elements());
+    index_type num_agg;
+    // renumber again
+    gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
+    auto prolong_op = Csr::create(ref, gko::dim<2>{m, n}, m);
+    for (int i = 0; i < m; i++) {
+        prolong_op->get_col_idxs()[i] = agg.get_const_data()[i];
+    }
+    std::iota(prolong_op->get_row_ptrs(), prolong_op->get_row_ptrs() + m + 1,
+              0);
+    std::fill_n(prolong_op->get_values(), m, gko::one<value_type>());
+    auto restrict_op = gko::as<Csr>(prolong_op->transpose());
+    auto d_prolong_op = Csr::create(cuda);
+    auto d_restrict_op = Csr::create(cuda);
+    d_prolong_op->copy_from(prolong_op.get());
+    d_restrict_op->copy_from(restrict_op.get());
 
     gko::kernels::cuda::amgx_pgm::amgx_pgm_generate(
-        cuda, d_weight_csr.get(), d_agg, d_csr_coarse.get(), d_csr_temp.get());
+        cuda, d_system_mtx.get(), d_prolong_op.get(), d_restrict_op.get(),
+        d_csr_coarse.get(), d_csr_temp.get());
     gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
-        ref, weight_csr.get(), agg, csr_coarse.get(), csr_temp.get());
+        ref, system_mtx.get(), prolong_op.get(), restrict_op.get(),
+        csr_coarse.get(), csr_temp.get());
 
+    // it should be checked already in renumber
+    GKO_ASSERT_EQ(num_agg, n);
     GKO_ASSERT_MTX_NEAR(d_csr_coarse, csr_coarse, 1e-14);
 }
 
diff --git a/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp b/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp
index 32817e5e834..283127ce6d5 100644
--- a/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp
+++ b/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp
@@ -101,11 +101,13 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void amgx_pgm_generate(
-    std::shared_ptr<const DpcppExecutor> exec,
-    const matrix::Csr<ValueType, IndexType> *source,
-    const Array<IndexType> &agg, matrix::Csr<ValueType, IndexType> *coarse,
-    matrix::Csr<ValueType, IndexType> *temp) GKO_NOT_IMPLEMENTED;
+void amgx_pgm_generate(std::shared_ptr<const DpcppExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType> *source,
+                       const matrix::Csr<ValueType, IndexType> *prolong_op,
+                       const matrix::Csr<ValueType, IndexType> *restrict_op,
+                       matrix::Csr<ValueType, IndexType> *coarse,
+                       matrix::Csr<ValueType, IndexType> *temp)
+    GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
 
diff --git a/hip/multigrid/amgx_pgm_kernels.hip.cpp b/hip/multigrid/amgx_pgm_kernels.hip.cpp
index 57498bca7df..e2ed35d42e9 100644
--- a/hip/multigrid/amgx_pgm_kernels.hip.cpp
+++ b/hip/multigrid/amgx_pgm_kernels.hip.cpp
@@ -113,8 +113,6 @@ void renumber(std::shared_ptr<const HipExecutor> exec, Array<IndexType> &agg,
 {
     const auto num = agg.get_num_elems();
     Array<IndexType> agg_map(exec, num + 1);
-    components::fill_array(exec, agg_map.get_data(), agg_map.get_num_elems(),
-                           zero<IndexType>());
     const dim3 grid(ceildiv(num, default_block_size));
     hipLaunchKernelGGL(kernel::fill_agg_kernel, dim3(grid),
                        dim3(default_block_size), 0, 0, num,
@@ -188,10 +186,12 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void amgx_pgm_generate(std::shared_ptr<const HipExecutor> exec,
                        const matrix::Csr<ValueType, IndexType> *source,
-                       const Array<IndexType> &agg,
+                       const matrix::Csr<ValueType, IndexType> *prolong_op,
+                       const matrix::Csr<ValueType, IndexType> *restrict_op,
                        matrix::Csr<ValueType, IndexType> *coarse,
                        matrix::Csr<ValueType, IndexType> *temp)
 {
+    const auto agg_const_val = prolong_op->get_const_col_idxs();
     const auto source_nrows = source->get_size()[0];
     const auto source_nnz = source->get_num_stored_elements();
     const auto coarse_nrows = coarse->get_size()[0];
@@ -204,14 +204,14 @@ void amgx_pgm_generate(std::shared_ptr<const HipExecutor> exec,
     // agg source_row (for row size) coarse row source map
     hipLaunchKernelGGL(kernel::get_source_row_map_kernel, dim3(grid),
                        dim3(default_block_size), 0, 0, source_nrows,
-                       agg.get_const_data(), source->get_const_row_ptrs(),
+                       agg_const_val, source->get_const_row_ptrs(),
                        temp->get_row_ptrs(), row_map.get_data());
     // prefix sum of temp_row_ptrs
     components::prefix_sum(exec, temp->get_row_ptrs(), coarse_nrows + 1);
     // copy source -> to coarse and change column index
     hipLaunchKernelGGL(
         kernel::move_row_kernel, dim3(grid), dim3(default_block_size), 0, 0,
-        source_nrows, agg.get_const_data(), row_map.get_const_data(),
+        source_nrows, agg_const_val, row_map.get_const_data(),
         source->get_const_row_ptrs(), source->get_const_col_idxs(),
         as_hip_type(source->get_const_values()), temp->get_const_row_ptrs(),
         temp->get_col_idxs(), as_hip_type(temp->get_values()));
diff --git a/hip/test/multigrid/amgx_pgm_kernels.cpp b/hip/test/multigrid/amgx_pgm_kernels.cpp
index dd1ae6032c0..73909e522aa 100644
--- a/hip/test/multigrid/amgx_pgm_kernels.cpp
+++ b/hip/test/multigrid/amgx_pgm_kernels.cpp
@@ -97,13 +97,35 @@ class AmgxPgm : public ::testing::Test {
             ref);
     }
 
+    gko::Array<index_type> gen_agg_array(gko::size_type num,
+                                         gko::size_type num_agg)
+    {
+        auto agg_array = gen_array(num, 0, num_agg - 1);
+        auto agg_array_val = agg_array.get_data();
+        std::vector<index_type> select_agg(num);
+        std::iota(select_agg.begin(), select_agg.end(), 0);
+        // the first num_agg shuffled entries become the group identifiers.
+        std::shuffle(select_agg.begin(), select_agg.end(), rand_engine);
+        // replace each entry (a group position) by that group's identifier
+        for (gko::size_type i = 0; i < num; i++) {
+            agg_array_val[i] = select_agg[agg_array_val[i]];
+        }
+        // each group must contain the element whose index is its identifier,
+        // i.e. agg_val[i] == i has to hold for every group identifier i
+        for (gko::size_type i = 0; i < num_agg; i++) {
+            auto agg_idx = select_agg[i];
+            agg_array_val[agg_idx] = agg_idx;
+        }
+        return agg_array;
+    }
+
     void initialize_data()
     {
-        int m = 597;
+        m = 597;
         n = 300;
         int nrhs = 3;
 
-        agg = gen_array(m, 0, n - 1);
+        agg = gen_agg_array(m, n);
         unfinished_agg = gen_array(m, -1, n - 1);
         strongest_neighbor = gen_array(m, 0, n - 1);
         coarse_vector = gen_mtx(n, nrhs);
@@ -170,6 +192,7 @@ class AmgxPgm : public ::testing::Test {
     std::shared_ptr<Csr> d_system_mtx;
 
     gko::size_type n;
+    gko::size_type m;
 };
 
 
@@ -192,8 +215,10 @@ TEST_F(AmgxPgm, CountUnaggIsEquivalentToRef)
     index_type num_unagg;
     index_type d_num_unagg;
 
-    gko::kernels::reference::amgx_pgm::count_unagg(ref, agg, &num_unagg);
-    gko::kernels::hip::amgx_pgm::count_unagg(hip, d_agg, &d_num_unagg);
+    gko::kernels::reference::amgx_pgm::count_unagg(ref, unfinished_agg,
+                                                   &num_unagg);
+    gko::kernels::hip::amgx_pgm::count_unagg(hip, d_unfinished_agg,
+                                             &d_num_unagg);
 
     ASSERT_EQ(d_num_unagg, num_unagg);
 }
@@ -202,8 +227,6 @@ TEST_F(AmgxPgm, CountUnaggIsEquivalentToRef)
 TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
 {
     initialize_data();
-    auto x = unfinished_agg;
-    auto d_x = d_unfinished_agg;
     index_type num_agg;
     index_type d_num_agg;
 
@@ -212,7 +235,7 @@ TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
 
     ASSERT_EQ(d_num_agg, num_agg);
     GKO_ASSERT_ARRAY_EQ(d_agg, agg);
-    ASSERT_LE(num_agg, n);
+    ASSERT_EQ(num_agg, n);
 }
 
 
@@ -270,15 +293,34 @@ TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
     auto csr_coarse = Csr::create(ref, gko::dim<2>{n, n}, 0);
     auto d_csr_coarse = Csr::create(hip, gko::dim<2>{n, n}, 0);
     auto csr_temp = Csr::create(ref, gko::dim<2>{n, n},
-                                weight_csr->get_num_stored_elements());
+                                system_mtx->get_num_stored_elements());
     auto d_csr_temp = Csr::create(hip, gko::dim<2>{n, n},
-                                  d_weight_csr->get_num_stored_elements());
+                                  d_system_mtx->get_num_stored_elements());
+    index_type num_agg;
+    // renumber again
+    gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
+    auto prolong_op = Csr::create(ref, gko::dim<2>{m, n}, m);
+    for (int i = 0; i < m; i++) {
+        prolong_op->get_col_idxs()[i] = agg.get_const_data()[i];
+    }
+    std::iota(prolong_op->get_row_ptrs(), prolong_op->get_row_ptrs() + m + 1,
+              0);
+    std::fill_n(prolong_op->get_values(), m, gko::one<value_type>());
+    auto restrict_op = gko::as<Csr>(prolong_op->transpose());
+    auto d_prolong_op = Csr::create(hip);
+    auto d_restrict_op = Csr::create(hip);
+    d_prolong_op->copy_from(prolong_op.get());
+    d_restrict_op->copy_from(restrict_op.get());
 
     gko::kernels::hip::amgx_pgm::amgx_pgm_generate(
-        hip, d_weight_csr.get(), d_agg, d_csr_coarse.get(), d_csr_temp.get());
+        hip, d_system_mtx.get(), d_prolong_op.get(), d_restrict_op.get(),
+        d_csr_coarse.get(), d_csr_temp.get());
     gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
-        ref, weight_csr.get(), agg, csr_coarse.get(), csr_temp.get());
+        ref, system_mtx.get(), prolong_op.get(), restrict_op.get(),
+        csr_coarse.get(), csr_temp.get());
 
+    // it should be checked already in renumber
+    GKO_ASSERT_EQ(num_agg, n);
     GKO_ASSERT_MTX_NEAR(d_csr_coarse, csr_coarse, 1e-14);
 }
 
diff --git a/omp/multigrid/amgx_pgm_kernels.cpp b/omp/multigrid/amgx_pgm_kernels.cpp
index 408478242dc..dc6329d982c 100644
--- a/omp/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/multigrid/amgx_pgm_kernels.cpp
@@ -113,10 +113,11 @@ void renumber(std::shared_ptr<const OmpExecutor> exec, Array<IndexType> &agg,
     Array<IndexType> agg_map(exec, num + 1);
     auto agg_vals = agg.get_data();
     auto agg_map_vals = agg_map.get_data();
-    components::fill_array(exec, agg_map_vals, num + 1, zero<IndexType>());
+    // agg_vals[i] == i holds exactly for the element that identifies a group,
+    // because a group's identifier is the index of one of its own elements.
 #pragma omp parallel for
     for (size_type i = 0; i < num; i++) {
-        agg_map_vals[agg_vals[i]] = 1;
+        agg_map_vals[i] = (agg_vals[i] == i);
     }
     components::prefix_sum(exec, agg_map_vals, num + 1);
 #pragma omp parallel for
@@ -241,11 +242,13 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void amgx_pgm_generate(std::shared_ptr<const OmpExecutor> exec,
                        const matrix::Csr<ValueType, IndexType> *source,
-                       const Array<IndexType> &agg,
+                       const matrix::Csr<ValueType, IndexType> *prolong_op,
+                       const matrix::Csr<ValueType, IndexType> *restrict_op,
                        matrix::Csr<ValueType, IndexType> *coarse,
                        matrix::Csr<ValueType, IndexType> *temp)
 {
     // agg[i] -> I, agg[j] -> J
+    const auto agg_const_val = prolong_op->get_const_col_idxs();
     const auto coarse_nrows = coarse->get_size()[0];
     const auto source_nrows = source->get_size()[0];
     const auto source_row_ptrs = source->get_const_row_ptrs();
@@ -254,9 +257,9 @@ void amgx_pgm_generate(std::shared_ptr<const OmpExecutor> exec,
     vector<map<IndexType, ValueType>> row_list(
         source_nrows, map<IndexType, ValueType>{exec}, exec);
     for (size_type i = 0; i < source_nrows; i++) {
-        IndexType row_idx = agg.get_const_data()[i];
+        IndexType row_idx = agg_const_val[i];
         for (auto j = source_row_ptrs[i]; j < source_row_ptrs[i + 1]; j++) {
-            const auto col = agg.get_const_data()[source_col_idxs[j]];
+            const auto col = agg_const_val[source_col_idxs[j]];
             const auto val = source_vals[j];
             row_list[row_idx][col] += val;
         }
diff --git a/omp/test/multigrid/amgx_pgm_kernels.cpp b/omp/test/multigrid/amgx_pgm_kernels.cpp
index bd20b3749b1..91dc2c2fce8 100644
--- a/omp/test/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/test/multigrid/amgx_pgm_kernels.cpp
@@ -88,13 +88,35 @@ class AmgxPgm : public ::testing::Test {
             ref);
     }
 
+    gko::Array<index_type> gen_agg_array(gko::size_type num,
+                                         gko::size_type num_agg)
+    {
+        auto agg_array = gen_array(num, 0, num_agg - 1);
+        auto agg_array_val = agg_array.get_data();
+        std::vector<index_type> select_agg(num);
+        std::iota(select_agg.begin(), select_agg.end(), 0);
+        // the first num_agg shuffled entries become the group identifiers.
+        std::shuffle(select_agg.begin(), select_agg.end(), rand_engine);
+        // replace each entry (a group position) by that group's identifier
+        for (gko::size_type i = 0; i < num; i++) {
+            agg_array_val[i] = select_agg[agg_array_val[i]];
+        }
+        // each group must contain the element whose index is its identifier,
+        // i.e. agg_val[i] == i has to hold for every group identifier i
+        for (gko::size_type i = 0; i < num_agg; i++) {
+            auto agg_idx = select_agg[i];
+            agg_array_val[agg_idx] = agg_idx;
+        }
+        return agg_array;
+    }
+
     void initialize_data()
     {
-        int m = 597;
+        m = 597;
         n = 300;
         int nrhs = 3;
 
-        agg = gen_array(m, 0, n - 1);
+        agg = gen_agg_array(m, n);
         unfinished_agg = gen_array(m, -1, n - 1);
         strongest_neighbor = gen_array(m, 0, n - 1);
         coarse_vector = gen_mtx(n, nrhs);
@@ -161,6 +183,7 @@ class AmgxPgm : public ::testing::Test {
     std::shared_ptr<Csr> d_system_mtx;
 
     gko::size_type n;
+    gko::size_type m;
 };
 
 
@@ -183,8 +206,10 @@ TEST_F(AmgxPgm, CountUnaggIsEquivalentToRef)
     index_type num_unagg;
     index_type d_num_unagg;
 
-    gko::kernels::reference::amgx_pgm::count_unagg(ref, agg, &num_unagg);
-    gko::kernels::omp::amgx_pgm::count_unagg(omp, d_agg, &d_num_unagg);
+    gko::kernels::reference::amgx_pgm::count_unagg(ref, unfinished_agg,
+                                                   &num_unagg);
+    gko::kernels::omp::amgx_pgm::count_unagg(omp, d_unfinished_agg,
+                                             &d_num_unagg);
 
     ASSERT_EQ(d_num_unagg, num_unagg);
 }
@@ -193,8 +218,6 @@ TEST_F(AmgxPgm, CountUnaggIsEquivalentToRef)
 TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
 {
     initialize_data();
-    auto x = unfinished_agg;
-    auto d_x = d_unfinished_agg;
     index_type num_agg;
     index_type d_num_agg;
 
@@ -203,7 +226,7 @@ TEST_F(AmgxPgm, RenumberIsEquivalentToRef)
 
     ASSERT_EQ(d_num_agg, num_agg);
     GKO_ASSERT_ARRAY_EQ(d_agg, agg);
-    ASSERT_LE(num_agg, n);
+    ASSERT_EQ(num_agg, n);
 }
 
 
@@ -261,15 +284,34 @@ TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
     auto csr_coarse = Csr::create(ref, gko::dim<2>{n, n}, 0);
     auto d_csr_coarse = Csr::create(omp, gko::dim<2>{n, n}, 0);
     auto csr_temp = Csr::create(ref, gko::dim<2>{n, n},
-                                weight_csr->get_num_stored_elements());
+                                system_mtx->get_num_stored_elements());
     auto d_csr_temp = Csr::create(omp, gko::dim<2>{n, n},
-                                  d_weight_csr->get_num_stored_elements());
+                                  d_system_mtx->get_num_stored_elements());
+    index_type num_agg;
+    // renumber again
+    gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
+    auto prolong_op = Csr::create(ref, gko::dim<2>{m, n}, m);
+    for (int i = 0; i < m; i++) {
+        prolong_op->get_col_idxs()[i] = agg.get_const_data()[i];
+    }
+    std::iota(prolong_op->get_row_ptrs(), prolong_op->get_row_ptrs() + m + 1,
+              0);
+    std::fill_n(prolong_op->get_values(), m, gko::one<value_type>());
+    auto restrict_op = gko::as<Csr>(prolong_op->transpose());
+    auto d_prolong_op = Csr::create(omp);
+    auto d_restrict_op = Csr::create(omp);
+    d_prolong_op->copy_from(prolong_op.get());
+    d_restrict_op->copy_from(restrict_op.get());
 
     gko::kernels::omp::amgx_pgm::amgx_pgm_generate(
-        omp, d_weight_csr.get(), d_agg, d_csr_coarse.get(), d_csr_temp.get());
+        omp, d_system_mtx.get(), d_prolong_op.get(), d_restrict_op.get(),
+        d_csr_coarse.get(), d_csr_temp.get());
     gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
-        ref, weight_csr.get(), agg, csr_coarse.get(), csr_temp.get());
+        ref, system_mtx.get(), prolong_op.get(), restrict_op.get(),
+        csr_coarse.get(), csr_temp.get());
 
+    // it should be checked already in renumber
+    GKO_ASSERT_EQ(num_agg, n);
     GKO_ASSERT_MTX_NEAR(d_csr_coarse, csr_coarse, 1e-14);
 }
 
diff --git a/reference/multigrid/amgx_pgm_kernels.cpp b/reference/multigrid/amgx_pgm_kernels.cpp
index 16002259fcf..7d03fdff9f0 100644
--- a/reference/multigrid/amgx_pgm_kernels.cpp
+++ b/reference/multigrid/amgx_pgm_kernels.cpp
@@ -234,11 +234,13 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void amgx_pgm_generate(std::shared_ptr<const ReferenceExecutor> exec,
                        const matrix::Csr<ValueType, IndexType> *source,
-                       const Array<IndexType> &agg,
+                       const matrix::Csr<ValueType, IndexType> *prolong_op,
+                       const matrix::Csr<ValueType, IndexType> *restrict_op,
                        matrix::Csr<ValueType, IndexType> *coarse,
                        matrix::Csr<ValueType, IndexType> *temp)
 {
     // agg[i] -> I, agg[j] -> J
+    const auto agg_const_val = prolong_op->get_const_col_idxs();
     const auto coarse_nrows = coarse->get_size()[0];
     const auto source_nrows = source->get_size()[0];
     const auto source_row_ptrs = source->get_const_row_ptrs();
@@ -247,9 +249,9 @@ void amgx_pgm_generate(std::shared_ptr<const ReferenceExecutor> exec,
     gko::vector<gko::map<IndexType, ValueType>> row_list(
         source_nrows, gko::map<IndexType, ValueType>{exec}, exec);
     for (size_type i = 0; i < source_nrows; i++) {
-        IndexType row_idx = agg.get_const_data()[i];
+        IndexType row_idx = agg_const_val[i];
         for (auto j = source_row_ptrs[i]; j < source_row_ptrs[i + 1]; j++) {
-            const auto col = agg.get_const_data()[source_col_idxs[j]];
+            const auto col = agg_const_val[source_col_idxs[j]];
             const auto val = source_vals[j];
             row_list[row_idx][col] += val;
         }
diff --git a/reference/test/multigrid/amgx_pgm_kernels.cpp b/reference/test/multigrid/amgx_pgm_kernels.cpp
index 09918310823..2cff6c8537b 100644
--- a/reference/test/multigrid/amgx_pgm_kernels.cpp
+++ b/reference/test/multigrid/amgx_pgm_kernels.cpp
@@ -508,11 +508,10 @@ TYPED_TEST(AmgxPgm, GenerateMtx)
     gko::Array<index_type> agg(this->exec, 5);
     auto agg_vals = agg.get_data();
     // 0 - 2, 1 - 3, 4
-    agg_vals[0] = 0;
-    agg_vals[1] = 1;
-    agg_vals[2] = 0;
-    agg_vals[3] = 1;
-    agg_vals[4] = 2;
+    auto prolong_op = mtx_type::create(this->exec, gko::dim<2>{5, 3}, 0);
+    prolong_op->read(
+        {{5, 3}, {{0, 0, 1}, {1, 1, 1}, {2, 0, 1}, {3, 1, 1}, {4, 2, 1}}});
+    auto restrict_op = gko::as<mtx_type>(prolong_op->transpose());
     auto coarse_ans = mtx_type::create(this->exec, gko::dim<2>{3, 3}, 0);
     coarse_ans->read({{3, 3},
                       {{0, 0, 4},
@@ -528,7 +527,8 @@ TYPED_TEST(AmgxPgm, GenerateMtx)
     auto empty = gko::matrix::Csr<value_type, index_type>::create(this->exec);
 
     gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
-        this->exec, this->mtx.get(), agg, csr_coarse.get(), empty.get());
+        this->exec, this->mtx.get(), prolong_op.get(), restrict_op.get(),
+        csr_coarse.get(), empty.get());
 
     GKO_ASSERT_MTX_NEAR(csr_coarse, coarse_ans, r<value_type>::value);
 }

From 8b5901bd64bf1417f2b1bc4191fa3c93d1f4ed33 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 30 Apr 2021 20:56:15 +0800
Subject: [PATCH 16/16] use two csr multiplications to generate coarse matrix

The memory footprint of this approach still needs to be reduced.
---
 common/multigrid/amgx_pgm_kernels.hpp.inc     | 102 ------------------
 core/device_hooks/common_kernels.inc.cpp      |   5 -
 core/multigrid/amgx_pgm.cpp                   |  36 ++-----
 core/multigrid/amgx_pgm_kernels.hpp           |  13 +--
 cuda/multigrid/amgx_pgm_kernels.cu            |  59 ----------
 cuda/test/multigrid/amgx_pgm_kernels.cpp      |  38 -------
 dpcpp/multigrid/amgx_pgm_kernels.dp.cpp       |  12 ---
 hip/multigrid/amgx_pgm_kernels.hip.cpp        |  63 -----------
 hip/test/multigrid/amgx_pgm_kernels.cpp       |  38 -------
 omp/multigrid/amgx_pgm_kernels.cpp            |  54 ----------
 omp/test/multigrid/amgx_pgm_kernels.cpp       |  38 -------
 reference/multigrid/amgx_pgm_kernels.cpp      |  53 ---------
 reference/test/multigrid/amgx_pgm_kernels.cpp |  43 +++-----
 13 files changed, 24 insertions(+), 530 deletions(-)

diff --git a/common/multigrid/amgx_pgm_kernels.hpp.inc b/common/multigrid/amgx_pgm_kernels.hpp.inc
index 3ec6bf81a7b..540478ba427 100644
--- a/common/multigrid/amgx_pgm_kernels.hpp.inc
+++ b/common/multigrid/amgx_pgm_kernels.hpp.inc
@@ -225,106 +225,4 @@ __global__
 }
 
 
-template <typename IndexType>
-__global__ __launch_bounds__(default_block_size) void get_source_row_map_kernel(
-    const size_type source_nrows, const IndexType *__restrict__ agg_val,
-    const IndexType *__restrict__ source_row_ptrs,
-    IndexType *__restrict__ result_row_ptrs, IndexType *__restrict__ row_map)
-{
-    auto row = thread::get_thread_id_flat();
-    if (row >= source_nrows) {
-        return;
-    }
-    const auto num_elems = source_row_ptrs[row + 1] - source_row_ptrs[row];
-    const auto result_idx = agg_val[row];
-    // atomic_add returns the old value, so it can be the starting point.
-    row_map[row] = atomic_add(result_row_ptrs + result_idx, num_elems);
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void move_row_kernel(
-    const size_type source_nrows, const IndexType *__restrict__ agg_val,
-    const IndexType *__restrict__ row_map,
-    const IndexType *__restrict__ source_row_ptrs,
-    const IndexType *__restrict__ source_col_idxs,
-    const ValueType *__restrict__ source_values,
-    const IndexType *__restrict__ result_row_ptrs,
-    IndexType *__restrict__ result_col_idxs,
-    ValueType *__restrict__ result_values)
-{
-    auto row = thread::get_thread_id_flat();
-    if (row >= source_nrows) {
-        return;
-    }
-    auto result_i = result_row_ptrs[agg_val[row]] + row_map[row];
-    for (auto i = source_row_ptrs[row]; i < source_row_ptrs[row + 1];
-         i++, result_i++) {
-        result_col_idxs[result_i] = agg_val[source_col_idxs[i]];
-        result_values[result_i] = source_values[i];
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void merge_col_kernel(
-    const size_type nrows, const IndexType *__restrict__ temp_row_ptrs,
-    IndexType *__restrict__ temp_col_idxs, ValueType *__restrict__ temp_values,
-    IndexType *__restrict__ coarse_row_ptrs)
-{
-    auto row = thread::get_thread_id_flat();
-    if (row >= nrows) {
-        return;
-    }
-
-    IndexType num_elems = zero<IndexType>();
-    const auto start = temp_row_ptrs[row];
-    const auto end = temp_row_ptrs[row + 1];
-    IndexType col = temp_col_idxs[start];
-    ValueType value = temp_values[start];
-    for (auto i = start + 1; i < end; i++) {
-        const auto current_col = temp_col_idxs[i];
-        if (current_col != col) {
-            // apply to the original data. It is sorted, so the writing position
-            // is before read position
-            temp_col_idxs[start + num_elems] = col;
-            temp_values[start + num_elems] = value;
-            value = zero<ValueType>();
-            col = current_col;
-            num_elems++;
-        }
-        value += temp_values[i];
-    }
-    // If start != end, need to process the final column
-    if (start != end) {
-        temp_col_idxs[start + num_elems] = col;
-        temp_values[start + num_elems] = value;
-        num_elems++;
-    }
-    coarse_row_ptrs[row] = num_elems;
-}
-
-
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void copy_to_coarse_kernel(
-    const size_type nrows, const IndexType *__restrict__ temp_row_ptrs,
-    const IndexType *__restrict__ temp_col_idxs,
-    const ValueType *__restrict__ temp_values,
-    const IndexType *__restrict__ coarse_row_ptrs,
-    IndexType *__restrict__ coarse_col_idxs,
-    ValueType *__restrict__ coarse_values)
-{
-    auto row = thread::get_thread_id_flat();
-    if (row >= nrows) {
-        return;
-    }
-    auto temp_i = temp_row_ptrs[row];
-    for (auto i = coarse_row_ptrs[row]; i < coarse_row_ptrs[row + 1]; i++) {
-        coarse_col_idxs[i] = temp_col_idxs[temp_i];
-        coarse_values[i] = temp_values[temp_i];
-        temp_i++;
-    }
-}
-
-
 }  // namespace kernel
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index cbf1e2d9737..877051c3e04 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -1328,11 +1328,6 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);
 
-template <typename ValueType, typename IndexType>
-GKO_DECLARE_AMGX_PGM_GENERATE(ValueType, IndexType)
-GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
-
 
 }  // namespace amgx_pgm
 
diff --git a/core/multigrid/amgx_pgm.cpp b/core/multigrid/amgx_pgm.cpp
index d9fac590691..83421932016 100644
--- a/core/multigrid/amgx_pgm.cpp
+++ b/core/multigrid/amgx_pgm.cpp
@@ -60,7 +60,6 @@ GKO_REGISTER_OPERATION(renumber, amgx_pgm::renumber);
 GKO_REGISTER_OPERATION(find_strongest_neighbor,
                        amgx_pgm::find_strongest_neighbor);
 GKO_REGISTER_OPERATION(assign_to_exist_agg, amgx_pgm::assign_to_exist_agg);
-GKO_REGISTER_OPERATION(amgx_pgm_generate, amgx_pgm::amgx_pgm_generate);
 GKO_REGISTER_OPERATION(fill_array, components::fill_array);
 GKO_REGISTER_OPERATION(fill_seq_array, components::fill_seq_array);
 
@@ -68,30 +67,6 @@ GKO_REGISTER_OPERATION(fill_seq_array, components::fill_seq_array);
 }  // namespace amgx_pgm
 
 
-namespace {
-
-
-template <typename ValueType, typename IndexType>
-std::unique_ptr<LinOp> amgx_pgm_generate(
-    std::shared_ptr<const Executor> exec,
-    const matrix::Csr<ValueType, IndexType> *source,
-    const matrix::Csr<ValueType, IndexType> *prolong_op,
-    const matrix::Csr<ValueType, IndexType> *restrict_op)
-{
-    auto num_agg = prolong_op->get_size()[1];
-    auto coarse = matrix::Csr<ValueType, IndexType>::create(
-        exec, dim<2>{num_agg, num_agg}, 0, source->get_strategy());
-    auto temp = matrix::Csr<ValueType, IndexType>::create(
-        exec, dim<2>{num_agg, num_agg}, source->get_num_stored_elements());
-    exec->run(amgx_pgm::make_amgx_pgm_generate(source, prolong_op, restrict_op,
-                                               coarse.get(), temp.get()));
-    return std::move(coarse);
-}
-
-
-}  // namespace
-
-
 template <typename ValueType, typename IndexType>
 void AmgxPgm<ValueType, IndexType>::generate()
 {
@@ -160,7 +135,7 @@ void AmgxPgm<ValueType, IndexType>::generate()
     // Renumber the index
     exec->run(amgx_pgm::make_renumber(agg_, &num_agg));
 
-    auto coarse_dim = num_agg;
+    gko::dim<2>::dimension_type coarse_dim = num_agg;
     auto fine_dim = system_matrix_->get_size()[0];
     // TODO: prolong_op can be done with lightway format
     auto prolong_op = share(
@@ -175,8 +150,13 @@ void AmgxPgm<ValueType, IndexType>::generate()
     auto restrict_op = gko::as<matrix_type>(share(prolong_op->transpose()));
 
     // Construct the coarse matrix
-    auto coarse_matrix = share(amgx_pgm_generate(
-        exec, amgxpgm_op, prolong_op.get(), restrict_op.get()));
+    // TODO: reduce the memory footprint of the coarse matrix construction
+    auto coarse_matrix =
+        share(matrix_type::create(exec, gko::dim<2>{coarse_dim, coarse_dim}));
+    auto tmp = matrix_type::create(exec, gko::dim<2>{coarse_dim, fine_dim});
+    restrict_op->apply(amgxpgm_op, tmp.get());
+    tmp->apply(prolong_op.get(), coarse_matrix.get());
+
     this->set_multigrid_level(prolong_op, coarse_matrix, restrict_op);
 }
 
diff --git a/core/multigrid/amgx_pgm_kernels.hpp b/core/multigrid/amgx_pgm_kernels.hpp
index 5bbb48b9594..793780ae505 100644
--- a/core/multigrid/amgx_pgm_kernels.hpp
+++ b/core/multigrid/amgx_pgm_kernels.hpp
@@ -78,15 +78,6 @@ namespace amgx_pgm {
         const matrix::Diagonal<ValueType> *diag, Array<IndexType> &agg, \
         Array<IndexType> &intermediate_agg)
 
-#define GKO_DECLARE_AMGX_PGM_GENERATE(ValueType, IndexType)   \
-    void amgx_pgm_generate(                                   \
-        std::shared_ptr<const DefaultExecutor> exec,          \
-        const matrix::Csr<ValueType, IndexType> *source,      \
-        const matrix::Csr<ValueType, IndexType> *prolong_op,  \
-        const matrix::Csr<ValueType, IndexType> *restrict_op, \
-        matrix::Csr<ValueType, IndexType> *coarse,            \
-        matrix::Csr<ValueType, IndexType> *temp)
-
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                    \
     template <typename IndexType>                                       \
     GKO_DECLARE_AMGX_PGM_MATCH_EDGE_KERNEL(IndexType);                  \
@@ -97,9 +88,7 @@ namespace amgx_pgm {
     template <typename ValueType, typename IndexType>                   \
     GKO_DECLARE_AMGX_PGM_FIND_STRONGEST_NEIGHBOR(ValueType, IndexType); \
     template <typename ValueType, typename IndexType>                   \
-    GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG(ValueType, IndexType);     \
-    template <typename ValueType, typename IndexType>                   \
-    GKO_DECLARE_AMGX_PGM_GENERATE(ValueType, IndexType)
+    GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG(ValueType, IndexType)
 
 
 }  // namespace amgx_pgm
diff --git a/cuda/multigrid/amgx_pgm_kernels.cu b/cuda/multigrid/amgx_pgm_kernels.cu
index 03f657573a7..15d9940b22e 100644
--- a/cuda/multigrid/amgx_pgm_kernels.cu
+++ b/cuda/multigrid/amgx_pgm_kernels.cu
@@ -172,65 +172,6 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);
 
 
-template <typename ValueType, typename IndexType>
-void amgx_pgm_generate(std::shared_ptr<const CudaExecutor> exec,
-                       const matrix::Csr<ValueType, IndexType> *source,
-                       const matrix::Csr<ValueType, IndexType> *prolong_op,
-                       const matrix::Csr<ValueType, IndexType> *restrict_op,
-                       matrix::Csr<ValueType, IndexType> *coarse,
-                       matrix::Csr<ValueType, IndexType> *temp)
-{
-    const auto agg_const_val = prolong_op->get_const_col_idxs();
-    const auto source_nrows = source->get_size()[0];
-    const auto source_nnz = source->get_num_stored_elements();
-    const auto coarse_nrows = coarse->get_size()[0];
-    Array<IndexType> row_map(exec, source_nrows);
-    // fill coarse row pointer as zero
-    components::fill_array(exec, temp->get_row_ptrs(), coarse_nrows + 1,
-                           zero<IndexType>());
-    // compute each source row should be moved and also change column index
-    dim3 grid(ceildiv(source_nrows, default_block_size));
-    // agg source_row (for row size) coarse row source map
-    kernel::get_source_row_map_kernel<<<grid, default_block_size>>>(
-        source_nrows, agg_const_val, source->get_const_row_ptrs(),
-        temp->get_row_ptrs(), row_map.get_data());
-    // prefix sum of temp_row_ptrs
-    components::prefix_sum(exec, temp->get_row_ptrs(), coarse_nrows + 1);
-    // copy source -> to coarse and change column index
-    kernel::move_row_kernel<<<grid, default_block_size>>>(
-        source_nrows, agg_const_val, row_map.get_const_data(),
-        source->get_const_row_ptrs(), source->get_const_col_idxs(),
-        as_cuda_type(source->get_const_values()), temp->get_const_row_ptrs(),
-        temp->get_col_idxs(), as_cuda_type(temp->get_values()));
-    // sort csr
-    csr::sort_by_column_index(exec, temp);
-    // summation of the elements with same position
-    grid = ceildiv(coarse_nrows, default_block_size);
-    kernel::merge_col_kernel<<<grid, default_block_size>>>(
-        coarse_nrows, temp->get_const_row_ptrs(), temp->get_col_idxs(),
-        as_cuda_type(temp->get_values()), coarse->get_row_ptrs());
-    // build the coarse matrix
-    components::prefix_sum(exec, coarse->get_row_ptrs(), coarse_nrows + 1);
-    // prefix sum of coarse->get_row_ptrs
-    const auto coarse_nnz =
-        exec->copy_val_to_host(coarse->get_row_ptrs() + coarse_nrows);
-    // reallocate size of column and values
-    matrix::CsrBuilder<ValueType, IndexType> coarse_builder{coarse};
-    auto &coarse_col_idxs_array = coarse_builder.get_col_idx_array();
-    auto &coarse_vals_array = coarse_builder.get_value_array();
-    coarse_col_idxs_array.resize_and_reset(coarse_nnz);
-    coarse_vals_array.resize_and_reset(coarse_nnz);
-    // copy the result
-    kernel::copy_to_coarse_kernel<<<grid, default_block_size>>>(
-        coarse_nrows, temp->get_const_row_ptrs(), temp->get_const_col_idxs(),
-        as_cuda_type(temp->get_const_values()), coarse->get_const_row_ptrs(),
-        coarse_col_idxs_array.get_data(),
-        as_cuda_type(coarse_vals_array.get_data()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
-
-
 }  // namespace amgx_pgm
 }  // namespace cuda
 }  // namespace kernels
diff --git a/cuda/test/multigrid/amgx_pgm_kernels.cpp b/cuda/test/multigrid/amgx_pgm_kernels.cpp
index b92934ce089..07672359615 100644
--- a/cuda/test/multigrid/amgx_pgm_kernels.cpp
+++ b/cuda/test/multigrid/amgx_pgm_kernels.cpp
@@ -288,44 +288,6 @@ TEST_F(AmgxPgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
 }
 
 
-TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
-{
-    initialize_data();
-    auto csr_coarse = Csr::create(ref, gko::dim<2>{n, n}, 0);
-    auto d_csr_coarse = Csr::create(cuda, gko::dim<2>{n, n}, 0);
-    auto csr_temp = Csr::create(ref, gko::dim<2>{n, n},
-                                system_mtx->get_num_stored_elements());
-    auto d_csr_temp = Csr::create(cuda, gko::dim<2>{n, n},
-                                  d_system_mtx->get_num_stored_elements());
-    index_type num_agg;
-    // renumber again
-    gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
-    auto prolong_op = Csr::create(ref, gko::dim<2>{m, n}, m);
-    for (int i = 0; i < m; i++) {
-        prolong_op->get_col_idxs()[i] = agg.get_const_data()[i];
-    }
-    std::iota(prolong_op->get_row_ptrs(), prolong_op->get_row_ptrs() + m + 1,
-              0);
-    std::fill_n(prolong_op->get_values(), m, gko::one<value_type>());
-    auto restrict_op = gko::as<Csr>(prolong_op->transpose());
-    auto d_prolong_op = Csr::create(cuda);
-    auto d_restrict_op = Csr::create(cuda);
-    d_prolong_op->copy_from(prolong_op.get());
-    d_restrict_op->copy_from(restrict_op.get());
-
-    gko::kernels::cuda::amgx_pgm::amgx_pgm_generate(
-        cuda, d_system_mtx.get(), d_prolong_op.get(), d_restrict_op.get(),
-        d_csr_coarse.get(), d_csr_temp.get());
-    gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
-        ref, system_mtx.get(), prolong_op.get(), restrict_op.get(),
-        csr_coarse.get(), csr_temp.get());
-
-    // it should be checked already in renumber
-    GKO_ASSERT_EQ(num_agg, n);
-    GKO_ASSERT_MTX_NEAR(d_csr_coarse, csr_coarse, 1e-14);
-}
-
-
 TEST_F(AmgxPgm, GenerateMgLevelIsEquivalentToRef)
 {
     initialize_data();
diff --git a/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp b/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp
index 283127ce6d5..14baa306bca 100644
--- a/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp
+++ b/dpcpp/multigrid/amgx_pgm_kernels.dp.cpp
@@ -100,18 +100,6 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);
 
 
-template <typename ValueType, typename IndexType>
-void amgx_pgm_generate(std::shared_ptr<const DpcppExecutor> exec,
-                       const matrix::Csr<ValueType, IndexType> *source,
-                       const matrix::Csr<ValueType, IndexType> *prolong_op,
-                       const matrix::Csr<ValueType, IndexType> *restrict_op,
-                       matrix::Csr<ValueType, IndexType> *coarse,
-                       matrix::Csr<ValueType, IndexType> *temp)
-    GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
-
-
 }  // namespace amgx_pgm
 }  // namespace dpcpp
 }  // namespace kernels
diff --git a/hip/multigrid/amgx_pgm_kernels.hip.cpp b/hip/multigrid/amgx_pgm_kernels.hip.cpp
index e2ed35d42e9..d7f4685b785 100644
--- a/hip/multigrid/amgx_pgm_kernels.hip.cpp
+++ b/hip/multigrid/amgx_pgm_kernels.hip.cpp
@@ -183,69 +183,6 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);
 
 
-template <typename ValueType, typename IndexType>
-void amgx_pgm_generate(std::shared_ptr<const HipExecutor> exec,
-                       const matrix::Csr<ValueType, IndexType> *source,
-                       const matrix::Csr<ValueType, IndexType> *prolong_op,
-                       const matrix::Csr<ValueType, IndexType> *restrict_op,
-                       matrix::Csr<ValueType, IndexType> *coarse,
-                       matrix::Csr<ValueType, IndexType> *temp)
-{
-    const auto agg_const_val = prolong_op->get_const_col_idxs();
-    const auto source_nrows = source->get_size()[0];
-    const auto source_nnz = source->get_num_stored_elements();
-    const auto coarse_nrows = coarse->get_size()[0];
-    Array<IndexType> row_map(exec, source_nrows);
-    // fill coarse row pointer as zero
-    components::fill_array(exec, temp->get_row_ptrs(), coarse_nrows + 1,
-                           zero<IndexType>());
-    // compute each source row should be moved and also change column index
-    dim3 grid(ceildiv(source_nrows, default_block_size));
-    // agg source_row (for row size) coarse row source map
-    hipLaunchKernelGGL(kernel::get_source_row_map_kernel, dim3(grid),
-                       dim3(default_block_size), 0, 0, source_nrows,
-                       agg_const_val, source->get_const_row_ptrs(),
-                       temp->get_row_ptrs(), row_map.get_data());
-    // prefix sum of temp_row_ptrs
-    components::prefix_sum(exec, temp->get_row_ptrs(), coarse_nrows + 1);
-    // copy source -> to coarse and change column index
-    hipLaunchKernelGGL(
-        kernel::move_row_kernel, dim3(grid), dim3(default_block_size), 0, 0,
-        source_nrows, agg_const_val, row_map.get_const_data(),
-        source->get_const_row_ptrs(), source->get_const_col_idxs(),
-        as_hip_type(source->get_const_values()), temp->get_const_row_ptrs(),
-        temp->get_col_idxs(), as_hip_type(temp->get_values()));
-    // sort csr
-    csr::sort_by_column_index(exec, temp);
-    // summation of the elements with same position
-    grid = ceildiv(coarse_nrows, default_block_size);
-    hipLaunchKernelGGL(kernel::merge_col_kernel, dim3(grid),
-                       dim3(default_block_size), 0, 0, coarse_nrows,
-                       temp->get_const_row_ptrs(), temp->get_col_idxs(),
-                       as_hip_type(temp->get_values()), coarse->get_row_ptrs());
-    // build the coarse matrix
-    components::prefix_sum(exec, coarse->get_row_ptrs(), coarse_nrows + 1);
-    // prefix sum of coarse->get_row_ptrs
-    const auto coarse_nnz =
-        exec->copy_val_to_host(coarse->get_row_ptrs() + coarse_nrows);
-    // reallocate size of column and values
-    matrix::CsrBuilder<ValueType, IndexType> coarse_builder{coarse};
-    auto &coarse_col_idxs_array = coarse_builder.get_col_idx_array();
-    auto &coarse_vals_array = coarse_builder.get_value_array();
-    coarse_col_idxs_array.resize_and_reset(coarse_nnz);
-    coarse_vals_array.resize_and_reset(coarse_nnz);
-    // copy the result
-    hipLaunchKernelGGL(
-        kernel::copy_to_coarse_kernel, dim3(grid), dim3(default_block_size), 0,
-        0, coarse_nrows, temp->get_const_row_ptrs(), temp->get_const_col_idxs(),
-        as_hip_type(temp->get_const_values()), coarse->get_const_row_ptrs(),
-        coarse_col_idxs_array.get_data(),
-        as_hip_type(coarse_vals_array.get_data()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
-
-
 }  // namespace amgx_pgm
 }  // namespace hip
 }  // namespace kernels
diff --git a/hip/test/multigrid/amgx_pgm_kernels.cpp b/hip/test/multigrid/amgx_pgm_kernels.cpp
index 73909e522aa..879eae1876e 100644
--- a/hip/test/multigrid/amgx_pgm_kernels.cpp
+++ b/hip/test/multigrid/amgx_pgm_kernels.cpp
@@ -287,44 +287,6 @@ TEST_F(AmgxPgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
 }
 
 
-TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
-{
-    initialize_data();
-    auto csr_coarse = Csr::create(ref, gko::dim<2>{n, n}, 0);
-    auto d_csr_coarse = Csr::create(hip, gko::dim<2>{n, n}, 0);
-    auto csr_temp = Csr::create(ref, gko::dim<2>{n, n},
-                                system_mtx->get_num_stored_elements());
-    auto d_csr_temp = Csr::create(hip, gko::dim<2>{n, n},
-                                  d_system_mtx->get_num_stored_elements());
-    index_type num_agg;
-    // renumber again
-    gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
-    auto prolong_op = Csr::create(ref, gko::dim<2>{m, n}, m);
-    for (int i = 0; i < m; i++) {
-        prolong_op->get_col_idxs()[i] = agg.get_const_data()[i];
-    }
-    std::iota(prolong_op->get_row_ptrs(), prolong_op->get_row_ptrs() + m + 1,
-              0);
-    std::fill_n(prolong_op->get_values(), m, gko::one<value_type>());
-    auto restrict_op = gko::as<Csr>(prolong_op->transpose());
-    auto d_prolong_op = Csr::create(hip);
-    auto d_restrict_op = Csr::create(hip);
-    d_prolong_op->copy_from(prolong_op.get());
-    d_restrict_op->copy_from(restrict_op.get());
-
-    gko::kernels::hip::amgx_pgm::amgx_pgm_generate(
-        hip, d_system_mtx.get(), d_prolong_op.get(), d_restrict_op.get(),
-        d_csr_coarse.get(), d_csr_temp.get());
-    gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
-        ref, system_mtx.get(), prolong_op.get(), restrict_op.get(),
-        csr_coarse.get(), csr_temp.get());
-
-    // it should be checked already in renumber
-    GKO_ASSERT_EQ(num_agg, n);
-    GKO_ASSERT_MTX_NEAR(d_csr_coarse, csr_coarse, 1e-14);
-}
-
-
 TEST_F(AmgxPgm, GenerateMgLevelIsEquivalentToRef)
 {
     initialize_data();
diff --git a/omp/multigrid/amgx_pgm_kernels.cpp b/omp/multigrid/amgx_pgm_kernels.cpp
index dc6329d982c..07e84630d85 100644
--- a/omp/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/multigrid/amgx_pgm_kernels.cpp
@@ -239,60 +239,6 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);
 
 
-template <typename ValueType, typename IndexType>
-void amgx_pgm_generate(std::shared_ptr<const OmpExecutor> exec,
-                       const matrix::Csr<ValueType, IndexType> *source,
-                       const matrix::Csr<ValueType, IndexType> *prolong_op,
-                       const matrix::Csr<ValueType, IndexType> *restrict_op,
-                       matrix::Csr<ValueType, IndexType> *coarse,
-                       matrix::Csr<ValueType, IndexType> *temp)
-{
-    // agg[i] -> I, agg[j] -> J
-    const auto agg_const_val = prolong_op->get_const_col_idxs();
-    const auto coarse_nrows = coarse->get_size()[0];
-    const auto source_nrows = source->get_size()[0];
-    const auto source_row_ptrs = source->get_const_row_ptrs();
-    const auto source_col_idxs = source->get_const_col_idxs();
-    const auto source_vals = source->get_const_values();
-    vector<map<IndexType, ValueType>> row_list(
-        source_nrows, map<IndexType, ValueType>{exec}, exec);
-    for (size_type i = 0; i < source_nrows; i++) {
-        IndexType row_idx = agg_const_val[i];
-        for (auto j = source_row_ptrs[i]; j < source_row_ptrs[i + 1]; j++) {
-            const auto col = agg_const_val[source_col_idxs[j]];
-            const auto val = source_vals[j];
-            row_list[row_idx][col] += val;
-        }
-    }
-    auto coarse_row_ptrs = coarse->get_row_ptrs();
-#pragma omp parallel for
-    for (size_type i = 0; i < coarse_nrows; i++) {
-        coarse_row_ptrs[i] = row_list[i].size();
-    }
-    components::prefix_sum(exec, coarse_row_ptrs, coarse_nrows + 1);
-
-    auto nnz = coarse_row_ptrs[coarse_nrows];
-    matrix::CsrBuilder<ValueType, IndexType> coarse_builder{coarse};
-    auto &coarse_col_idxs_array = coarse_builder.get_col_idx_array();
-    auto &coarse_vals_array = coarse_builder.get_value_array();
-    coarse_col_idxs_array.resize_and_reset(nnz);
-    coarse_vals_array.resize_and_reset(nnz);
-    auto coarse_col_idxs = coarse_col_idxs_array.get_data();
-    auto coarse_vals = coarse_vals_array.get_data();
-#pragma omp parallel for
-    for (size_type i = 0; i < coarse_nrows; i++) {
-        auto ind = coarse_row_ptrs[i];
-        for (auto pair : row_list[i]) {
-            coarse_col_idxs[ind] = pair.first;
-            coarse_vals[ind] = pair.second;
-            ind++;
-        }
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
-
-
 }  // namespace amgx_pgm
 }  // namespace omp
 }  // namespace kernels
diff --git a/omp/test/multigrid/amgx_pgm_kernels.cpp b/omp/test/multigrid/amgx_pgm_kernels.cpp
index 91dc2c2fce8..59b40657703 100644
--- a/omp/test/multigrid/amgx_pgm_kernels.cpp
+++ b/omp/test/multigrid/amgx_pgm_kernels.cpp
@@ -278,44 +278,6 @@ TEST_F(AmgxPgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
 }
 
 
-TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
-{
-    initialize_data();
-    auto csr_coarse = Csr::create(ref, gko::dim<2>{n, n}, 0);
-    auto d_csr_coarse = Csr::create(omp, gko::dim<2>{n, n}, 0);
-    auto csr_temp = Csr::create(ref, gko::dim<2>{n, n},
-                                system_mtx->get_num_stored_elements());
-    auto d_csr_temp = Csr::create(omp, gko::dim<2>{n, n},
-                                  d_system_mtx->get_num_stored_elements());
-    index_type num_agg;
-    // renumber again
-    gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
-    auto prolong_op = Csr::create(ref, gko::dim<2>{m, n}, m);
-    for (int i = 0; i < m; i++) {
-        prolong_op->get_col_idxs()[i] = agg.get_const_data()[i];
-    }
-    std::iota(prolong_op->get_row_ptrs(), prolong_op->get_row_ptrs() + m + 1,
-              0);
-    std::fill_n(prolong_op->get_values(), m, gko::one<value_type>());
-    auto restrict_op = gko::as<Csr>(prolong_op->transpose());
-    auto d_prolong_op = Csr::create(omp);
-    auto d_restrict_op = Csr::create(omp);
-    d_prolong_op->copy_from(prolong_op.get());
-    d_restrict_op->copy_from(restrict_op.get());
-
-    gko::kernels::omp::amgx_pgm::amgx_pgm_generate(
-        omp, d_system_mtx.get(), d_prolong_op.get(), d_restrict_op.get(),
-        d_csr_coarse.get(), d_csr_temp.get());
-    gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
-        ref, system_mtx.get(), prolong_op.get(), restrict_op.get(),
-        csr_coarse.get(), csr_temp.get());
-
-    // it should be checked already in renumber
-    GKO_ASSERT_EQ(num_agg, n);
-    GKO_ASSERT_MTX_NEAR(d_csr_coarse, csr_coarse, 1e-14);
-}
-
-
 TEST_F(AmgxPgm, GenerateMgLevelIsEquivalentToRef)
 {
     initialize_data();
diff --git a/reference/multigrid/amgx_pgm_kernels.cpp b/reference/multigrid/amgx_pgm_kernels.cpp
index 7d03fdff9f0..5ad1ff8d108 100644
--- a/reference/multigrid/amgx_pgm_kernels.cpp
+++ b/reference/multigrid/amgx_pgm_kernels.cpp
@@ -231,59 +231,6 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);
 
 
-template <typename ValueType, typename IndexType>
-void amgx_pgm_generate(std::shared_ptr<const ReferenceExecutor> exec,
-                       const matrix::Csr<ValueType, IndexType> *source,
-                       const matrix::Csr<ValueType, IndexType> *prolong_op,
-                       const matrix::Csr<ValueType, IndexType> *restrict_op,
-                       matrix::Csr<ValueType, IndexType> *coarse,
-                       matrix::Csr<ValueType, IndexType> *temp)
-{
-    // agg[i] -> I, agg[j] -> J
-    const auto agg_const_val = prolong_op->get_const_col_idxs();
-    const auto coarse_nrows = coarse->get_size()[0];
-    const auto source_nrows = source->get_size()[0];
-    const auto source_row_ptrs = source->get_const_row_ptrs();
-    const auto source_col_idxs = source->get_const_col_idxs();
-    const auto source_vals = source->get_const_values();
-    gko::vector<gko::map<IndexType, ValueType>> row_list(
-        source_nrows, gko::map<IndexType, ValueType>{exec}, exec);
-    for (size_type i = 0; i < source_nrows; i++) {
-        IndexType row_idx = agg_const_val[i];
-        for (auto j = source_row_ptrs[i]; j < source_row_ptrs[i + 1]; j++) {
-            const auto col = agg_const_val[source_col_idxs[j]];
-            const auto val = source_vals[j];
-            row_list[row_idx][col] += val;
-        }
-    }
-    auto coarse_row_ptrs = coarse->get_row_ptrs();
-    for (size_type i = 0; i < coarse_nrows; i++) {
-        coarse_row_ptrs[i] = row_list[i].size();
-    }
-    components::prefix_sum(exec, coarse_row_ptrs, coarse_nrows + 1);
-
-    auto nnz = coarse_row_ptrs[coarse_nrows];
-    matrix::CsrBuilder<ValueType, IndexType> coarse_builder{coarse};
-    auto &coarse_col_idxs_array = coarse_builder.get_col_idx_array();
-    auto &coarse_vals_array = coarse_builder.get_value_array();
-    coarse_col_idxs_array.resize_and_reset(nnz);
-    coarse_vals_array.resize_and_reset(nnz);
-    auto coarse_col_idxs = coarse_col_idxs_array.get_data();
-    auto coarse_vals = coarse_vals_array.get_data();
-
-    for (size_type i = 0; i < coarse_nrows; i++) {
-        auto ind = coarse_row_ptrs[i];
-        for (auto pair : row_list[i]) {
-            coarse_col_idxs[ind] = pair.first;
-            coarse_vals[ind] = pair.second;
-            ind++;
-        }
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);
-
-
 }  // namespace amgx_pgm
 }  // namespace reference
 }  // namespace kernels
diff --git a/reference/test/multigrid/amgx_pgm_kernels.cpp b/reference/test/multigrid/amgx_pgm_kernels.cpp
index 2cff6c8537b..c0270312d6c 100644
--- a/reference/test/multigrid/amgx_pgm_kernels.cpp
+++ b/reference/test/multigrid/amgx_pgm_kernels.cpp
@@ -500,37 +500,24 @@ TYPED_TEST(AmgxPgm, AssignToExistAgg)
 }
 
 
-TYPED_TEST(AmgxPgm, GenerateMtx)
+TYPED_TEST(AmgxPgm, GenerateMgLevel)
 {
-    using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
-    using mtx_type = typename TestFixture::Mtx;
-    gko::Array<index_type> agg(this->exec, 5);
-    auto agg_vals = agg.get_data();
-    // 0 - 2, 1 - 3, 4
-    auto prolong_op = mtx_type::create(this->exec, gko::dim<2>{5, 3}, 0);
+    using Mtx = typename TestFixture::Mtx;
+    auto prolong_op = gko::share(Mtx::create(this->exec, gko::dim<2>{5, 2}, 0));
+    // aggregates: fine rows {0, 2, 4} -> coarse 0, fine rows {1, 3} -> coarse 1
     prolong_op->read(
-        {{5, 3}, {{0, 0, 1}, {1, 1, 1}, {2, 0, 1}, {3, 1, 1}, {4, 2, 1}}});
-    auto restrict_op = gko::as<mtx_type>(prolong_op->transpose());
-    auto coarse_ans = mtx_type::create(this->exec, gko::dim<2>{3, 3}, 0);
-    coarse_ans->read({{3, 3},
-                      {{0, 0, 4},
-                       {0, 1, -3},
-                       {0, 2, -1},
-                       {1, 0, -3},
-                       {1, 1, 5},
-                       {1, 2, -1},
-                       {2, 0, -2},
-                       {2, 1, -2},
-                       {2, 2, 5}}});
-    auto csr_coarse = mtx_type::create(this->exec, gko::dim<2>{3, 3}, 0);
-    auto empty = gko::matrix::Csr<value_type, index_type>::create(this->exec);
-
-    gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
-        this->exec, this->mtx.get(), prolong_op.get(), restrict_op.get(),
-        csr_coarse.get(), empty.get());
-
-    GKO_ASSERT_MTX_NEAR(csr_coarse, coarse_ans, r<value_type>::value);
+        {{5, 2}, {{0, 0, 1}, {1, 1, 1}, {2, 0, 1}, {3, 1, 1}, {4, 0, 1}}});
+    auto restrict_op = gko::share(gko::as<Mtx>(prolong_op->transpose()));
+
+    auto coarse_fine = this->amgxpgm_factory->generate(this->mtx);
+
+    GKO_ASSERT_MTX_NEAR(gko::as<Mtx>(coarse_fine->get_restrict_op()),
+                        restrict_op, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(gko::as<Mtx>(coarse_fine->get_coarse_op()),
+                        this->coarse, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(gko::as<Mtx>(coarse_fine->get_prolong_op()), prolong_op,
+                        r<value_type>::value);
 }