Skip to content

Commit

Permalink
use two csr multiplication to generate coarse
Browse files Browse the repository at this point in the history
need to improve it with less memory footprint
  • Loading branch information
yhmtsai committed May 5, 2021
1 parent bef4373 commit 8b5901b
Show file tree
Hide file tree
Showing 13 changed files with 24 additions and 530 deletions.
102 changes: 0 additions & 102 deletions common/multigrid/amgx_pgm_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -225,106 +225,4 @@ __global__
}


template <typename IndexType>
__global__ __launch_bounds__(default_block_size) void get_source_row_map_kernel(
const size_type source_nrows, const IndexType *__restrict__ agg_val,
const IndexType *__restrict__ source_row_ptrs,
IndexType *__restrict__ result_row_ptrs, IndexType *__restrict__ row_map)
{
auto row = thread::get_thread_id_flat();
if (row >= source_nrows) {
return;
}
const auto num_elems = source_row_ptrs[row + 1] - source_row_ptrs[row];
const auto result_idx = agg_val[row];
// atomic_add returns the old value, so it can be the starting point.
row_map[row] = atomic_add(result_row_ptrs + result_idx, num_elems);
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void move_row_kernel(
const size_type source_nrows, const IndexType *__restrict__ agg_val,
const IndexType *__restrict__ row_map,
const IndexType *__restrict__ source_row_ptrs,
const IndexType *__restrict__ source_col_idxs,
const ValueType *__restrict__ source_values,
const IndexType *__restrict__ result_row_ptrs,
IndexType *__restrict__ result_col_idxs,
ValueType *__restrict__ result_values)
{
auto row = thread::get_thread_id_flat();
if (row >= source_nrows) {
return;
}
auto result_i = result_row_ptrs[agg_val[row]] + row_map[row];
for (auto i = source_row_ptrs[row]; i < source_row_ptrs[row + 1];
i++, result_i++) {
result_col_idxs[result_i] = agg_val[source_col_idxs[i]];
result_values[result_i] = source_values[i];
}
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void merge_col_kernel(
const size_type nrows, const IndexType *__restrict__ temp_row_ptrs,
IndexType *__restrict__ temp_col_idxs, ValueType *__restrict__ temp_values,
IndexType *__restrict__ coarse_row_ptrs)
{
auto row = thread::get_thread_id_flat();
if (row >= nrows) {
return;
}

IndexType num_elems = zero<IndexType>();
const auto start = temp_row_ptrs[row];
const auto end = temp_row_ptrs[row + 1];
IndexType col = temp_col_idxs[start];
ValueType value = temp_values[start];
for (auto i = start + 1; i < end; i++) {
const auto current_col = temp_col_idxs[i];
if (current_col != col) {
// apply to the original data. It is sorted, so the writing position
// is before read position
temp_col_idxs[start + num_elems] = col;
temp_values[start + num_elems] = value;
value = zero<ValueType>();
col = current_col;
num_elems++;
}
value += temp_values[i];
}
// If start != end, need to process the final column
if (start != end) {
temp_col_idxs[start + num_elems] = col;
temp_values[start + num_elems] = value;
num_elems++;
}
coarse_row_ptrs[row] = num_elems;
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void copy_to_coarse_kernel(
const size_type nrows, const IndexType *__restrict__ temp_row_ptrs,
const IndexType *__restrict__ temp_col_idxs,
const ValueType *__restrict__ temp_values,
const IndexType *__restrict__ coarse_row_ptrs,
IndexType *__restrict__ coarse_col_idxs,
ValueType *__restrict__ coarse_values)
{
auto row = thread::get_thread_id_flat();
if (row >= nrows) {
return;
}
auto temp_i = temp_row_ptrs[row];
for (auto i = coarse_row_ptrs[row]; i < coarse_row_ptrs[row + 1]; i++) {
coarse_col_idxs[i] = temp_col_idxs[temp_i];
coarse_values[i] = temp_values[temp_i];
temp_i++;
}
}


} // namespace kernel
5 changes: 0 additions & 5 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1328,11 +1328,6 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);

template <typename ValueType, typename IndexType>
GKO_DECLARE_AMGX_PGM_GENERATE(ValueType, IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);


} // namespace amgx_pgm

Expand Down
36 changes: 8 additions & 28 deletions core/multigrid/amgx_pgm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,38 +60,13 @@ GKO_REGISTER_OPERATION(renumber, amgx_pgm::renumber);
GKO_REGISTER_OPERATION(find_strongest_neighbor,
amgx_pgm::find_strongest_neighbor);
GKO_REGISTER_OPERATION(assign_to_exist_agg, amgx_pgm::assign_to_exist_agg);
GKO_REGISTER_OPERATION(amgx_pgm_generate, amgx_pgm::amgx_pgm_generate);
GKO_REGISTER_OPERATION(fill_array, components::fill_array);
GKO_REGISTER_OPERATION(fill_seq_array, components::fill_seq_array);


} // namespace amgx_pgm


namespace {


template <typename ValueType, typename IndexType>
std::unique_ptr<LinOp> amgx_pgm_generate(
std::shared_ptr<const Executor> exec,
const matrix::Csr<ValueType, IndexType> *source,
const matrix::Csr<ValueType, IndexType> *prolong_op,
const matrix::Csr<ValueType, IndexType> *restrict_op)
{
auto num_agg = prolong_op->get_size()[1];
auto coarse = matrix::Csr<ValueType, IndexType>::create(
exec, dim<2>{num_agg, num_agg}, 0, source->get_strategy());
auto temp = matrix::Csr<ValueType, IndexType>::create(
exec, dim<2>{num_agg, num_agg}, source->get_num_stored_elements());
exec->run(amgx_pgm::make_amgx_pgm_generate(source, prolong_op, restrict_op,
coarse.get(), temp.get()));
return std::move(coarse);
}


} // namespace


template <typename ValueType, typename IndexType>
void AmgxPgm<ValueType, IndexType>::generate()
{
Expand Down Expand Up @@ -160,7 +135,7 @@ void AmgxPgm<ValueType, IndexType>::generate()
// Renumber the index
exec->run(amgx_pgm::make_renumber(agg_, &num_agg));

auto coarse_dim = num_agg;
gko::dim<2>::dimension_type coarse_dim = num_agg;
auto fine_dim = system_matrix_->get_size()[0];
// TODO: prolong_op can be done with lightway format
auto prolong_op = share(
Expand All @@ -175,8 +150,13 @@ void AmgxPgm<ValueType, IndexType>::generate()
auto restrict_op = gko::as<matrix_type>(share(prolong_op->transpose()));

// Construct the coarse matrix
auto coarse_matrix = share(amgx_pgm_generate(
exec, amgxpgm_op, prolong_op.get(), restrict_op.get()));
// TODO: use less memory footprint to improve it
auto coarse_matrix =
share(matrix_type::create(exec, gko::dim<2>{coarse_dim, coarse_dim}));
auto tmp = matrix_type::create(exec, gko::dim<2>{coarse_dim, fine_dim});
restrict_op->apply(amgxpgm_op, tmp.get());
tmp->apply(prolong_op.get(), coarse_matrix.get());

this->set_multigrid_level(prolong_op, coarse_matrix, restrict_op);
}

Expand Down
13 changes: 1 addition & 12 deletions core/multigrid/amgx_pgm_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,6 @@ namespace amgx_pgm {
const matrix::Diagonal<ValueType> *diag, Array<IndexType> &agg, \
Array<IndexType> &intermediate_agg)

#define GKO_DECLARE_AMGX_PGM_GENERATE(ValueType, IndexType) \
void amgx_pgm_generate( \
std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<ValueType, IndexType> *source, \
const matrix::Csr<ValueType, IndexType> *prolong_op, \
const matrix::Csr<ValueType, IndexType> *restrict_op, \
matrix::Csr<ValueType, IndexType> *coarse, \
matrix::Csr<ValueType, IndexType> *temp)

#define GKO_DECLARE_ALL_AS_TEMPLATES \
template <typename IndexType> \
GKO_DECLARE_AMGX_PGM_MATCH_EDGE_KERNEL(IndexType); \
Expand All @@ -97,9 +88,7 @@ namespace amgx_pgm {
template <typename ValueType, typename IndexType> \
GKO_DECLARE_AMGX_PGM_FIND_STRONGEST_NEIGHBOR(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_AMGX_PGM_GENERATE(ValueType, IndexType)
GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG(ValueType, IndexType)


} // namespace amgx_pgm
Expand Down
59 changes: 0 additions & 59 deletions cuda/multigrid/amgx_pgm_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -172,65 +172,6 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);


template <typename ValueType, typename IndexType>
void amgx_pgm_generate(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType> *source,
const matrix::Csr<ValueType, IndexType> *prolong_op,
const matrix::Csr<ValueType, IndexType> *restrict_op,
matrix::Csr<ValueType, IndexType> *coarse,
matrix::Csr<ValueType, IndexType> *temp)
{
const auto agg_const_val = prolong_op->get_const_col_idxs();
const auto source_nrows = source->get_size()[0];
const auto source_nnz = source->get_num_stored_elements();
const auto coarse_nrows = coarse->get_size()[0];
Array<IndexType> row_map(exec, source_nrows);
// fill coarse row pointer as zero
components::fill_array(exec, temp->get_row_ptrs(), coarse_nrows + 1,
zero<IndexType>());
// compute each source row should be moved and also change column index
dim3 grid(ceildiv(source_nrows, default_block_size));
// agg source_row (for row size) coarse row source map
kernel::get_source_row_map_kernel<<<grid, default_block_size>>>(
source_nrows, agg_const_val, source->get_const_row_ptrs(),
temp->get_row_ptrs(), row_map.get_data());
// prefix sum of temp_row_ptrs
components::prefix_sum(exec, temp->get_row_ptrs(), coarse_nrows + 1);
// copy source -> to coarse and change column index
kernel::move_row_kernel<<<grid, default_block_size>>>(
source_nrows, agg_const_val, row_map.get_const_data(),
source->get_const_row_ptrs(), source->get_const_col_idxs(),
as_cuda_type(source->get_const_values()), temp->get_const_row_ptrs(),
temp->get_col_idxs(), as_cuda_type(temp->get_values()));
// sort csr
csr::sort_by_column_index(exec, temp);
// summation of the elements with same position
grid = ceildiv(coarse_nrows, default_block_size);
kernel::merge_col_kernel<<<grid, default_block_size>>>(
coarse_nrows, temp->get_const_row_ptrs(), temp->get_col_idxs(),
as_cuda_type(temp->get_values()), coarse->get_row_ptrs());
// build the coarse matrix
components::prefix_sum(exec, coarse->get_row_ptrs(), coarse_nrows + 1);
// prefix sum of coarse->get_row_ptrs
const auto coarse_nnz =
exec->copy_val_to_host(coarse->get_row_ptrs() + coarse_nrows);
// reallocate size of column and values
matrix::CsrBuilder<ValueType, IndexType> coarse_builder{coarse};
auto &coarse_col_idxs_array = coarse_builder.get_col_idx_array();
auto &coarse_vals_array = coarse_builder.get_value_array();
coarse_col_idxs_array.resize_and_reset(coarse_nnz);
coarse_vals_array.resize_and_reset(coarse_nnz);
// copy the result
kernel::copy_to_coarse_kernel<<<grid, default_block_size>>>(
coarse_nrows, temp->get_const_row_ptrs(), temp->get_const_col_idxs(),
as_cuda_type(temp->get_const_values()), coarse->get_const_row_ptrs(),
coarse_col_idxs_array.get_data(),
as_cuda_type(coarse_vals_array.get_data()));
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);


} // namespace amgx_pgm
} // namespace cuda
} // namespace kernels
Expand Down
38 changes: 0 additions & 38 deletions cuda/test/multigrid/amgx_pgm_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,44 +288,6 @@ TEST_F(AmgxPgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
}


TEST_F(AmgxPgm, GenerateMtxIsEquivalentToRef)
{
initialize_data();
auto csr_coarse = Csr::create(ref, gko::dim<2>{n, n}, 0);
auto d_csr_coarse = Csr::create(cuda, gko::dim<2>{n, n}, 0);
auto csr_temp = Csr::create(ref, gko::dim<2>{n, n},
system_mtx->get_num_stored_elements());
auto d_csr_temp = Csr::create(cuda, gko::dim<2>{n, n},
d_system_mtx->get_num_stored_elements());
index_type num_agg;
// renumber again
gko::kernels::reference::amgx_pgm::renumber(ref, agg, &num_agg);
auto prolong_op = Csr::create(ref, gko::dim<2>{m, n}, m);
for (int i = 0; i < m; i++) {
prolong_op->get_col_idxs()[i] = agg.get_const_data()[i];
}
std::iota(prolong_op->get_row_ptrs(), prolong_op->get_row_ptrs() + m + 1,
0);
std::fill_n(prolong_op->get_values(), m, gko::one<value_type>());
auto restrict_op = gko::as<Csr>(prolong_op->transpose());
auto d_prolong_op = Csr::create(cuda);
auto d_restrict_op = Csr::create(cuda);
d_prolong_op->copy_from(prolong_op.get());
d_restrict_op->copy_from(restrict_op.get());

gko::kernels::cuda::amgx_pgm::amgx_pgm_generate(
cuda, d_system_mtx.get(), d_prolong_op.get(), d_restrict_op.get(),
d_csr_coarse.get(), d_csr_temp.get());
gko::kernels::reference::amgx_pgm::amgx_pgm_generate(
ref, system_mtx.get(), prolong_op.get(), restrict_op.get(),
csr_coarse.get(), csr_temp.get());

// it should be checked already in renumber
GKO_ASSERT_EQ(num_agg, n);
GKO_ASSERT_MTX_NEAR(d_csr_coarse, csr_coarse, 1e-14);
}


TEST_F(AmgxPgm, GenerateMgLevelIsEquivalentToRef)
{
initialize_data();
Expand Down
12 changes: 0 additions & 12 deletions dpcpp/multigrid/amgx_pgm_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,18 +100,6 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_AMGX_PGM_ASSIGN_TO_EXIST_AGG);


template <typename ValueType, typename IndexType>
void amgx_pgm_generate(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType> *source,
const matrix::Csr<ValueType, IndexType> *prolong_op,
const matrix::Csr<ValueType, IndexType> *restrict_op,
matrix::Csr<ValueType, IndexType> *coarse,
matrix::Csr<ValueType, IndexType> *temp)
GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_AMGX_PGM_GENERATE);


} // namespace amgx_pgm
} // namespace dpcpp
} // namespace kernels
Expand Down
Loading

0 comments on commit 8b5901b

Please sign in to comment.