Skip to content

Commit

Permalink
Merge: Scalar Jacobi specialization and kernels
Browse files Browse the repository at this point in the history
This PR adds special kernels for the scalar Jacobi preconditioner. It extracts the diagonal from the system matrix and applies the inverse of the diagonal to the rhs. The generate and apply overhead should be lower than that of the usual block-Jacobi due to smaller storage requirements and less memory movement in general.

This should, in combination with the IR solver, provide some cheap and effective relaxation methods, useful in multigrid and coarse-grid solvers.

Related PR: #808
  • Loading branch information
pratikvn authored Jul 2, 2021
2 parents 39772fd + a34478d commit 9cebc13
Show file tree
Hide file tree
Showing 14 changed files with 666 additions and 45 deletions.
47 changes: 47 additions & 0 deletions common/preconditioner/jacobi_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,50 @@ __launch_bounds__(warps_per_block *config::warp_size) adaptive_transpose_jacobi(
});
}
}


namespace kernel {


// Scaled scalar Jacobi apply: for each (row, col) entry of the source,
// result = beta * result + alpha * source / diag[row].
// One thread handles one matrix entry; the launch is sized from
// num_rows * num_cols, so only the row bound needs an explicit guard
// (col = tid % num_cols is always in range).
template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void scalar_apply(
    size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag,
    const ValueType *__restrict__ alpha, size_type source_stride,
    const ValueType *__restrict__ source_values,
    const ValueType *__restrict__ beta, size_type result_stride,
    ValueType *__restrict__ result_values)
{
    const auto global_id = thread::get_thread_id_flat();
    const auto row = global_id / num_cols;
    const auto col = global_id % num_cols;

    if (row >= num_rows) {
        return;
    }
    // A zero diagonal entry is replaced by one so the entry is passed
    // through unscaled instead of producing Inf/NaN.
    const auto d = diag[row];
    const auto scale = (d == zero<ValueType>()) ? one<ValueType>() : d;
    const auto out_idx = row * result_stride + col;
    result_values[out_idx] =
        beta[0] * result_values[out_idx] +
        alpha[0] * source_values[row * source_stride + col] / scale;
}


// Plain scalar Jacobi apply: result = source / diag[row], entrywise.
// One thread per (row, col) entry; only the row bound needs a guard since
// col = tid % num_cols is always in range.
template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void simple_scalar_apply(
    size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag,
    size_type source_stride, const ValueType *__restrict__ source_values,
    size_type result_stride, ValueType *__restrict__ result_values)
{
    const auto global_id = thread::get_thread_id_flat();
    const auto row = global_id / num_cols;
    const auto col = global_id % num_cols;

    if (row >= num_rows) {
        return;
    }
    // A zero diagonal entry is replaced by one so the entry is copied
    // through unscaled instead of producing Inf/NaN.
    const auto d = diag[row];
    const auto scale = (d == zero<ValueType>()) ? one<ValueType>() : d;
    result_values[row * result_stride + col] =
        source_values[row * source_stride + col] / scale;
}


} // namespace kernel
11 changes: 11 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1081,6 +1081,17 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);

// Hook-module stub: scalar Jacobi apply is not compiled for this backend;
// calling it raises a "not compiled" error at runtime.
template <typename ValueType>
GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);

// Hook-module stub: simple (unscaled) scalar Jacobi apply, same pattern.
template <typename ValueType>
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
    GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
Expand Down
88 changes: 55 additions & 33 deletions core/preconditioner/jacobi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ namespace jacobi {


GKO_REGISTER_OPERATION(simple_apply, jacobi::simple_apply);
GKO_REGISTER_OPERATION(simple_scalar_apply, jacobi::simple_scalar_apply);
GKO_REGISTER_OPERATION(apply, jacobi::apply);
GKO_REGISTER_OPERATION(scalar_apply, jacobi::scalar_apply);
GKO_REGISTER_OPERATION(find_blocks, jacobi::find_blocks);
GKO_REGISTER_OPERATION(generate, jacobi::generate);
GKO_REGISTER_OPERATION(transpose_jacobi, jacobi::transpose_jacobi);
Expand All @@ -74,10 +76,15 @@ void Jacobi<ValueType, IndexType>::apply_impl(const LinOp *b, LinOp *x) const
{
precision_dispatch_real_complex<ValueType>(
[this](auto dense_b, auto dense_x) {
this->get_executor()->run(jacobi::make_simple_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_b, dense_x));
if (parameters_.max_block_size == 1) {
this->get_executor()->run(jacobi::make_simple_scalar_apply(
this->blocks_, dense_b, dense_x));
} else {
this->get_executor()->run(jacobi::make_simple_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_b, dense_x));
}
},
b, x);
}
Expand All @@ -90,11 +97,16 @@ void Jacobi<ValueType, IndexType>::apply_impl(const LinOp *alpha,
{
precision_dispatch_real_complex<ValueType>(
[this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
this->get_executor()->run(jacobi::make_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_alpha, dense_b,
dense_beta, dense_x));
if (parameters_.max_block_size == 1) {
this->get_executor()->run(jacobi::make_scalar_apply(
this->blocks_, dense_alpha, dense_b, dense_beta, dense_x));
} else {
this->get_executor()->run(jacobi::make_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_alpha, dense_b,
dense_beta, dense_x));
}
},
alpha, b, beta, x);
}
Expand Down Expand Up @@ -218,33 +230,43 @@ void Jacobi<ValueType, IndexType>::generate(const LinOp *system_matrix,
GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
using csr_type = matrix::Csr<ValueType, IndexType>;
const auto exec = this->get_executor();
auto csr_mtx =
convert_to_with_sorting<csr_type>(exec, system_matrix, skip_sorting);

if (parameters_.block_pointers.get_data() == nullptr) {
this->detect_blocks(csr_mtx.get());
}
if (parameters_.max_block_size == 1) {
auto diag = as<DiagonalExtractable<ValueType>>(system_matrix)
->extract_diagonal();
auto temp = gko::Array<ValueType>::view(
exec, system_matrix->get_size()[0], diag->get_values());
this->blocks_ = temp;
} else {
auto csr_mtx = convert_to_with_sorting<csr_type>(exec, system_matrix,
skip_sorting);

if (parameters_.block_pointers.get_data() == nullptr) {
this->detect_blocks(csr_mtx.get());
}

const auto all_block_opt = parameters_.storage_optimization.of_all_blocks;
auto &precisions = parameters_.storage_optimization.block_wise;
// if adaptive version is used, make sure that the precision array is of
// the correct size by replicating it multiple times if needed
if (parameters_.storage_optimization.is_block_wise ||
all_block_opt != precision_reduction(0, 0)) {
if (!parameters_.storage_optimization.is_block_wise) {
precisions = gko::Array<precision_reduction>(exec, {all_block_opt});
const auto all_block_opt =
parameters_.storage_optimization.of_all_blocks;
auto &precisions = parameters_.storage_optimization.block_wise;
// if adaptive version is used, make sure that the precision array is of
// the correct size by replicating it multiple times if needed
if (parameters_.storage_optimization.is_block_wise ||
all_block_opt != precision_reduction(0, 0)) {
if (!parameters_.storage_optimization.is_block_wise) {
precisions =
gko::Array<precision_reduction>(exec, {all_block_opt});
}
Array<precision_reduction> tmp(
exec, parameters_.block_pointers.get_num_elems() - 1);
exec->run(jacobi::make_initialize_precisions(precisions, tmp));
precisions = std::move(tmp);
conditioning_.resize_and_reset(num_blocks_);
}
Array<precision_reduction> tmp(
exec, parameters_.block_pointers.get_num_elems() - 1);
exec->run(jacobi::make_initialize_precisions(precisions, tmp));
precisions = std::move(tmp);
conditioning_.resize_and_reset(num_blocks_);
}

exec->run(jacobi::make_generate(
csr_mtx.get(), num_blocks_, parameters_.max_block_size,
parameters_.accuracy, storage_scheme_, conditioning_, precisions,
parameters_.block_pointers, blocks_));
exec->run(jacobi::make_generate(
csr_mtx.get(), num_blocks_, parameters_.max_block_size,
parameters_.accuracy, storage_scheme_, conditioning_, precisions,
parameters_.block_pointers, blocks_));
}
}


Expand Down
17 changes: 17 additions & 0 deletions core/preconditioner/jacobi_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ namespace kernels {
const matrix::Dense<ValueType> *b, \
const matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType) \
void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec, \
const Array<ValueType> &diag, \
const matrix::Dense<ValueType> *b, \
matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType) \
void simple_apply( \
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
Expand All @@ -85,6 +91,13 @@ namespace kernels {
const Array<ValueType> &blocks, const matrix::Dense<ValueType> *b, \
matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType) \
void scalar_apply( \
std::shared_ptr<const DefaultExecutor> exec, \
const Array<ValueType> &diag, const matrix::Dense<ValueType> *alpha, \
const matrix::Dense<ValueType> *b, \
const matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType) \
void transpose_jacobi( \
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
Expand Down Expand Up @@ -127,8 +140,12 @@ namespace kernels {
GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_GENERATE_KERNEL(ValueType, IndexType); \
template <typename ValueType> \
GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_APPLY_KERNEL(ValueType, IndexType); \
template <typename ValueType> \
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
Expand Down
67 changes: 62 additions & 5 deletions cuda/preconditioner/jacobi_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,14 @@ constexpr int default_num_warps = 32;
// current GPUs have at most 84 SMs)
constexpr int default_grid_size = 32 * 32 * 128;

constexpr int default_block_size = 512;


#include "common/preconditioner/jacobi_kernels.hpp.inc"


template <typename ValueType, typename IndexType>
size_type find_natural_blocks(std::shared_ptr<const CudaExecutor> exec,
size_type find_natural_blocks(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType> *mtx,
int32 max_block_size,
IndexType *__restrict__ block_ptrs)
Expand All @@ -95,7 +97,7 @@ size_type find_natural_blocks(std::shared_ptr<const CudaExecutor> exec,

template <typename IndexType>
inline size_type agglomerate_supervariables(
std::shared_ptr<const CudaExecutor> exec, int32 max_block_size,
std::shared_ptr<const DefaultExecutor> exec, int32 max_block_size,
size_type num_natural_blocks, IndexType *block_ptrs)
{
Array<size_type> nums(exec, 1);
Expand All @@ -111,7 +113,7 @@ inline size_type agglomerate_supervariables(
} // namespace


void initialize_precisions(std::shared_ptr<const CudaExecutor> exec,
void initialize_precisions(std::shared_ptr<const DefaultExecutor> exec,
const Array<precision_reduction> &source,
Array<precision_reduction> &precisions)
{
Expand All @@ -126,7 +128,7 @@ void initialize_precisions(std::shared_ptr<const CudaExecutor> exec,


template <typename ValueType, typename IndexType>
void find_blocks(std::shared_ptr<const CudaExecutor> exec,
void find_blocks(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType> *system_matrix,
uint32 max_block_size, size_type &num_blocks,
Array<IndexType> &block_pointers)
Expand Down Expand Up @@ -230,7 +232,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

template <typename ValueType, typename IndexType>
void convert_to_dense(
std::shared_ptr<const CudaExecutor> exec, size_type num_blocks,
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
const Array<precision_reduction> &block_precisions,
const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
const preconditioner::block_interleaved_storage_scheme<IndexType>
Expand All @@ -241,6 +243,61 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);


/**
 * Applies the scalar Jacobi preconditioner with scaling:
 * x = beta * x + alpha * inv(diag) * b, entrywise per row.
 *
 * @param exec   the executor the kernel runs on
 * @param diag   the extracted matrix diagonal (one value per row)
 * @param alpha  scaling for the preconditioned rhs (1x1 dense)
 * @param b      the right-hand side vector(s)
 * @param beta   scaling for the previous x (1x1 dense)
 * @param x      the result vector(s), updated in place
 */
template <typename ValueType>
void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
                  const Array<ValueType> &diag,
                  const matrix::Dense<ValueType> *alpha,
                  const matrix::Dense<ValueType> *b,
                  const matrix::Dense<ValueType> *beta,
                  matrix::Dense<ValueType> *x)
{
    const auto b_size = b->get_size();
    const auto num_rows = b_size[0];
    const auto num_cols = b_size[1];
    const auto b_stride = b->get_stride();
    const auto x_stride = x->get_stride();
    // one thread per matrix entry
    const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size);

    const auto b_values = b->get_const_values();
    const auto diag_values = diag.get_const_data();
    auto x_values = x->get_values();

    // Guard against empty vectors: a zero-dimension grid is an invalid
    // CUDA launch configuration and would set a sticky launch error.
    if (grid_dim > 0) {
        kernel::scalar_apply<<<grid_dim, default_block_size>>>(
            num_rows, num_cols, as_cuda_type(diag_values),
            as_cuda_type(alpha->get_const_values()), b_stride,
            as_cuda_type(b_values), as_cuda_type(beta->get_const_values()),
            x_stride, as_cuda_type(x_values));
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);


/**
 * Applies the scalar Jacobi preconditioner without scaling:
 * x = inv(diag) * b, entrywise per row.
 *
 * @param exec  the executor the kernel runs on
 * @param diag  the extracted matrix diagonal (one value per row)
 * @param b     the right-hand side vector(s)
 * @param x     the result vector(s), overwritten
 */
template <typename ValueType>
void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
                         const Array<ValueType> &diag,
                         const matrix::Dense<ValueType> *b,
                         matrix::Dense<ValueType> *x)
{
    const auto b_size = b->get_size();
    const auto num_rows = b_size[0];
    const auto num_cols = b_size[1];
    const auto b_stride = b->get_stride();
    const auto x_stride = x->get_stride();
    // one thread per matrix entry
    const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size);

    const auto b_values = b->get_const_values();
    const auto diag_values = diag.get_const_data();
    auto x_values = x->get_values();

    // Guard against empty vectors: a zero-dimension grid is an invalid
    // CUDA launch configuration and would set a sticky launch error.
    if (grid_dim > 0) {
        kernel::simple_scalar_apply<<<grid_dim, default_block_size>>>(
            num_rows, num_cols, as_cuda_type(diag_values), b_stride,
            as_cuda_type(b_values), x_stride, as_cuda_type(x_values));
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);


} // namespace jacobi
} // namespace cuda
} // namespace kernels
Expand Down
Loading

0 comments on commit 9cebc13

Please sign in to comment.