Skip to content

Commit

Permalink
Merge: Scalar Jacobi specialization and kernels
Browse files Browse the repository at this point in the history
This PR adds special kernels for the scalar Jacobi preconditioner. It extracts the diagonal from the system matrix and applies the inverse of the diagonal to the rhs. The generate and apply overhead should be lower than that of the usual block-Jacobi due to smaller storage requirements and less memory movement in general.

This should, in combination with the IR solver, provide some cheap and effective relaxation methods, useful in multigrid and coarse-grid solvers.

Related PR: #808
  • Loading branch information
pratikvn authored Jul 2, 2021
2 parents 39772fd + a34478d commit 9cebc13
Show file tree
Hide file tree
Showing 14 changed files with 666 additions and 45 deletions.
47 changes: 47 additions & 0 deletions common/preconditioner/jacobi_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,50 @@ __launch_bounds__(warps_per_block *config::warp_size) adaptive_transpose_jacobi(
});
}
}


namespace kernel {


// Scaled scalar Jacobi apply: for each (row, col) entry of the source,
// result = beta * result + alpha * source / diag[row].
// One thread handles one matrix entry; the launch is sized from
// num_rows * num_cols, so only the row bound needs an explicit guard
// (col = tid % num_cols is always in range).
template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void scalar_apply(
    size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag,
    const ValueType *__restrict__ alpha, size_type source_stride,
    const ValueType *__restrict__ source_values,
    const ValueType *__restrict__ beta, size_type result_stride,
    ValueType *__restrict__ result_values)
{
    const auto global_id = thread::get_thread_id_flat();
    const auto row = global_id / num_cols;
    const auto col = global_id % num_cols;

    if (row >= num_rows) {
        return;
    }
    // A zero diagonal entry is replaced by one so the entry is passed
    // through unscaled instead of producing Inf/NaN.
    const auto d = diag[row];
    const auto scale = (d == zero<ValueType>()) ? one<ValueType>() : d;
    const auto out_idx = row * result_stride + col;
    result_values[out_idx] =
        beta[0] * result_values[out_idx] +
        alpha[0] * source_values[row * source_stride + col] / scale;
}


// Plain scalar Jacobi apply: result = source / diag[row], entrywise.
// One thread per (row, col) entry; only the row bound needs a guard since
// col = tid % num_cols is always in range.
template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void simple_scalar_apply(
    size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag,
    size_type source_stride, const ValueType *__restrict__ source_values,
    size_type result_stride, ValueType *__restrict__ result_values)
{
    const auto global_id = thread::get_thread_id_flat();
    const auto row = global_id / num_cols;
    const auto col = global_id % num_cols;

    if (row >= num_rows) {
        return;
    }
    // A zero diagonal entry is replaced by one so the entry is copied
    // through unscaled instead of producing Inf/NaN.
    const auto d = diag[row];
    const auto scale = (d == zero<ValueType>()) ? one<ValueType>() : d;
    result_values[row * result_stride + col] =
        source_values[row * source_stride + col] / scale;
}


} // namespace kernel
11 changes: 11 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1081,6 +1081,17 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);

// Hook-module stub: scalar Jacobi apply is not compiled for this backend;
// calling it raises a "not compiled" error at runtime.
template <typename ValueType>
GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);

// Hook-module stub: simple (unscaled) scalar Jacobi apply, same pattern.
template <typename ValueType>
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
    GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
Expand Down
88 changes: 55 additions & 33 deletions core/preconditioner/jacobi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ namespace jacobi {


GKO_REGISTER_OPERATION(simple_apply, jacobi::simple_apply);
GKO_REGISTER_OPERATION(simple_scalar_apply, jacobi::simple_scalar_apply);
GKO_REGISTER_OPERATION(apply, jacobi::apply);
GKO_REGISTER_OPERATION(scalar_apply, jacobi::scalar_apply);
GKO_REGISTER_OPERATION(find_blocks, jacobi::find_blocks);
GKO_REGISTER_OPERATION(generate, jacobi::generate);
GKO_REGISTER_OPERATION(transpose_jacobi, jacobi::transpose_jacobi);
Expand All @@ -74,10 +76,15 @@ void Jacobi<ValueType, IndexType>::apply_impl(const LinOp *b, LinOp *x) const
{
precision_dispatch_real_complex<ValueType>(
[this](auto dense_b, auto dense_x) {
this->get_executor()->run(jacobi::make_simple_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_b, dense_x));
if (parameters_.max_block_size == 1) {
this->get_executor()->run(jacobi::make_simple_scalar_apply(
this->blocks_, dense_b, dense_x));
} else {
this->get_executor()->run(jacobi::make_simple_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_b, dense_x));
}
},
b, x);
}
Expand All @@ -90,11 +97,16 @@ void Jacobi<ValueType, IndexType>::apply_impl(const LinOp *alpha,
{
precision_dispatch_real_complex<ValueType>(
[this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
this->get_executor()->run(jacobi::make_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_alpha, dense_b,
dense_beta, dense_x));
if (parameters_.max_block_size == 1) {
this->get_executor()->run(jacobi::make_scalar_apply(
this->blocks_, dense_alpha, dense_b, dense_beta, dense_x));
} else {
this->get_executor()->run(jacobi::make_apply(
num_blocks_, parameters_.max_block_size, storage_scheme_,
parameters_.storage_optimization.block_wise,
parameters_.block_pointers, blocks_, dense_alpha, dense_b,
dense_beta, dense_x));
}
},
alpha, b, beta, x);
}
Expand Down Expand Up @@ -218,33 +230,43 @@ void Jacobi<ValueType, IndexType>::generate(const LinOp *system_matrix,
GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
using csr_type = matrix::Csr<ValueType, IndexType>;
const auto exec = this->get_executor();
auto csr_mtx =
convert_to_with_sorting<csr_type>(exec, system_matrix, skip_sorting);

if (parameters_.block_pointers.get_data() == nullptr) {
this->detect_blocks(csr_mtx.get());
}
if (parameters_.max_block_size == 1) {
auto diag = as<DiagonalExtractable<ValueType>>(system_matrix)
->extract_diagonal();
auto temp = gko::Array<ValueType>::view(
exec, system_matrix->get_size()[0], diag->get_values());
this->blocks_ = temp;
} else {
auto csr_mtx = convert_to_with_sorting<csr_type>(exec, system_matrix,
skip_sorting);

if (parameters_.block_pointers.get_data() == nullptr) {
this->detect_blocks(csr_mtx.get());
}

const auto all_block_opt = parameters_.storage_optimization.of_all_blocks;
auto &precisions = parameters_.storage_optimization.block_wise;
// if adaptive version is used, make sure that the precision array is of
// the correct size by replicating it multiple times if needed
if (parameters_.storage_optimization.is_block_wise ||
all_block_opt != precision_reduction(0, 0)) {
if (!parameters_.storage_optimization.is_block_wise) {
precisions = gko::Array<precision_reduction>(exec, {all_block_opt});
const auto all_block_opt =
parameters_.storage_optimization.of_all_blocks;
auto &precisions = parameters_.storage_optimization.block_wise;
// if adaptive version is used, make sure that the precision array is of
// the correct size by replicating it multiple times if needed
if (parameters_.storage_optimization.is_block_wise ||
all_block_opt != precision_reduction(0, 0)) {
if (!parameters_.storage_optimization.is_block_wise) {
precisions =
gko::Array<precision_reduction>(exec, {all_block_opt});
}
Array<precision_reduction> tmp(
exec, parameters_.block_pointers.get_num_elems() - 1);
exec->run(jacobi::make_initialize_precisions(precisions, tmp));
precisions = std::move(tmp);
conditioning_.resize_and_reset(num_blocks_);
}
Array<precision_reduction> tmp(
exec, parameters_.block_pointers.get_num_elems() - 1);
exec->run(jacobi::make_initialize_precisions(precisions, tmp));
precisions = std::move(tmp);
conditioning_.resize_and_reset(num_blocks_);
}

exec->run(jacobi::make_generate(
csr_mtx.get(), num_blocks_, parameters_.max_block_size,
parameters_.accuracy, storage_scheme_, conditioning_, precisions,
parameters_.block_pointers, blocks_));
exec->run(jacobi::make_generate(
csr_mtx.get(), num_blocks_, parameters_.max_block_size,
parameters_.accuracy, storage_scheme_, conditioning_, precisions,
parameters_.block_pointers, blocks_));
}
}


Expand Down
17 changes: 17 additions & 0 deletions core/preconditioner/jacobi_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ namespace kernels {
const matrix::Dense<ValueType> *b, \
const matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType) \
void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec, \
const Array<ValueType> &diag, \
const matrix::Dense<ValueType> *b, \
matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType) \
void simple_apply( \
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
Expand All @@ -85,6 +91,13 @@ namespace kernels {
const Array<ValueType> &blocks, const matrix::Dense<ValueType> *b, \
matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType) \
void scalar_apply( \
std::shared_ptr<const DefaultExecutor> exec, \
const Array<ValueType> &diag, const matrix::Dense<ValueType> *alpha, \
const matrix::Dense<ValueType> *b, \
const matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *x)

#define GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType) \
void transpose_jacobi( \
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
Expand Down Expand Up @@ -127,8 +140,12 @@ namespace kernels {
GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_GENERATE_KERNEL(ValueType, IndexType); \
template <typename ValueType> \
GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_APPLY_KERNEL(ValueType, IndexType); \
template <typename ValueType> \
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
Expand Down
67 changes: 62 additions & 5 deletions cuda/preconditioner/jacobi_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,14 @@ constexpr int default_num_warps = 32;
// current GPUs have at most 84 SMs)
constexpr int default_grid_size = 32 * 32 * 128;

constexpr int default_block_size = 512;


#include "common/preconditioner/jacobi_kernels.hpp.inc"


template <typename ValueType, typename IndexType>
size_type find_natural_blocks(std::shared_ptr<const CudaExecutor> exec,
size_type find_natural_blocks(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType> *mtx,
int32 max_block_size,
IndexType *__restrict__ block_ptrs)
Expand All @@ -95,7 +97,7 @@ size_type find_natural_blocks(std::shared_ptr<const CudaExecutor> exec,

template <typename IndexType>
inline size_type agglomerate_supervariables(
std::shared_ptr<const CudaExecutor> exec, int32 max_block_size,
std::shared_ptr<const DefaultExecutor> exec, int32 max_block_size,
size_type num_natural_blocks, IndexType *block_ptrs)
{
Array<size_type> nums(exec, 1);
Expand All @@ -111,7 +113,7 @@ inline size_type agglomerate_supervariables(
} // namespace


void initialize_precisions(std::shared_ptr<const CudaExecutor> exec,
void initialize_precisions(std::shared_ptr<const DefaultExecutor> exec,
const Array<precision_reduction> &source,
Array<precision_reduction> &precisions)
{
Expand All @@ -126,7 +128,7 @@ void initialize_precisions(std::shared_ptr<const CudaExecutor> exec,


template <typename ValueType, typename IndexType>
void find_blocks(std::shared_ptr<const CudaExecutor> exec,
void find_blocks(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType> *system_matrix,
uint32 max_block_size, size_type &num_blocks,
Array<IndexType> &block_pointers)
Expand Down Expand Up @@ -230,7 +232,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

template <typename ValueType, typename IndexType>
void convert_to_dense(
std::shared_ptr<const CudaExecutor> exec, size_type num_blocks,
std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
const Array<precision_reduction> &block_precisions,
const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
const preconditioner::block_interleaved_storage_scheme<IndexType>
Expand All @@ -241,6 +243,61 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);


/**
 * Applies the scalar Jacobi preconditioner with scaling:
 * x = beta * x + alpha * inv(diag) * b, entrywise per row.
 *
 * @param exec   the executor the kernel runs on
 * @param diag   the extracted matrix diagonal (one value per row)
 * @param alpha  scaling for the preconditioned rhs (1x1 dense)
 * @param b      the right-hand side vector(s)
 * @param beta   scaling for the previous x (1x1 dense)
 * @param x      the result vector(s), updated in place
 */
template <typename ValueType>
void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
                  const Array<ValueType> &diag,
                  const matrix::Dense<ValueType> *alpha,
                  const matrix::Dense<ValueType> *b,
                  const matrix::Dense<ValueType> *beta,
                  matrix::Dense<ValueType> *x)
{
    const auto b_size = b->get_size();
    const auto num_rows = b_size[0];
    const auto num_cols = b_size[1];
    const auto b_stride = b->get_stride();
    const auto x_stride = x->get_stride();
    // one thread per matrix entry
    const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size);

    const auto b_values = b->get_const_values();
    const auto diag_values = diag.get_const_data();
    auto x_values = x->get_values();

    // Guard against empty vectors: a zero-dimension grid is an invalid
    // CUDA launch configuration and would set a sticky launch error.
    if (grid_dim > 0) {
        kernel::scalar_apply<<<grid_dim, default_block_size>>>(
            num_rows, num_cols, as_cuda_type(diag_values),
            as_cuda_type(alpha->get_const_values()), b_stride,
            as_cuda_type(b_values), as_cuda_type(beta->get_const_values()),
            x_stride, as_cuda_type(x_values));
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);


/**
 * Applies the scalar Jacobi preconditioner without scaling:
 * x = inv(diag) * b, entrywise per row.
 *
 * @param exec  the executor the kernel runs on
 * @param diag  the extracted matrix diagonal (one value per row)
 * @param b     the right-hand side vector(s)
 * @param x     the result vector(s), overwritten
 */
template <typename ValueType>
void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
                         const Array<ValueType> &diag,
                         const matrix::Dense<ValueType> *b,
                         matrix::Dense<ValueType> *x)
{
    const auto b_size = b->get_size();
    const auto num_rows = b_size[0];
    const auto num_cols = b_size[1];
    const auto b_stride = b->get_stride();
    const auto x_stride = x->get_stride();
    // one thread per matrix entry
    const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size);

    const auto b_values = b->get_const_values();
    const auto diag_values = diag.get_const_data();
    auto x_values = x->get_values();

    // Guard against empty vectors: a zero-dimension grid is an invalid
    // CUDA launch configuration and would set a sticky launch error.
    if (grid_dim > 0) {
        kernel::simple_scalar_apply<<<grid_dim, default_block_size>>>(
            num_rows, num_cols, as_cuda_type(diag_values), b_stride,
            as_cuda_type(b_values), x_stride, as_cuda_type(x_values));
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);


} // namespace jacobi
} // namespace cuda
} // namespace kernels
Expand Down
Loading

0 comments on commit 9cebc13

Please sign in to comment.