diff --git a/common/preconditioner/jacobi_kernels.hpp.inc b/common/preconditioner/jacobi_kernels.hpp.inc index c3fa889b210..61678280d56 100644 --- a/common/preconditioner/jacobi_kernels.hpp.inc +++ b/common/preconditioner/jacobi_kernels.hpp.inc @@ -213,3 +213,50 @@ __launch_bounds__(warps_per_block *config::warp_size) adaptive_transpose_jacobi( }); } } + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void scalar_apply( + size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag, + const ValueType *__restrict__ alpha, size_type source_stride, + const ValueType *__restrict__ source_values, + const ValueType *__restrict__ beta, size_type result_stride, + ValueType *__restrict__ result_values) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto row = tidx / num_cols; + const auto col = tidx % num_cols; + + if (row < num_rows) { + auto diag_val = + diag[row] == zero() ? one() : diag[row]; + result_values[row * result_stride + col] = + result_values[row * result_stride + col] * beta[0] + + alpha[0] * source_values[row * source_stride + col] / diag_val; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void simple_scalar_apply( + size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag, + size_type source_stride, const ValueType *__restrict__ source_values, + size_type result_stride, ValueType *__restrict__ result_values) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto row = tidx / num_cols; + const auto col = tidx % num_cols; + + if (row < num_rows) { + auto diag_val = + diag[row] == zero() ? one() : diag[row]; + result_values[row * result_stride + col] = + source_values[row * source_stride + col] / diag_val; + } +} + + +} // namespace kernel diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index c4244c8317c..4f09888e88f 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -1081,6 +1081,17 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE); GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); +template +GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); + +template +GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType) +GKO_NOT_COMPILED(GKO_HOOK_MODULE); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); + template GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index 5f135b6c010..8270265c2ec 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -57,7 +57,9 @@ namespace jacobi { GKO_REGISTER_OPERATION(simple_apply, jacobi::simple_apply); +GKO_REGISTER_OPERATION(simple_scalar_apply, jacobi::simple_scalar_apply); GKO_REGISTER_OPERATION(apply, jacobi::apply); +GKO_REGISTER_OPERATION(scalar_apply, jacobi::scalar_apply); GKO_REGISTER_OPERATION(find_blocks, jacobi::find_blocks); GKO_REGISTER_OPERATION(generate, jacobi::generate); GKO_REGISTER_OPERATION(transpose_jacobi, jacobi::transpose_jacobi); @@ -74,10 +76,15 @@ void Jacobi::apply_impl(const LinOp *b, LinOp *x) const { precision_dispatch_real_complex( [this](auto dense_b, auto dense_x) { - this->get_executor()->run(jacobi::make_simple_apply( - num_blocks_, parameters_.max_block_size, storage_scheme_, - parameters_.storage_optimization.block_wise, - parameters_.block_pointers, blocks_, dense_b, dense_x)); + if (parameters_.max_block_size == 1) { + this->get_executor()->run(jacobi::make_simple_scalar_apply( + this->blocks_, dense_b, dense_x)); + } else { + this->get_executor()->run(jacobi::make_simple_apply( + num_blocks_, parameters_.max_block_size, storage_scheme_, + parameters_.storage_optimization.block_wise, + parameters_.block_pointers, blocks_, dense_b, dense_x)); + } }, b, x); } @@ -90,11 +97,16 @@ void Jacobi::apply_impl(const LinOp *alpha, { precision_dispatch_real_complex( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { - this->get_executor()->run(jacobi::make_apply( - num_blocks_, parameters_.max_block_size, storage_scheme_, - parameters_.storage_optimization.block_wise, - parameters_.block_pointers, blocks_, dense_alpha, dense_b, - dense_beta, dense_x)); + if (parameters_.max_block_size == 1) { + this->get_executor()->run(jacobi::make_scalar_apply( + this->blocks_, dense_alpha, dense_b, dense_beta, dense_x)); + } else { + this->get_executor()->run(jacobi::make_apply( + num_blocks_, parameters_.max_block_size, storage_scheme_, + parameters_.storage_optimization.block_wise, + parameters_.block_pointers, blocks_, dense_alpha, dense_b, + dense_beta, dense_x)); + } }, alpha, b, beta, x); } @@ -218,33 +230,43 @@ void Jacobi::generate(const LinOp *system_matrix, GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix); using csr_type = matrix::Csr; const auto exec = this->get_executor(); - auto csr_mtx = - convert_to_with_sorting(exec, system_matrix, skip_sorting); - - if (parameters_.block_pointers.get_data() == nullptr) { - this->detect_blocks(csr_mtx.get()); - } + if (parameters_.max_block_size == 1) { + auto diag = as>(system_matrix) + ->extract_diagonal(); + auto temp = gko::Array::view( + exec, system_matrix->get_size()[0], diag->get_values()); + this->blocks_ = temp; + } else { + auto csr_mtx = convert_to_with_sorting(exec, system_matrix, + skip_sorting); + + if (parameters_.block_pointers.get_data() == nullptr) { + this->detect_blocks(csr_mtx.get()); + } - const auto all_block_opt = parameters_.storage_optimization.of_all_blocks; - auto &precisions = parameters_.storage_optimization.block_wise; - // if adaptive version is used, make sure that the precision array is of - // the correct size by replicating it multiple times if needed - if (parameters_.storage_optimization.is_block_wise || - all_block_opt != precision_reduction(0, 0)) { - if (!parameters_.storage_optimization.is_block_wise) { - precisions = gko::Array(exec, {all_block_opt}); + const auto all_block_opt = + parameters_.storage_optimization.of_all_blocks; + auto &precisions = parameters_.storage_optimization.block_wise; + // if adaptive version is used, make sure that the precision array is of + // the correct size by replicating it multiple times if needed + if (parameters_.storage_optimization.is_block_wise || + all_block_opt != precision_reduction(0, 0)) { + if (!parameters_.storage_optimization.is_block_wise) { + precisions = + gko::Array(exec, {all_block_opt}); + } + Array tmp( + exec, parameters_.block_pointers.get_num_elems() - 1); + exec->run(jacobi::make_initialize_precisions(precisions, tmp)); + precisions = std::move(tmp); + conditioning_.resize_and_reset(num_blocks_); } - Array tmp( - exec, parameters_.block_pointers.get_num_elems() - 1); - exec->run(jacobi::make_initialize_precisions(precisions, tmp)); - precisions = std::move(tmp); - conditioning_.resize_and_reset(num_blocks_); - } - exec->run(jacobi::make_generate( - csr_mtx.get(), num_blocks_, parameters_.max_block_size, - parameters_.accuracy, storage_scheme_, conditioning_, precisions, - parameters_.block_pointers, blocks_)); + exec->run(jacobi::make_generate( + csr_mtx.get(), num_blocks_, parameters_.max_block_size, + parameters_.accuracy, storage_scheme_, conditioning_, precisions, + parameters_.block_pointers, blocks_)); + } } diff --git a/core/preconditioner/jacobi_kernels.hpp b/core/preconditioner/jacobi_kernels.hpp index e5738e9f51f..22cfa512bb8 100644 --- a/core/preconditioner/jacobi_kernels.hpp +++ b/core/preconditioner/jacobi_kernels.hpp @@ -74,6 +74,12 @@ namespace kernels { const matrix::Dense *b, \ const matrix::Dense *beta, matrix::Dense *x) +#define GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType) \ + void simple_scalar_apply(std::shared_ptr exec, \ + const Array &diag, \ + const matrix::Dense *b, \ + matrix::Dense *x) + #define GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType) \ void simple_apply( \ std::shared_ptr exec, size_type num_blocks, \ @@ -85,6 +91,13 @@ namespace kernels { const Array &blocks, const matrix::Dense *b, \ matrix::Dense *x) +#define GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType) \ + void scalar_apply( \ + std::shared_ptr exec, \ + const Array &diag, const matrix::Dense *alpha, \ + const matrix::Dense *b, \ + const matrix::Dense *beta, matrix::Dense *x) + #define GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType) \ void transpose_jacobi( \ std::shared_ptr exec, size_type num_blocks, \ @@ -127,8 +140,12 @@ namespace kernels { GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL(ValueType, IndexType); \ template \ GKO_DECLARE_JACOBI_GENERATE_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType); \ template \ GKO_DECLARE_JACOBI_APPLY_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType); \ template \ GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType); \ template \ diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index 889fd1b0db6..bb9b918f5d9 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -65,12 +65,14 @@ constexpr int default_num_warps = 32; // current GPUs have at most 84 SMs) constexpr int default_grid_size = 32 * 32 * 128; +constexpr int default_block_size = 512; + #include "common/preconditioner/jacobi_kernels.hpp.inc" template -size_type find_natural_blocks(std::shared_ptr exec, +size_type find_natural_blocks(std::shared_ptr exec, const matrix::Csr *mtx, int32 max_block_size, IndexType *__restrict__ block_ptrs) @@ -95,7 +97,7 @@ size_type find_natural_blocks(std::shared_ptr exec, template inline size_type agglomerate_supervariables( - std::shared_ptr exec, int32 max_block_size, + std::shared_ptr exec, int32 max_block_size, size_type num_natural_blocks, IndexType *block_ptrs) { Array nums(exec, 1); @@ -111,7 +113,7 @@ inline size_type agglomerate_supervariables( } // namespace -void initialize_precisions(std::shared_ptr exec, +void initialize_precisions(std::shared_ptr exec, const Array &source, Array &precisions) { @@ -126,7 +128,7 @@ void initialize_precisions(std::shared_ptr exec, template -void find_blocks(std::shared_ptr exec, +void find_blocks(std::shared_ptr exec, const matrix::Csr *system_matrix, uint32 max_block_size, size_type &num_blocks, Array &block_pointers) @@ -230,7 +232,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense( - std::shared_ptr exec, size_type num_blocks, + std::shared_ptr exec, size_type num_blocks, const Array &block_precisions, const Array &block_pointers, const Array &blocks, const preconditioner::block_interleaved_storage_scheme @@ -241,6 +243,61 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); +template +void scalar_apply(std::shared_ptr exec, + const Array &diag, + const matrix::Dense *alpha, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *x) +{ + const auto b_size = b->get_size(); + const auto num_rows = b_size[0]; + const auto num_cols = b_size[1]; + const auto b_stride = b->get_stride(); + const auto x_stride = x->get_stride(); + const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size); + + const auto b_values = b->get_const_values(); + const auto diag_values = diag.get_const_data(); + auto x_values = x->get_values(); + + kernel::scalar_apply<<>>( + num_rows, num_cols, as_cuda_type(diag_values), + as_cuda_type(alpha->get_const_values()), b_stride, + as_cuda_type(b_values), as_cuda_type(beta->get_const_values()), + x_stride, as_cuda_type(x_values)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); + + +template +void simple_scalar_apply(std::shared_ptr exec, + const Array &diag, + const matrix::Dense *b, + matrix::Dense *x) +{ + const auto b_size = b->get_size(); + const auto num_rows = b_size[0]; + const auto num_cols = b_size[1]; + const auto b_stride = b->get_stride(); + const auto x_stride = x->get_stride(); + const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size); + + const auto b_values = b->get_const_values(); + const auto diag_values = diag.get_const_data(); + auto x_values = x->get_values(); + + kernel::simple_scalar_apply<<>>( + num_rows, num_cols, as_cuda_type(diag_values), b_stride, + as_cuda_type(b_values), x_stride, as_cuda_type(x_values)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); + + } // namespace jacobi } // namespace cuda } // namespace kernels diff --git a/cuda/test/preconditioner/jacobi_kernels.cpp b/cuda/test/preconditioner/jacobi_kernels.cpp index aafd6546996..87e0cbb2ab9 100644 --- a/cuda/test/preconditioner/jacobi_kernels.cpp +++ b/cuda/test/preconditioner/jacobi_kernels.cpp @@ -412,6 +412,37 @@ TEST_F(Jacobi, CudaApplyEquivalentToRef) } +TEST_F(Jacobi, CudaScalarApplyEquivalentToRef) +{ + gko::size_type dim = 313; + std::ranlux48 engine(42); + auto dense_smtx = gko::share(gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution<>(1.0, 2.0), engine, ref)); + gko::test::make_diag_dominant(dense_smtx.get()); + auto smtx = gko::share(Mtx::create(ref)); + smtx->copy_from(dense_smtx.get()); + auto sb = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref)); + auto sx = Vec::create(ref, sb->get_size()); + + auto d_smtx = gko::share(Mtx::create(cuda)); + auto d_sb = gko::share(Vec::create(cuda)); + auto d_sx = gko::share(Vec::create(cuda, sb->get_size())); + d_smtx->copy_from(smtx.get()); + d_sb->copy_from(sb.get()); + + auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx); + auto d_sj = Bj::build().with_max_block_size(1u).on(cuda)->generate(d_smtx); + + sj->apply(sb.get(), sx.get()); + d_sj->apply(d_sb.get(), d_sx.get()); + + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); +} + + TEST_F(Jacobi, CudaLinearCombinationApplyEquivalentToRef) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, @@ -430,6 +461,46 @@ TEST_F(Jacobi, CudaLinearCombinationApplyEquivalentToRef) } +TEST_F(Jacobi, CudaScalarLinearCombinationApplyEquivalentToRef) +{ + gko::size_type dim = 313; + std::ranlux48 engine(42); + auto dense_smtx = gko::share(gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution<>(1.0, 2.0), engine, ref)); + gko::test::make_diag_dominant(dense_smtx.get()); + auto smtx = gko::share(Mtx::create(ref)); + smtx->copy_from(dense_smtx.get()); + auto sb = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); + auto sx = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); + + auto d_smtx = gko::share(Mtx::create(cuda)); + auto d_sb = gko::share(Vec::create(cuda)); + auto d_sx = gko::share(Vec::create(cuda)); + d_smtx->copy_from(smtx.get()); + d_sb->copy_from(sb.get()); + d_sx->copy_from(sx.get()); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, cuda); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, cuda); + + auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx); + auto d_sj = Bj::build().with_max_block_size(1u).on(cuda)->generate(d_smtx); + + sj->apply(alpha.get(), sb.get(), beta.get(), sx.get()); + d_sj->apply(d_alpha.get(), d_sb.get(), d_beta.get(), d_sx.get()); + + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); +} + + TEST_F(Jacobi, CudaApplyToMultipleVectorsEquivalentToRef) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 623e2c121ec..3b7f1efb90f 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -280,6 +280,27 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); +template +void scalar_apply(std::shared_ptr exec, + const Array &diag, + const matrix::Dense *alpha, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *x) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); + + +template +void simple_scalar_apply(std::shared_ptr exec, + const Array &diag, + const matrix::Dense *b, + matrix::Dense *x) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); + + } // namespace jacobi } // namespace dpcpp } // namespace kernels diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index fc020837a22..57d513774b5 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -72,6 +72,7 @@ constexpr int default_num_warps = 32; // current GPUs have at most 84 SMs) constexpr int default_grid_size = 32 * 32 * 128; +constexpr int default_block_size = 512; #include "common/preconditioner/jacobi_kernels.hpp.inc" @@ -256,6 +257,62 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); +template +void scalar_apply(std::shared_ptr exec, + const Array &diag, + const matrix::Dense *alpha, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *x) +{ + const auto b_size = b->get_size(); + const auto num_rows = b_size[0]; + const auto num_cols = b_size[1]; + const auto b_stride = b->get_stride(); + const auto x_stride = x->get_stride(); + const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size); + + const auto b_values = b->get_const_values(); + const auto diag_values = diag.get_const_data(); + auto x_values = x->get_values(); + + hipLaunchKernelGGL( + kernel::scalar_apply, dim3(grid_dim), dim3(default_block_size), 0, 0, + num_rows, num_cols, as_hip_type(diag_values), + as_hip_type(alpha->get_const_values()), b_stride, as_hip_type(b_values), + as_hip_type(beta->get_const_values()), x_stride, as_hip_type(x_values)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); + + +template +void simple_scalar_apply(std::shared_ptr exec, + const Array &diag, + const matrix::Dense *b, + matrix::Dense *x) +{ + const auto b_size = b->get_size(); + const auto num_rows = b_size[0]; + const auto num_cols = b_size[1]; + const auto b_stride = b->get_stride(); + const auto x_stride = x->get_stride(); + const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size); + + const auto b_values = b->get_const_values(); + const auto diag_values = diag.get_const_data(); + auto x_values = x->get_values(); + + hipLaunchKernelGGL(kernel::simple_scalar_apply, dim3(grid_dim), + dim3(default_block_size), 0, 0, num_rows, num_cols, + as_hip_type(diag_values), b_stride, + as_hip_type(b_values), x_stride, as_hip_type(x_values)); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); + + } // namespace jacobi } // namespace hip } // namespace kernels diff --git a/hip/test/preconditioner/jacobi_kernels.cpp b/hip/test/preconditioner/jacobi_kernels.cpp index 3931f666f12..4ef05dc7fff 100644 --- a/hip/test/preconditioner/jacobi_kernels.cpp +++ b/hip/test/preconditioner/jacobi_kernels.cpp @@ -443,6 +443,37 @@ TEST_F(Jacobi, HipApplyEquivalentToRef) } +TEST_F(Jacobi, HipScalarApplyEquivalentToRef) +{ + gko::size_type dim = 313; + std::ranlux48 engine(42); + auto dense_smtx = gko::share(gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution<>(1.0, 2.0), engine, ref)); + gko::test::make_diag_dominant(dense_smtx.get()); + auto smtx = gko::share(Mtx::create(ref)); + smtx->copy_from(dense_smtx.get()); + auto sb = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref)); + auto sx = Vec::create(ref, sb->get_size()); + + auto d_smtx = gko::share(Mtx::create(hip)); + auto d_sb = gko::share(Vec::create(hip)); + auto d_sx = gko::share(Vec::create(hip, sb->get_size())); + d_smtx->copy_from(smtx.get()); + d_sb->copy_from(sb.get()); + + auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx); + auto d_sj = Bj::build().with_max_block_size(1u).on(hip)->generate(d_smtx); + + sj->apply(sb.get(), sx.get()); + d_sj->apply(d_sb.get(), d_sx.get()); + + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); +} + + TEST_F(Jacobi, HipLinearCombinationApplyEquivalentToRef) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, @@ -461,6 +492,46 @@ TEST_F(Jacobi, HipLinearCombinationApplyEquivalentToRef) } +TEST_F(Jacobi, HipScalarLinearCombinationApplyEquivalentToRef) +{ + gko::size_type dim = 313; + std::ranlux48 engine(42); + auto dense_smtx = gko::share(gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution<>(1.0, 2.0), engine, ref)); + gko::test::make_diag_dominant(dense_smtx.get()); + auto smtx = gko::share(Mtx::create(ref)); + smtx->copy_from(dense_smtx.get()); + auto sb = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); + auto sx = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); + + auto d_smtx = gko::share(Mtx::create(hip)); + auto d_sb = gko::share(Vec::create(hip)); + auto d_sx = gko::share(Vec::create(hip)); + d_smtx->copy_from(smtx.get()); + d_sb->copy_from(sb.get()); + d_sx->copy_from(sx.get()); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, hip); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, hip); + + auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx); + auto d_sj = Bj::build().with_max_block_size(1u).on(hip)->generate(d_smtx); + + sj->apply(alpha.get(), sb.get(), beta.get(), sx.get()); + d_sj->apply(d_alpha.get(), d_sb.get(), d_beta.get(), d_sx.get()); + + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); +} + + TEST_F(Jacobi, HipApplyToMultipleVectorsEquivalentToRef) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, diff --git a/include/ginkgo/core/preconditioner/jacobi.hpp b/include/ginkgo/core/preconditioner/jacobi.hpp index 85924ac3772..dd05296abde 100644 --- a/include/ginkgo/core/preconditioner/jacobi.hpp +++ b/include/ginkgo/core/preconditioner/jacobi.hpp @@ -36,7 +36,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include +#include namespace gko { diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp index 1b0c0c6c1a5..605ba433578 100644 --- a/omp/preconditioner/jacobi_kernels.cpp +++ b/omp/preconditioner/jacobi_kernels.cpp @@ -696,6 +696,50 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); +template +void scalar_apply(std::shared_ptr exec, + const Array &diag, + const matrix::Dense *alpha, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *x) +{ +#pragma omp parallel for + for (size_type i = 0; i < x->get_size()[0]; ++i) { + for (size_type j = 0; j < x->get_size()[1]; ++j) { + auto diag_val = diag.get_const_data()[i] == zero() + ? one() + : diag.get_const_data()[i]; + x->at(i, j) = beta->at(0) * x->at(i, j) + + alpha->at(0) * b->at(i, j) / diag_val; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); + + +template +void simple_scalar_apply(std::shared_ptr exec, + const Array &diag, + const matrix::Dense *b, + matrix::Dense *x) +{ +#pragma omp parallel for + for (size_type i = 0; i < x->get_size()[0]; ++i) { + for (size_type j = 0; j < x->get_size()[1]; ++j) { + auto diag_val = diag.get_const_data()[i] == zero() + ? one() + : diag.get_const_data()[i]; + x->at(i, j) = b->at(i, j) / diag_val; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); + + } // namespace jacobi } // namespace omp } // namespace kernels diff --git a/omp/test/preconditioner/jacobi_kernels.cpp b/omp/test/preconditioner/jacobi_kernels.cpp index 78663913f71..095729a5e9a 100644 --- a/omp/test/preconditioner/jacobi_kernels.cpp +++ b/omp/test/preconditioner/jacobi_kernels.cpp @@ -397,6 +397,37 @@ TEST_F(Jacobi, OmpApplyEquivalentToRefWithDifferentBlockSize) } +TEST_F(Jacobi, OmpScalarApplyEquivalentToRef) +{ + gko::size_type dim = 313; + std::ranlux48 engine(42); + auto dense_smtx = gko::share(gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution<>(1.0, 2.0), engine, ref)); + gko::test::make_diag_dominant(dense_smtx.get()); + auto smtx = gko::share(Mtx::create(ref)); + smtx->copy_from(dense_smtx.get()); + auto sb = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref)); + auto sx = Vec::create(ref, sb->get_size()); + + auto d_smtx = gko::share(Mtx::create(omp)); + auto d_sb = gko::share(Vec::create(omp)); + auto d_sx = gko::share(Vec::create(omp, sb->get_size())); + d_smtx->copy_from(smtx.get()); + d_sb->copy_from(sb.get()); + + auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx); + auto d_sj = Bj::build().with_max_block_size(1u).on(omp)->generate(d_smtx); + + sj->apply(sb.get(), sx.get()); + d_sj->apply(d_sb.get(), d_sx.get()); + + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); +} + + TEST_F(Jacobi, OmpApplyEquivalentToRef) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, @@ -429,6 +460,46 @@ TEST_F(Jacobi, OmpLinearCombinationApplyEquivalentToRef) } +TEST_F(Jacobi, OmpScalarLinearCombinationApplyEquivalentToRef) +{ + gko::size_type dim = 5; + std::ranlux48 engine(42); + auto dense_smtx = gko::share(gko::test::generate_random_matrix( + dim, dim, std::uniform_int_distribution<>(1, dim), + std::normal_distribution<>(1.0, 2.0), engine, ref)); + gko::test::make_diag_dominant(dense_smtx.get()); + auto smtx = gko::share(Mtx::create(ref)); + smtx->copy_from(dense_smtx.get()); + auto sb = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); + auto sx = gko::share(gko::test::generate_random_matrix( + dim, 3, std::uniform_int_distribution<>(1, 1), + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); + + auto d_smtx = gko::share(Mtx::create(omp)); + auto d_sb = gko::share(Vec::create(omp)); + auto d_sx = gko::share(Vec::create(omp)); + d_smtx->copy_from(smtx.get()); + d_sb->copy_from(sb.get()); + d_sx->copy_from(sx.get()); + auto alpha = gko::initialize({2.0}, ref); + auto d_alpha = gko::initialize({2.0}, omp); + auto beta = gko::initialize({-1.0}, ref); + auto d_beta = gko::initialize({-1.0}, omp); + + auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx); + auto d_sj = Bj::build().with_max_block_size(1u).on(omp)->generate(d_smtx); + + sj->apply(alpha.get(), sb.get(), beta.get(), sx.get()); + d_sj->apply(d_alpha.get(), d_sb.get(), d_beta.get(), d_sx.get()); + + GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12); +} + + TEST_F(Jacobi, OmpApplyToMultipleVectorsEquivalentToRef) { initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13, diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp index b96af16dca2..a176b01e4bf 100644 --- a/reference/preconditioner/jacobi_kernels.cpp +++ b/reference/preconditioner/jacobi_kernels.cpp @@ -136,7 +136,7 @@ inline size_type agglomerate_supervariables(uint32 max_block_size, template -void find_blocks(std::shared_ptr exec, +void find_blocks(std::shared_ptr exec, const matrix::Csr *system_matrix, uint32 max_block_size, size_type &num_blocks, Array &block_pointers) @@ -310,7 +310,7 @@ inline bool invert_block(IndexType block_size, IndexType *perm, template inline bool validate_precision_reduction_feasibility( - std::shared_ptr exec, IndexType block_size, + std::shared_ptr exec, IndexType block_size, const ValueType *block, size_type stride) { using gko::detail::float_traits; @@ -340,7 +340,7 @@ inline bool validate_precision_reduction_feasibility( template -void generate(std::shared_ptr exec, +void generate(std::shared_ptr exec, const matrix::Csr *system_matrix, size_type num_blocks, uint32 max_block_size, remove_complex accuracy, @@ -480,7 +480,7 @@ inline void apply_block(size_type block_size, size_type num_rhs, } // namespace -void initialize_precisions(std::shared_ptr exec, +void initialize_precisions(std::shared_ptr exec, const Array &source, Array &precisions) { @@ -492,7 +492,7 @@ void initialize_precisions(std::shared_ptr exec, template -void apply(std::shared_ptr exec, size_type num_blocks, +void apply(std::shared_ptr exec, size_type num_blocks, uint32 max_block_size, const preconditioner::block_interleaved_storage_scheme &storage_scheme, @@ -528,7 +528,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL); template void simple_apply( - std::shared_ptr exec, size_type num_blocks, + std::shared_ptr exec, size_type num_blocks, uint32 max_block_size, const preconditioner::block_interleaved_storage_scheme &storage_scheme, @@ -560,6 +560,48 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL); +template +void scalar_apply(std::shared_ptr exec, + const Array &diag, + const matrix::Dense *alpha, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *x) +{ + for (size_type i = 0; i < x->get_size()[0]; ++i) { + for (size_type j = 0; j < x->get_size()[1]; ++j) { + auto diag_val = diag.get_const_data()[i] == zero() + ? one() + : diag.get_const_data()[i]; + x->at(i, j) = beta->at(0) * x->at(i, j) + + alpha->at(0) * b->at(i, j) / diag_val; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL); + + +template +void simple_scalar_apply(std::shared_ptr exec, + const Array &diag, + const matrix::Dense *b, + matrix::Dense *x) +{ + for (size_type i = 0; i < x->get_size()[0]; ++i) { + for (size_type j = 0; j < x->get_size()[1]; ++j) { + auto diag_val = diag.get_const_data()[i] == zero() + ? one() + : diag.get_const_data()[i]; + x->at(i, j) = b->at(i, j) / diag_val; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL); + + template void transpose_jacobi( std::shared_ptr exec, size_type num_blocks, @@ -634,7 +676,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_to_dense( - std::shared_ptr exec, size_type num_blocks, + std::shared_ptr exec, size_type num_blocks, const Array &block_precisions, const Array &block_pointers, const Array &blocks, const preconditioner::block_interleaved_storage_scheme diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp index f9cda1ad532..55192bf36c6 100644 --- a/reference/test/preconditioner/jacobi_kernels.cpp +++ b/reference/test/preconditioner/jacobi_kernels.cpp @@ -75,6 +75,7 @@ class Jacobi : public ::testing::Test { block_pointers.get_data()[2] = 5; block_precisions.get_data()[0] = gko::precision_reduction(0, 1); block_precisions.get_data()[1] = gko::precision_reduction(0, 0); + scalar_j_factory = Bj::build().with_max_block_size(1u).on(exec); bj_factory = Bj::build() .with_max_block_size(3u) .with_block_pointers(block_pointers) @@ -111,6 +112,7 @@ class Jacobi : public ::testing::Test { std::shared_ptr exec; std::unique_ptr bj_factory; + std::unique_ptr scalar_j_factory; std::unique_ptr adaptive_bj_factory; gko::Array block_pointers; gko::Array block_precisions; @@ -656,6 +658,21 @@ TYPED_TEST(Jacobi, AppliesToVector) } +TYPED_TEST(Jacobi, ScalarJacobiAppliesToVector) +{ + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); + auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); + auto sj = this->scalar_j_factory->generate(this->mtx); + + sj->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, -0.25, -0.5, 1.0, -0.25}), + r::value); +} + + TYPED_TEST(Jacobi, AppliesToMixedVector) { using value_type = gko::next_precision; @@ -783,6 +800,32 @@ TYPED_TEST(Jacobi, AppliesToMultipleVectors) } +TYPED_TEST(Jacobi, ScalarJacobiAppliesToMultipleVectors) +{ + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto x = + gko::initialize(3, + {I{1.0, 0.5}, I{-1.0, -0.5}, I{2.0, 1.0}, + I{-2.0, -1.0}, I{3.0, 1.5}}, + this->exec); + auto b = + gko::initialize(3, + {I{4.0, -2.0}, I{-1.0, 4.0}, I{-2.0, 0.0}, + I{4.0, -2.0}, I{-1.0, 4.0}}, + this->exec); + auto sj = this->scalar_j_factory->generate(this->mtx); + + sj->apply(b.get(), x.get()); + + GKO_ASSERT_MTX_NEAR( + x, + l({{1.0, -0.5}, {-0.25, 1.0}, {-0.5, 0.0}, {1.0, -0.5}, {-0.25, 1.0}}), + r::value); +} + + TYPED_TEST(Jacobi, AppliesToMultipleVectorsWithAdaptivePrecision) { using Vec = typename TestFixture::Vec; @@ -859,6 +902,23 @@ TYPED_TEST(Jacobi, AppliesLinearCombinationToVector) } +TYPED_TEST(Jacobi, ScalarJacobiAppliesLinearCombinationToVector) +{ + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); + auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto sj = this->scalar_j_factory->generate(this->mtx); + + sj->apply(alpha.get(), b.get(), beta.get(), x.get()); + + GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.5, -3.0, 4.0, -3.5}), + r::value); +} + + TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedVector) { using value_type = gko::next_precision; @@ -973,6 +1033,34 @@ TYPED_TEST(Jacobi, AppliesLinearCombinationToMultipleVectors) } +TYPED_TEST(Jacobi, ScalarAppliesLinearCombinationToMultipleVectors) +{ + using Vec = typename TestFixture::Vec; + using value_type = typename TestFixture::value_type; + using T = value_type; + auto half_tol = std::sqrt(r::value); + auto x = + gko::initialize(3, + {I{1.0, 0.5}, I{-1.0, -0.5}, I{2.0, 1.0}, + I{-2.0, -1.0}, I{3.0, 1.5}}, + this->exec); + auto b = + gko::initialize(3, + {I{4.0, -2.0}, I{-1.0, 4.0}, I{-2.0, 0.0}, + I{4.0, -2.0}, I{-1.0, 4.0}}, + this->exec); + auto alpha = gko::initialize({2.0}, this->exec); + auto beta = gko::initialize({-1.0}, this->exec); + auto sj = this->scalar_j_factory->generate(this->mtx); + + sj->apply(alpha.get(), b.get(), beta.get(), x.get()); + + GKO_ASSERT_MTX_NEAR( + x, l({{1.0, -1.5}, {0.5, 2.5}, {-3.0, -1.0}, {4.0, 0.0}, {-3.5, 0.5}}), + r::value); +} + + TYPED_TEST(Jacobi, AppliesLinearCombinationToMultipleVectorsWithAdaptivePrecision) {