diff --git a/common/preconditioner/jacobi_kernels.hpp.inc b/common/preconditioner/jacobi_kernels.hpp.inc
index c3fa889b210..61678280d56 100644
--- a/common/preconditioner/jacobi_kernels.hpp.inc
+++ b/common/preconditioner/jacobi_kernels.hpp.inc
@@ -213,3 +213,50 @@ __launch_bounds__(warps_per_block *config::warp_size) adaptive_transpose_jacobi(
             });
     }
 }
+
+
+namespace kernel {
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void scalar_apply(
+    size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag,
+    const ValueType *__restrict__ alpha, size_type source_stride,
+    const ValueType *__restrict__ source_values,
+    const ValueType *__restrict__ beta, size_type result_stride,
+    ValueType *__restrict__ result_values)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / num_cols;
+    const auto col = tidx % num_cols;
+
+    if (row < num_rows) {
+        auto diag_val =
+            diag[row] == zero<ValueType>() ? one<ValueType>() : diag[row];
+        result_values[row * result_stride + col] =
+            result_values[row * result_stride + col] * beta[0] +
+            alpha[0] * source_values[row * source_stride + col] / diag_val;
+    }
+}
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void simple_scalar_apply(
+    size_type num_rows, size_type num_cols, const ValueType *__restrict__ diag,
+    size_type source_stride, const ValueType *__restrict__ source_values,
+    size_type result_stride, ValueType *__restrict__ result_values)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / num_cols;
+    const auto col = tidx % num_cols;
+
+    if (row < num_rows) {
+        auto diag_val =
+            diag[row] == zero<ValueType>() ? one<ValueType>() : diag[row];
+        result_values[row * result_stride + col] =
+            source_values[row * source_stride + col] / diag_val;
+    }
+}
+
+
+}  // namespace kernel
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index c4244c8317c..4f09888e88f 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -1081,6 +1081,17 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
+template <typename ValueType>
+GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
+
+template <typename ValueType>
+GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
+
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp
index 5f135b6c010..8270265c2ec 100644
--- a/core/preconditioner/jacobi.cpp
+++ b/core/preconditioner/jacobi.cpp
@@ -57,7 +57,9 @@ namespace jacobi {
 
 
 GKO_REGISTER_OPERATION(simple_apply, jacobi::simple_apply);
+GKO_REGISTER_OPERATION(simple_scalar_apply, jacobi::simple_scalar_apply);
 GKO_REGISTER_OPERATION(apply, jacobi::apply);
+GKO_REGISTER_OPERATION(scalar_apply, jacobi::scalar_apply);
 GKO_REGISTER_OPERATION(find_blocks, jacobi::find_blocks);
 GKO_REGISTER_OPERATION(generate, jacobi::generate);
 GKO_REGISTER_OPERATION(transpose_jacobi, jacobi::transpose_jacobi);
@@ -74,10 +76,15 @@ void Jacobi<ValueType, IndexType>::apply_impl(const LinOp *b, LinOp *x) const
 {
     precision_dispatch_real_complex<ValueType>(
         [this](auto dense_b, auto dense_x) {
-            this->get_executor()->run(jacobi::make_simple_apply(
-                num_blocks_, parameters_.max_block_size, storage_scheme_,
-                parameters_.storage_optimization.block_wise,
-                parameters_.block_pointers, blocks_, dense_b, dense_x));
+            if (parameters_.max_block_size == 1) {
+                this->get_executor()->run(jacobi::make_simple_scalar_apply(
+                    this->blocks_, dense_b, dense_x));
+            } else {
+                this->get_executor()->run(jacobi::make_simple_apply(
+                    num_blocks_, parameters_.max_block_size, storage_scheme_,
+                    parameters_.storage_optimization.block_wise,
+                    parameters_.block_pointers, blocks_, dense_b, dense_x));
+            }
         },
         b, x);
 }
@@ -90,11 +97,16 @@ void Jacobi<ValueType, IndexType>::apply_impl(const LinOp *alpha,
 {
     precision_dispatch_real_complex<ValueType>(
         [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
-            this->get_executor()->run(jacobi::make_apply(
-                num_blocks_, parameters_.max_block_size, storage_scheme_,
-                parameters_.storage_optimization.block_wise,
-                parameters_.block_pointers, blocks_, dense_alpha, dense_b,
-                dense_beta, dense_x));
+            if (parameters_.max_block_size == 1) {
+                this->get_executor()->run(jacobi::make_scalar_apply(
+                    this->blocks_, dense_alpha, dense_b, dense_beta, dense_x));
+            } else {
+                this->get_executor()->run(jacobi::make_apply(
+                    num_blocks_, parameters_.max_block_size, storage_scheme_,
+                    parameters_.storage_optimization.block_wise,
+                    parameters_.block_pointers, blocks_, dense_alpha, dense_b,
+                    dense_beta, dense_x));
+            }
         },
         alpha, b, beta, x);
 }
@@ -218,33 +230,43 @@ void Jacobi<ValueType, IndexType>::generate(const LinOp *system_matrix,
     GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
     using csr_type = matrix::Csr<ValueType, IndexType>;
     const auto exec = this->get_executor();
-    auto csr_mtx =
-        convert_to_with_sorting<csr_type>(exec, system_matrix, skip_sorting);
-
-    if (parameters_.block_pointers.get_data() == nullptr) {
-        this->detect_blocks(csr_mtx.get());
-    }
+    if (parameters_.max_block_size == 1) {
+        auto diag = as<DiagonalExtractable<ValueType>>(system_matrix)
+                        ->extract_diagonal();
+        auto temp = gko::Array<ValueType>::view(
+            exec, system_matrix->get_size()[0], diag->get_values());
+        this->blocks_ = temp;
+    } else {
+        auto csr_mtx = convert_to_with_sorting<csr_type>(exec, system_matrix,
+                                                         skip_sorting);
+
+        if (parameters_.block_pointers.get_data() == nullptr) {
+            this->detect_blocks(csr_mtx.get());
+        }
 
-    const auto all_block_opt = parameters_.storage_optimization.of_all_blocks;
-    auto &precisions = parameters_.storage_optimization.block_wise;
-    // if adaptive version is used, make sure that the precision array is of
-    // the correct size by replicating it multiple times if needed
-    if (parameters_.storage_optimization.is_block_wise ||
-        all_block_opt != precision_reduction(0, 0)) {
-        if (!parameters_.storage_optimization.is_block_wise) {
-            precisions = gko::Array<precision_reduction>(exec, {all_block_opt});
+        const auto all_block_opt =
+            parameters_.storage_optimization.of_all_blocks;
+        auto &precisions = parameters_.storage_optimization.block_wise;
+        // if adaptive version is used, make sure that the precision array is of
+        // the correct size by replicating it multiple times if needed
+        if (parameters_.storage_optimization.is_block_wise ||
+            all_block_opt != precision_reduction(0, 0)) {
+            if (!parameters_.storage_optimization.is_block_wise) {
+                precisions =
+                    gko::Array<precision_reduction>(exec, {all_block_opt});
+            }
+            Array<precision_reduction> tmp(
+                exec, parameters_.block_pointers.get_num_elems() - 1);
+            exec->run(jacobi::make_initialize_precisions(precisions, tmp));
+            precisions = std::move(tmp);
+            conditioning_.resize_and_reset(num_blocks_);
         }
-        Array<precision_reduction> tmp(
-            exec, parameters_.block_pointers.get_num_elems() - 1);
-        exec->run(jacobi::make_initialize_precisions(precisions, tmp));
-        precisions = std::move(tmp);
-        conditioning_.resize_and_reset(num_blocks_);
-    }
 
-    exec->run(jacobi::make_generate(
-        csr_mtx.get(), num_blocks_, parameters_.max_block_size,
-        parameters_.accuracy, storage_scheme_, conditioning_, precisions,
-        parameters_.block_pointers, blocks_));
+        exec->run(jacobi::make_generate(
+            csr_mtx.get(), num_blocks_, parameters_.max_block_size,
+            parameters_.accuracy, storage_scheme_, conditioning_, precisions,
+            parameters_.block_pointers, blocks_));
+    }
 }
 
 
diff --git a/core/preconditioner/jacobi_kernels.hpp b/core/preconditioner/jacobi_kernels.hpp
index e5738e9f51f..22cfa512bb8 100644
--- a/core/preconditioner/jacobi_kernels.hpp
+++ b/core/preconditioner/jacobi_kernels.hpp
@@ -74,6 +74,12 @@ namespace kernels {
         const matrix::Dense<ValueType> *b,                                     \
         const matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *x)
 
+#define GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType)          \
+    void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec, \
+                             const Array<ValueType> &diag,                \
+                             const matrix::Dense<ValueType> *b,           \
+                             matrix::Dense<ValueType> *x)
+
 #define GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType)       \
     void simple_apply(                                                     \
         std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
@@ -85,6 +91,13 @@ namespace kernels {
         const Array<ValueType> &blocks, const matrix::Dense<ValueType> *b, \
         matrix::Dense<ValueType> *x)
 
+#define GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType)                    \
+    void scalar_apply(                                                       \
+        std::shared_ptr<const DefaultExecutor> exec,                         \
+        const Array<ValueType> &diag, const matrix::Dense<ValueType> *alpha, \
+        const matrix::Dense<ValueType> *b,                                   \
+        const matrix::Dense<ValueType> *beta, matrix::Dense<ValueType> *x)
+
 #define GKO_DECLARE_JACOBI_TRANSPOSE_KERNEL(ValueType, IndexType)          \
     void transpose_jacobi(                                                 \
         std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks, \
@@ -127,8 +140,12 @@ namespace kernels {
     GKO_DECLARE_JACOBI_FIND_BLOCKS_KERNEL(ValueType, IndexType);      \
     template <typename ValueType, typename IndexType>                 \
     GKO_DECLARE_JACOBI_GENERATE_KERNEL(ValueType, IndexType);         \
+    template <typename ValueType>                                     \
+    GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL(ValueType);                \
     template <typename ValueType, typename IndexType>                 \
     GKO_DECLARE_JACOBI_APPLY_KERNEL(ValueType, IndexType);            \
+    template <typename ValueType>                                     \
+    GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL(ValueType);         \
     template <typename ValueType, typename IndexType>                 \
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL(ValueType, IndexType);     \
     template <typename ValueType, typename IndexType>                 \
diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu
index 889fd1b0db6..bb9b918f5d9 100644
--- a/cuda/preconditioner/jacobi_kernels.cu
+++ b/cuda/preconditioner/jacobi_kernels.cu
@@ -65,12 +65,14 @@ constexpr int default_num_warps = 32;
 // current GPUs have at most 84 SMs)
 constexpr int default_grid_size = 32 * 32 * 128;
 
+constexpr int default_block_size = 512;
+
 
 #include "common/preconditioner/jacobi_kernels.hpp.inc"
 
 
 template <typename ValueType, typename IndexType>
-size_type find_natural_blocks(std::shared_ptr<const CudaExecutor> exec,
+size_type find_natural_blocks(std::shared_ptr<const DefaultExecutor> exec,
                               const matrix::Csr<ValueType, IndexType> *mtx,
                               int32 max_block_size,
                               IndexType *__restrict__ block_ptrs)
@@ -95,7 +97,7 @@ size_type find_natural_blocks(std::shared_ptr<const CudaExecutor> exec,
 
 template <typename IndexType>
 inline size_type agglomerate_supervariables(
-    std::shared_ptr<const CudaExecutor> exec, int32 max_block_size,
+    std::shared_ptr<const DefaultExecutor> exec, int32 max_block_size,
     size_type num_natural_blocks, IndexType *block_ptrs)
 {
     Array<size_type> nums(exec, 1);
@@ -111,7 +113,7 @@ inline size_type agglomerate_supervariables(
 }  // namespace
 
 
-void initialize_precisions(std::shared_ptr<const CudaExecutor> exec,
+void initialize_precisions(std::shared_ptr<const DefaultExecutor> exec,
                            const Array<precision_reduction> &source,
                            Array<precision_reduction> &precisions)
 {
@@ -126,7 +128,7 @@ void initialize_precisions(std::shared_ptr<const CudaExecutor> exec,
 
 
 template <typename ValueType, typename IndexType>
-void find_blocks(std::shared_ptr<const CudaExecutor> exec,
+void find_blocks(std::shared_ptr<const DefaultExecutor> exec,
                  const matrix::Csr<ValueType, IndexType> *system_matrix,
                  uint32 max_block_size, size_type &num_blocks,
                  Array<IndexType> &block_pointers)
@@ -230,7 +232,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(
-    std::shared_ptr<const CudaExecutor> exec, size_type num_blocks,
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
     const Array<precision_reduction> &block_precisions,
     const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
     const preconditioner::block_interleaved_storage_scheme<IndexType>
@@ -241,6 +243,61 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 
 
+template <typename ValueType>
+void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const Array<ValueType> &diag,
+                  const matrix::Dense<ValueType> *alpha,
+                  const matrix::Dense<ValueType> *b,
+                  const matrix::Dense<ValueType> *beta,
+                  matrix::Dense<ValueType> *x)
+{
+    const auto b_size = b->get_size();
+    const auto num_rows = b_size[0];
+    const auto num_cols = b_size[1];
+    const auto b_stride = b->get_stride();
+    const auto x_stride = x->get_stride();
+    const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size);
+
+    const auto b_values = b->get_const_values();
+    const auto diag_values = diag.get_const_data();
+    auto x_values = x->get_values();
+
+    kernel::scalar_apply<<<grid_dim, default_block_size>>>(
+        num_rows, num_cols, as_cuda_type(diag_values),
+        as_cuda_type(alpha->get_const_values()), b_stride,
+        as_cuda_type(b_values), as_cuda_type(beta->get_const_values()),
+        x_stride, as_cuda_type(x_values));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
+
+
+template <typename ValueType>
+void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
+                         const Array<ValueType> &diag,
+                         const matrix::Dense<ValueType> *b,
+                         matrix::Dense<ValueType> *x)
+{
+    const auto b_size = b->get_size();
+    const auto num_rows = b_size[0];
+    const auto num_cols = b_size[1];
+    const auto b_stride = b->get_stride();
+    const auto x_stride = x->get_stride();
+    const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size);
+
+    const auto b_values = b->get_const_values();
+    const auto diag_values = diag.get_const_data();
+    auto x_values = x->get_values();
+
+    kernel::simple_scalar_apply<<<grid_dim, default_block_size>>>(
+        num_rows, num_cols, as_cuda_type(diag_values), b_stride,
+        as_cuda_type(b_values), x_stride, as_cuda_type(x_values));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
+
+
 }  // namespace jacobi
 }  // namespace cuda
 }  // namespace kernels
diff --git a/cuda/test/preconditioner/jacobi_kernels.cpp b/cuda/test/preconditioner/jacobi_kernels.cpp
index aafd6546996..87e0cbb2ab9 100644
--- a/cuda/test/preconditioner/jacobi_kernels.cpp
+++ b/cuda/test/preconditioner/jacobi_kernels.cpp
@@ -412,6 +412,37 @@ TEST_F(Jacobi, CudaApplyEquivalentToRef)
 }
 
 
+TEST_F(Jacobi, CudaScalarApplyEquivalentToRef)
+{
+    gko::size_type dim = 313;
+    std::ranlux48 engine(42);
+    auto dense_smtx = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, dim, std::uniform_int_distribution<>(1, dim),
+        std::normal_distribution<>(1.0, 2.0), engine, ref));
+    gko::test::make_diag_dominant(dense_smtx.get());
+    auto smtx = gko::share(Mtx::create(ref));
+    smtx->copy_from(dense_smtx.get());
+    auto sb = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, 3, std::uniform_int_distribution<>(1, 1),
+        std::normal_distribution<>(0.0, 1.0), engine, ref));
+    auto sx = Vec::create(ref, sb->get_size());
+
+    auto d_smtx = gko::share(Mtx::create(cuda));
+    auto d_sb = gko::share(Vec::create(cuda));
+    auto d_sx = gko::share(Vec::create(cuda, sb->get_size()));
+    d_smtx->copy_from(smtx.get());
+    d_sb->copy_from(sb.get());
+
+    auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx);
+    auto d_sj = Bj::build().with_max_block_size(1u).on(cuda)->generate(d_smtx);
+
+    sj->apply(sb.get(), sx.get());
+    d_sj->apply(d_sb.get(), d_sx.get());
+
+    GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12);
+}
+
+
 TEST_F(Jacobi, CudaLinearCombinationApplyEquivalentToRef)
 {
     initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
@@ -430,6 +461,46 @@ TEST_F(Jacobi, CudaLinearCombinationApplyEquivalentToRef)
 }
 
 
+TEST_F(Jacobi, CudaScalarLinearCombinationApplyEquivalentToRef)
+{
+    gko::size_type dim = 313;
+    std::ranlux48 engine(42);
+    auto dense_smtx = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, dim, std::uniform_int_distribution<>(1, dim),
+        std::normal_distribution<>(1.0, 2.0), engine, ref));
+    gko::test::make_diag_dominant(dense_smtx.get());
+    auto smtx = gko::share(Mtx::create(ref));
+    smtx->copy_from(dense_smtx.get());
+    auto sb = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, 3, std::uniform_int_distribution<>(1, 1),
+        std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3),
+        4));
+    auto sx = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, 3, std::uniform_int_distribution<>(1, 1),
+        std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3),
+        4));
+
+    auto d_smtx = gko::share(Mtx::create(cuda));
+    auto d_sb = gko::share(Vec::create(cuda));
+    auto d_sx = gko::share(Vec::create(cuda));
+    d_smtx->copy_from(smtx.get());
+    d_sb->copy_from(sb.get());
+    d_sx->copy_from(sx.get());
+    auto alpha = gko::initialize<Vec>({2.0}, ref);
+    auto d_alpha = gko::initialize<Vec>({2.0}, cuda);
+    auto beta = gko::initialize<Vec>({-1.0}, ref);
+    auto d_beta = gko::initialize<Vec>({-1.0}, cuda);
+
+    auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx);
+    auto d_sj = Bj::build().with_max_block_size(1u).on(cuda)->generate(d_smtx);
+
+    sj->apply(alpha.get(), sb.get(), beta.get(), sx.get());
+    d_sj->apply(d_alpha.get(), d_sb.get(), d_beta.get(), d_sx.get());
+
+    GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12);
+}
+
+
 TEST_F(Jacobi, CudaApplyToMultipleVectorsEquivalentToRef)
 {
     initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp
index 623e2c121ec..3b7f1efb90f 100644
--- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp
+++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp
@@ -280,6 +280,27 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 
 
+template <typename ValueType>
+void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const Array<ValueType> &diag,
+                  const matrix::Dense<ValueType> *alpha,
+                  const matrix::Dense<ValueType> *b,
+                  const matrix::Dense<ValueType> *beta,
+                  matrix::Dense<ValueType> *x) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
+
+
+template <typename ValueType>
+void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
+                         const Array<ValueType> &diag,
+                         const matrix::Dense<ValueType> *b,
+                         matrix::Dense<ValueType> *x) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
+
+
 }  // namespace jacobi
 }  // namespace dpcpp
 }  // namespace kernels
diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp
index fc020837a22..57d513774b5 100644
--- a/hip/preconditioner/jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_kernels.hip.cpp
@@ -72,6 +72,7 @@ constexpr int default_num_warps = 32;
 // current GPUs have at most 84 SMs)
 constexpr int default_grid_size = 32 * 32 * 128;
 
+constexpr int default_block_size = 512;
 
 #include "common/preconditioner/jacobi_kernels.hpp.inc"
 
@@ -256,6 +257,62 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 
 
+template <typename ValueType>
+void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const Array<ValueType> &diag,
+                  const matrix::Dense<ValueType> *alpha,
+                  const matrix::Dense<ValueType> *b,
+                  const matrix::Dense<ValueType> *beta,
+                  matrix::Dense<ValueType> *x)
+{
+    const auto b_size = b->get_size();
+    const auto num_rows = b_size[0];
+    const auto num_cols = b_size[1];
+    const auto b_stride = b->get_stride();
+    const auto x_stride = x->get_stride();
+    const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size);
+
+    const auto b_values = b->get_const_values();
+    const auto diag_values = diag.get_const_data();
+    auto x_values = x->get_values();
+
+    hipLaunchKernelGGL(
+        kernel::scalar_apply, dim3(grid_dim), dim3(default_block_size), 0, 0,
+        num_rows, num_cols, as_hip_type(diag_values),
+        as_hip_type(alpha->get_const_values()), b_stride, as_hip_type(b_values),
+        as_hip_type(beta->get_const_values()), x_stride, as_hip_type(x_values));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
+
+
+template <typename ValueType>
+void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
+                         const Array<ValueType> &diag,
+                         const matrix::Dense<ValueType> *b,
+                         matrix::Dense<ValueType> *x)
+{
+    const auto b_size = b->get_size();
+    const auto num_rows = b_size[0];
+    const auto num_cols = b_size[1];
+    const auto b_stride = b->get_stride();
+    const auto x_stride = x->get_stride();
+    const auto grid_dim = ceildiv(num_rows * num_cols, default_block_size);
+
+    const auto b_values = b->get_const_values();
+    const auto diag_values = diag.get_const_data();
+    auto x_values = x->get_values();
+
+    hipLaunchKernelGGL(kernel::simple_scalar_apply, dim3(grid_dim),
+                       dim3(default_block_size), 0, 0, num_rows, num_cols,
+                       as_hip_type(diag_values), b_stride,
+                       as_hip_type(b_values), x_stride, as_hip_type(x_values));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
+
+
 }  // namespace jacobi
 }  // namespace hip
 }  // namespace kernels
diff --git a/hip/test/preconditioner/jacobi_kernels.cpp b/hip/test/preconditioner/jacobi_kernels.cpp
index 3931f666f12..4ef05dc7fff 100644
--- a/hip/test/preconditioner/jacobi_kernels.cpp
+++ b/hip/test/preconditioner/jacobi_kernels.cpp
@@ -443,6 +443,37 @@ TEST_F(Jacobi, HipApplyEquivalentToRef)
 }
 
 
+TEST_F(Jacobi, HipScalarApplyEquivalentToRef)
+{
+    gko::size_type dim = 313;
+    std::ranlux48 engine(42);
+    auto dense_smtx = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, dim, std::uniform_int_distribution<>(1, dim),
+        std::normal_distribution<>(1.0, 2.0), engine, ref));
+    gko::test::make_diag_dominant(dense_smtx.get());
+    auto smtx = gko::share(Mtx::create(ref));
+    smtx->copy_from(dense_smtx.get());
+    auto sb = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, 3, std::uniform_int_distribution<>(1, 1),
+        std::normal_distribution<>(0.0, 1.0), engine, ref));
+    auto sx = Vec::create(ref, sb->get_size());
+
+    auto d_smtx = gko::share(Mtx::create(hip));
+    auto d_sb = gko::share(Vec::create(hip));
+    auto d_sx = gko::share(Vec::create(hip, sb->get_size()));
+    d_smtx->copy_from(smtx.get());
+    d_sb->copy_from(sb.get());
+
+    auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx);
+    auto d_sj = Bj::build().with_max_block_size(1u).on(hip)->generate(d_smtx);
+
+    sj->apply(sb.get(), sx.get());
+    d_sj->apply(d_sb.get(), d_sx.get());
+
+    GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12);
+}
+
+
 TEST_F(Jacobi, HipLinearCombinationApplyEquivalentToRef)
 {
     initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
@@ -461,6 +492,46 @@ TEST_F(Jacobi, HipLinearCombinationApplyEquivalentToRef)
 }
 
 
+TEST_F(Jacobi, HipScalarLinearCombinationApplyEquivalentToRef)
+{
+    gko::size_type dim = 313;
+    std::ranlux48 engine(42);
+    auto dense_smtx = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, dim, std::uniform_int_distribution<>(1, dim),
+        std::normal_distribution<>(1.0, 2.0), engine, ref));
+    gko::test::make_diag_dominant(dense_smtx.get());
+    auto smtx = gko::share(Mtx::create(ref));
+    smtx->copy_from(dense_smtx.get());
+    auto sb = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, 3, std::uniform_int_distribution<>(1, 1),
+        std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3),
+        4));
+    auto sx = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, 3, std::uniform_int_distribution<>(1, 1),
+        std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3),
+        4));
+
+    auto d_smtx = gko::share(Mtx::create(hip));
+    auto d_sb = gko::share(Vec::create(hip));
+    auto d_sx = gko::share(Vec::create(hip));
+    d_smtx->copy_from(smtx.get());
+    d_sb->copy_from(sb.get());
+    d_sx->copy_from(sx.get());
+    auto alpha = gko::initialize<Vec>({2.0}, ref);
+    auto d_alpha = gko::initialize<Vec>({2.0}, hip);
+    auto beta = gko::initialize<Vec>({-1.0}, ref);
+    auto d_beta = gko::initialize<Vec>({-1.0}, hip);
+
+    auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx);
+    auto d_sj = Bj::build().with_max_block_size(1u).on(hip)->generate(d_smtx);
+
+    sj->apply(alpha.get(), sb.get(), beta.get(), sx.get());
+    d_sj->apply(d_alpha.get(), d_sb.get(), d_beta.get(), d_sx.get());
+
+    GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12);
+}
+
+
 TEST_F(Jacobi, HipApplyToMultipleVectorsEquivalentToRef)
 {
     initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
diff --git a/include/ginkgo/core/preconditioner/jacobi.hpp b/include/ginkgo/core/preconditioner/jacobi.hpp
index 85924ac3772..dd05296abde 100644
--- a/include/ginkgo/core/preconditioner/jacobi.hpp
+++ b/include/ginkgo/core/preconditioner/jacobi.hpp
@@ -36,7 +36,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
 
 
 namespace gko {
diff --git a/omp/preconditioner/jacobi_kernels.cpp b/omp/preconditioner/jacobi_kernels.cpp
index 1b0c0c6c1a5..605ba433578 100644
--- a/omp/preconditioner/jacobi_kernels.cpp
+++ b/omp/preconditioner/jacobi_kernels.cpp
@@ -696,6 +696,50 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
 
 
+template <typename ValueType>
+void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const Array<ValueType> &diag,
+                  const matrix::Dense<ValueType> *alpha,
+                  const matrix::Dense<ValueType> *b,
+                  const matrix::Dense<ValueType> *beta,
+                  matrix::Dense<ValueType> *x)
+{
+#pragma omp parallel for
+    for (size_type i = 0; i < x->get_size()[0]; ++i) {
+        for (size_type j = 0; j < x->get_size()[1]; ++j) {
+            auto diag_val = diag.get_const_data()[i] == zero<ValueType>()
+                                ? one<ValueType>()
+                                : diag.get_const_data()[i];
+            x->at(i, j) = beta->at(0) * x->at(i, j) +
+                          alpha->at(0) * b->at(i, j) / diag_val;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
+
+
+template <typename ValueType>
+void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
+                         const Array<ValueType> &diag,
+                         const matrix::Dense<ValueType> *b,
+                         matrix::Dense<ValueType> *x)
+{
+#pragma omp parallel for
+    for (size_type i = 0; i < x->get_size()[0]; ++i) {
+        for (size_type j = 0; j < x->get_size()[1]; ++j) {
+            auto diag_val = diag.get_const_data()[i] == zero<ValueType>()
+                                ? one<ValueType>()
+                                : diag.get_const_data()[i];
+            x->at(i, j) = b->at(i, j) / diag_val;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
+
+
 }  // namespace jacobi
 }  // namespace omp
 }  // namespace kernels
diff --git a/omp/test/preconditioner/jacobi_kernels.cpp b/omp/test/preconditioner/jacobi_kernels.cpp
index 78663913f71..095729a5e9a 100644
--- a/omp/test/preconditioner/jacobi_kernels.cpp
+++ b/omp/test/preconditioner/jacobi_kernels.cpp
@@ -397,6 +397,37 @@ TEST_F(Jacobi, OmpApplyEquivalentToRefWithDifferentBlockSize)
 }
 
 
+TEST_F(Jacobi, OmpScalarApplyEquivalentToRef)
+{
+    gko::size_type dim = 313;
+    std::ranlux48 engine(42);
+    auto dense_smtx = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, dim, std::uniform_int_distribution<>(1, dim),
+        std::normal_distribution<>(1.0, 2.0), engine, ref));
+    gko::test::make_diag_dominant(dense_smtx.get());
+    auto smtx = gko::share(Mtx::create(ref));
+    smtx->copy_from(dense_smtx.get());
+    auto sb = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, 3, std::uniform_int_distribution<>(1, 1),
+        std::normal_distribution<>(0.0, 1.0), engine, ref));
+    auto sx = Vec::create(ref, sb->get_size());
+
+    auto d_smtx = gko::share(Mtx::create(omp));
+    auto d_sb = gko::share(Vec::create(omp));
+    auto d_sx = gko::share(Vec::create(omp, sb->get_size()));
+    d_smtx->copy_from(smtx.get());
+    d_sb->copy_from(sb.get());
+
+    auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx);
+    auto d_sj = Bj::build().with_max_block_size(1u).on(omp)->generate(d_smtx);
+
+    sj->apply(sb.get(), sx.get());
+    d_sj->apply(d_sb.get(), d_sx.get());
+
+    GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12);
+}
+
+
 TEST_F(Jacobi, OmpApplyEquivalentToRef)
 {
     initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
@@ -429,6 +460,46 @@ TEST_F(Jacobi, OmpLinearCombinationApplyEquivalentToRef)
 }
 
 
+TEST_F(Jacobi, OmpScalarLinearCombinationApplyEquivalentToRef)
+{
+    gko::size_type dim = 5;
+    std::ranlux48 engine(42);
+    auto dense_smtx = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, dim, std::uniform_int_distribution<>(1, dim),
+        std::normal_distribution<>(1.0, 2.0), engine, ref));
+    gko::test::make_diag_dominant(dense_smtx.get());
+    auto smtx = gko::share(Mtx::create(ref));
+    smtx->copy_from(dense_smtx.get());
+    auto sb = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, 3, std::uniform_int_distribution<>(1, 1),
+        std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3),
+        4));
+    auto sx = gko::share(gko::test::generate_random_matrix<Vec>(
+        dim, 3, std::uniform_int_distribution<>(1, 1),
+        std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3),
+        4));
+
+    auto d_smtx = gko::share(Mtx::create(omp));
+    auto d_sb = gko::share(Vec::create(omp));
+    auto d_sx = gko::share(Vec::create(omp));
+    d_smtx->copy_from(smtx.get());
+    d_sb->copy_from(sb.get());
+    d_sx->copy_from(sx.get());
+    auto alpha = gko::initialize<Vec>({2.0}, ref);
+    auto d_alpha = gko::initialize<Vec>({2.0}, omp);
+    auto beta = gko::initialize<Vec>({-1.0}, ref);
+    auto d_beta = gko::initialize<Vec>({-1.0}, omp);
+
+    auto sj = Bj::build().with_max_block_size(1u).on(ref)->generate(smtx);
+    auto d_sj = Bj::build().with_max_block_size(1u).on(omp)->generate(d_smtx);
+
+    sj->apply(alpha.get(), sb.get(), beta.get(), sx.get());
+    d_sj->apply(d_alpha.get(), d_sb.get(), d_beta.get(), d_sx.get());
+
+    GKO_ASSERT_MTX_NEAR(sx.get(), d_sx.get(), 1e-12);
+}
+
+
 TEST_F(Jacobi, OmpApplyToMultipleVectorsEquivalentToRef)
 {
     initialize_data({0, 11, 24, 33, 45, 55, 67, 70, 80, 92, 100}, {}, {}, 13,
diff --git a/reference/preconditioner/jacobi_kernels.cpp b/reference/preconditioner/jacobi_kernels.cpp
index b96af16dca2..a176b01e4bf 100644
--- a/reference/preconditioner/jacobi_kernels.cpp
+++ b/reference/preconditioner/jacobi_kernels.cpp
@@ -136,7 +136,7 @@ inline size_type agglomerate_supervariables(uint32 max_block_size,
 
 
 template <typename ValueType, typename IndexType>
-void find_blocks(std::shared_ptr<const ReferenceExecutor> exec,
+void find_blocks(std::shared_ptr<const DefaultExecutor> exec,
                  const matrix::Csr<ValueType, IndexType> *system_matrix,
                  uint32 max_block_size, size_type &num_blocks,
                  Array<IndexType> &block_pointers)
@@ -310,7 +310,7 @@ inline bool invert_block(IndexType block_size, IndexType *perm,
 
 template <typename ReducedType, typename ValueType, typename IndexType>
 inline bool validate_precision_reduction_feasibility(
-    std::shared_ptr<const ReferenceExecutor> exec, IndexType block_size,
+    std::shared_ptr<const DefaultExecutor> exec, IndexType block_size,
     const ValueType *block, size_type stride)
 {
     using gko::detail::float_traits;
@@ -340,7 +340,7 @@ inline bool validate_precision_reduction_feasibility(
 
 
 template <typename ValueType, typename IndexType>
-void generate(std::shared_ptr<const ReferenceExecutor> exec,
+void generate(std::shared_ptr<const DefaultExecutor> exec,
               const matrix::Csr<ValueType, IndexType> *system_matrix,
               size_type num_blocks, uint32 max_block_size,
               remove_complex<ValueType> accuracy,
@@ -480,7 +480,7 @@ inline void apply_block(size_type block_size, size_type num_rhs,
 }  // namespace
 
 
-void initialize_precisions(std::shared_ptr<const ReferenceExecutor> exec,
+void initialize_precisions(std::shared_ptr<const DefaultExecutor> exec,
                            const Array<precision_reduction> &source,
                            Array<precision_reduction> &precisions)
 {
@@ -492,7 +492,7 @@ void initialize_precisions(std::shared_ptr<const ReferenceExecutor> exec,
 
 
 template <typename ValueType, typename IndexType>
-void apply(std::shared_ptr<const ReferenceExecutor> exec, size_type num_blocks,
+void apply(std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
            uint32 max_block_size,
            const preconditioner::block_interleaved_storage_scheme<IndexType>
                &storage_scheme,
@@ -528,7 +528,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_JACOBI_APPLY_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void simple_apply(
-    std::shared_ptr<const ReferenceExecutor> exec, size_type num_blocks,
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
     uint32 max_block_size,
     const preconditioner::block_interleaved_storage_scheme<IndexType>
         &storage_scheme,
@@ -560,6 +560,48 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_SIMPLE_APPLY_KERNEL);
 
 
+template <typename ValueType>
+void scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const Array<ValueType> &diag,
+                  const matrix::Dense<ValueType> *alpha,
+                  const matrix::Dense<ValueType> *b,
+                  const matrix::Dense<ValueType> *beta,
+                  matrix::Dense<ValueType> *x)
+{
+    for (size_type i = 0; i < x->get_size()[0]; ++i) {
+        for (size_type j = 0; j < x->get_size()[1]; ++j) {
+            auto diag_val = diag.get_const_data()[i] == zero<ValueType>()
+                                ? one<ValueType>()
+                                : diag.get_const_data()[i];
+            x->at(i, j) = beta->at(0) * x->at(i, j) +
+                          alpha->at(0) * b->at(i, j) / diag_val;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_JACOBI_SCALAR_APPLY_KERNEL);
+
+
+template <typename ValueType>
+void simple_scalar_apply(std::shared_ptr<const DefaultExecutor> exec,
+                         const Array<ValueType> &diag,
+                         const matrix::Dense<ValueType> *b,
+                         matrix::Dense<ValueType> *x)
+{
+    for (size_type i = 0; i < x->get_size()[0]; ++i) {
+        for (size_type j = 0; j < x->get_size()[1]; ++j) {
+            auto diag_val = diag.get_const_data()[i] == zero<ValueType>()
+                                ? one<ValueType>()
+                                : diag.get_const_data()[i];
+            x->at(i, j) = b->at(i, j) / diag_val;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_JACOBI_SIMPLE_SCALAR_APPLY_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void transpose_jacobi(
     std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
@@ -634,7 +676,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void convert_to_dense(
-    std::shared_ptr<const ReferenceExecutor> exec, size_type num_blocks,
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_blocks,
     const Array<precision_reduction> &block_precisions,
     const Array<IndexType> &block_pointers, const Array<ValueType> &blocks,
     const preconditioner::block_interleaved_storage_scheme<IndexType>
diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp
index f9cda1ad532..55192bf36c6 100644
--- a/reference/test/preconditioner/jacobi_kernels.cpp
+++ b/reference/test/preconditioner/jacobi_kernels.cpp
@@ -75,6 +75,7 @@ class Jacobi : public ::testing::Test {
         block_pointers.get_data()[2] = 5;
         block_precisions.get_data()[0] = gko::precision_reduction(0, 1);
         block_precisions.get_data()[1] = gko::precision_reduction(0, 0);
+        scalar_j_factory = Bj::build().with_max_block_size(1u).on(exec);
         bj_factory = Bj::build()
                          .with_max_block_size(3u)
                          .with_block_pointers(block_pointers)
@@ -111,6 +112,7 @@ class Jacobi : public ::testing::Test {
 
     std::shared_ptr<const gko::Executor> exec;
     std::unique_ptr<typename Bj::Factory> bj_factory;
+    std::unique_ptr<typename Bj::Factory> scalar_j_factory;
     std::unique_ptr<typename Bj::Factory> adaptive_bj_factory;
     gko::Array<index_type> block_pointers;
     gko::Array<gko::precision_reduction> block_precisions;
@@ -656,6 +658,21 @@ TYPED_TEST(Jacobi, AppliesToVector)
 }
 
 
+TYPED_TEST(Jacobi, ScalarJacobiAppliesToVector)
+{
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec);
+    auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec);
+    auto sj = this->scalar_j_factory->generate(this->mtx);
+
+    sj->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, -0.25, -0.5, 1.0, -0.25}),
+                        r<value_type>::value);
+}
+
+
 TYPED_TEST(Jacobi, AppliesToMixedVector)
 {
     using value_type = gko::next_precision<typename TestFixture::value_type>;
@@ -783,6 +800,32 @@ TYPED_TEST(Jacobi, AppliesToMultipleVectors)
 }
 
 
+TYPED_TEST(Jacobi, ScalarJacobiAppliesToMultipleVectors)
+{
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto x =
+        gko::initialize<Vec>(3,
+                             {I<T>{1.0, 0.5}, I<T>{-1.0, -0.5}, I<T>{2.0, 1.0},
+                              I<T>{-2.0, -1.0}, I<T>{3.0, 1.5}},
+                             this->exec);
+    auto b =
+        gko::initialize<Vec>(3,
+                             {I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}, I<T>{-2.0, 0.0},
+                              I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}},
+                             this->exec);
+    auto sj = this->scalar_j_factory->generate(this->mtx);
+
+    sj->apply(b.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(
+        x,
+        l({{1.0, -0.5}, {-0.25, 1.0}, {-0.5, 0.0}, {1.0, -0.5}, {-0.25, 1.0}}),
+        r<value_type>::value);
+}
+
+
 TYPED_TEST(Jacobi, AppliesToMultipleVectorsWithAdaptivePrecision)
 {
     using Vec = typename TestFixture::Vec;
@@ -859,6 +902,23 @@ TYPED_TEST(Jacobi, AppliesLinearCombinationToVector)
 }
 
 
+TYPED_TEST(Jacobi, ScalarJacobiAppliesLinearCombinationToVector)
+{
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    auto x = gko::initialize<Vec>({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec);
+    auto b = gko::initialize<Vec>({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec);
+    auto alpha = gko::initialize<Vec>({2.0}, this->exec);
+    auto beta = gko::initialize<Vec>({-1.0}, this->exec);
+    auto sj = this->scalar_j_factory->generate(this->mtx);
+
+    sj->apply(alpha.get(), b.get(), beta.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(x, l({1.0, 0.5, -3.0, 4.0, -3.5}),
+                        r<value_type>::value);
+}
+
+
 TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedVector)
 {
     using value_type = gko::next_precision<typename TestFixture::value_type>;
@@ -973,6 +1033,34 @@ TYPED_TEST(Jacobi, AppliesLinearCombinationToMultipleVectors)
 }
 
 
+TYPED_TEST(Jacobi, ScalarAppliesLinearCombinationToMultipleVectors)
+{
+    using Vec = typename TestFixture::Vec;
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    auto half_tol = std::sqrt(r<value_type>::value);
+    auto x =
+        gko::initialize<Vec>(3,
+                             {I<T>{1.0, 0.5}, I<T>{-1.0, -0.5}, I<T>{2.0, 1.0},
+                              I<T>{-2.0, -1.0}, I<T>{3.0, 1.5}},
+                             this->exec);
+    auto b =
+        gko::initialize<Vec>(3,
+                             {I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}, I<T>{-2.0, 0.0},
+                              I<T>{4.0, -2.0}, I<T>{-1.0, 4.0}},
+                             this->exec);
+    auto alpha = gko::initialize<Vec>({2.0}, this->exec);
+    auto beta = gko::initialize<Vec>({-1.0}, this->exec);
+    auto sj = this->scalar_j_factory->generate(this->mtx);
+
+    sj->apply(alpha.get(), b.get(), beta.get(), x.get());
+
+    GKO_ASSERT_MTX_NEAR(
+        x, l({{1.0, -1.5}, {0.5, 2.5}, {-3.0, -1.0}, {4.0, 0.0}, {-3.5, 0.5}}),
+        r<value_type>::value);
+}
+
+
 TYPED_TEST(Jacobi,
            AppliesLinearCombinationToMultipleVectorsWithAdaptivePrecision)
 {