diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
new file mode 100644
index 00000000000..b374850d347
--- /dev/null
+++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
@@ -0,0 +1,75 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+namespace kernel {
+
+
+template <int mat_blk_sz, int subwarp_size, typename ValueType,
+          typename IndexType>
+__global__ __launch_bounds__(default_block_size) void transpose_blocks(
+    const IndexType nbnz, ValueType* const values)
+{
+    const auto total_subwarp_count =
+        thread::get_subwarp_num_flat<subwarp_size, IndexType>();
+    const IndexType begin_blk =
+        thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+
+    auto thread_block = group::this_thread_block();
+    auto subwarp_grp = group::tiled_partition<subwarp_size>(thread_block);
+    const int sw_threadidx = subwarp_grp.thread_rank();
+
+    constexpr int mat_blk_sz_2{mat_blk_sz * mat_blk_sz};
+    constexpr int num_entries_per_thread{(mat_blk_sz_2 - 1) / subwarp_size + 1};
+    ValueType orig_vals[num_entries_per_thread];
+
+    for (auto ibz = begin_blk; ibz < nbnz; ibz += total_subwarp_count) {
+        for (int i = sw_threadidx; i < mat_blk_sz_2; i += subwarp_size) {
+            orig_vals[i / subwarp_size] = values[ibz * mat_blk_sz_2 + i];
+        }
+        subwarp_grp.sync();
+
+        for (int i = 0; i < num_entries_per_thread; i++) {
+            const int orig_pos = i * subwarp_size + sw_threadidx;
+            if (orig_pos >= mat_blk_sz_2) {
+                break;
+            }
+            const int orig_row = orig_pos % mat_blk_sz;
+            const int orig_col = orig_pos / mat_blk_sz;
+            const int new_pos = orig_row * mat_blk_sz + orig_col;
+            values[ibz * mat_blk_sz_2 + new_pos] = orig_vals[i];
+        }
+        subwarp_grp.sync();
+    }
+}
+
+
+}  // namespace kernel
diff --git a/core/base/block_sizes.hpp b/core/base/block_sizes.hpp
new file mode 100644
index 00000000000..62ec387d8e3
--- /dev/null
+++ b/core/base/block_sizes.hpp
@@ -0,0 +1,64 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_CORE_BASE_BLOCK_SIZES_HPP_
+#define GKO_CORE_BASE_BLOCK_SIZES_HPP_
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/synthesizer/containers.hpp>
+
+
+namespace gko {
+namespace fixedblock {
+
+
+/**
+ * @def GKO_FIXED_BLOCK_CUSTOM_SIZES
+ * Optionally-defined comma-separated list of fixed block sizes to compile.
+ */
+#ifdef GKO_FIXED_BLOCK_CUSTOM_SIZES
+/**
+ * A compile-time list of block sizes for which dedicated fixed-block matrix
+ * and corresponding preconditioner kernels should be compiled.
+ */
+using compiled_kernels = syn::value_list<int, GKO_FIXED_BLOCK_CUSTOM_SIZES>;
+#else
+using compiled_kernels = syn::value_list<int, 2, 3, 4, 7>;
+#endif
+
+
+}  // namespace fixedblock
+}  // namespace gko
+
+
+#endif  // GKO_CORE_BASE_BLOCK_SIZES_HPP_
diff --git a/core/base/utils.hpp b/core/base/utils.hpp
index 911945cef78..a42312ead37 100644
--- a/core/base/utils.hpp
+++ b/core/base/utils.hpp
@@ -219,6 +219,20 @@ std::shared_ptr convert_to_with_sorting(
         skip_sorting);
 }
 
+
+/**
+ * Converts the given arguments into an array of entries of the requested
+ * template type.
+ *
+ * @tparam T The requested type of entries in the output array.
+ *
+ * @param args Entities to be filled into an array after casting to type T.
+ */
+template <typename T, typename... Args>
+constexpr std::array<T, sizeof...(Args)> to_std_array(Args&&... args)
+{
+    return {static_cast<T>(args)...};
+}
 
 
 }  // namespace gko
diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp
index 8c0aa3649cd..8e28d1f38bb 100644
--- a/core/matrix/fbcsr.cpp
+++ b/core/matrix/fbcsr.cpp
@@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include #include #include +#include #include #include #include @@ -155,14 +156,17 @@ template void Fbcsr::apply_impl(const LinOp* const b, LinOp* const x) const { - using Dense = Dense; if (auto b_fbcsr = dynamic_cast*>(b)) { // if b is a FBCSR matrix, we need an SpGeMM GKO_NOT_SUPPORTED(b_fbcsr); } else { // otherwise we assume that b is dense and compute a SpMV/SpMM - this->get_executor()->run( - fbcsr::make_spmv(this, as(b), as(x))); + precision_dispatch_real_complex( + [this](auto dense_b, auto dense_x) { + this->get_executor()->run( + fbcsr::make_spmv(this, dense_b, dense_x)); + }, + b, x); } } @@ -173,7 +177,6 @@ void Fbcsr::apply_impl(const LinOp* const alpha, const LinOp* const beta, LinOp* const x) const { - using Dense = Dense; if (auto b_fbcsr = dynamic_cast*>(b)) { // if b is a FBCSR matrix, we need an SpGeMM GKO_NOT_SUPPORTED(b_fbcsr); @@ -182,9 +185,13 @@ void Fbcsr::apply_impl(const LinOp* const alpha, GKO_NOT_SUPPORTED(b_ident); } else { // otherwise we assume that b is dense and compute a SpMV/SpMM - this->get_executor()->run( - fbcsr::make_advanced_spmv(as(alpha), this, as(b), - as(beta), as(x))); + precision_dispatch_real_complex( + [this](auto dense_alpha, auto dense_b, auto dense_beta, + auto dense_x) { + this->get_executor()->run(fbcsr::make_advanced_spmv( + dense_alpha, this, dense_b, dense_beta, dense_x)); + }, + alpha, b, beta, x); } } diff --git a/core/synthesizer/implementation_selection.hpp b/core/synthesizer/implementation_selection.hpp index 2965ceb34a2..16604c7c9da 100644 --- a/core/synthesizer/implementation_selection.hpp +++ b/core/synthesizer/implementation_selection.hpp @@ -68,7 +68,10 @@ namespace syn { _name(::gko::syn::value_list(), is_eligible, \ int_args, type_args, std::forward(args)...); \ } \ - } + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") #define GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(_name, _callable) \ template ****************************** +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CORE_TEST_UTILS_FB_MATRIX_GENERATOR_HPP_ +#define GKO_CORE_TEST_UTILS_FB_MATRIX_GENERATOR_HPP_ + + +#include +#include +#include +#include + + +#include +#include +#include +#include + + +#include "core/factorization/factorization_kernels.hpp" +#include "core/test/utils/matrix_generator.hpp" +#include "core/test/utils/unsort_matrix.hpp" + + +namespace gko { +namespace test { + + +/** + * Generates a random matrix, ensuring the existence of diagonal entries. + * + * @tparam MatrixType type of matrix to generate (matrix::Dense must implement + * the interface `ConvertibleTo`) + * @tparam NonzeroDistribution type of nonzero distribution + * @tparam ValueDistribution type of value distribution + * @tparam Engine type of random engine + * @tparam MatrixArgs the arguments from the matrix to be forwarded. + * + * @param num_rows number of rows + * @param num_cols number of columns + * @param nonzero_dist distribution of nonzeros per row + * @param value_dist distribution of matrix values + * @param engine a random engine + * @param exec executor where the matrix should be allocated + * @param args additional arguments for the matrix constructor + * + * @return the unique pointer to generated matrix of type MatrixType + */ +template , typename NonzeroDistribution, + typename ValueDistribution, typename Engine, typename... MatrixArgs> +std::unique_ptr generate_random_matrix_with_diag( + typename MatrixType::index_type num_rows, + typename MatrixType::index_type num_cols, + NonzeroDistribution&& nonzero_dist, ValueDistribution&& value_dist, + Engine&& engine, std::shared_ptr exec, MatrixArgs&&... args) +{ + using value_type = typename MatrixType::value_type; + using index_type = typename MatrixType::index_type; + + matrix_data data{gko::dim<2>(num_rows, num_cols), + {}}; + + for (index_type row = 0; row < num_rows; ++row) { + std::vector col_idx(num_cols); + std::iota(col_idx.begin(), col_idx.end(), size_type(0)); + // randomly generate number of nonzeros in this row + auto nnz_in_row = static_cast(nonzero_dist(engine)); + nnz_in_row = std::max(1, std::min(nnz_in_row, num_cols)); + // select a subset of `nnz_in_row` column indexes, and fill these + // locations with random values + std::shuffle(col_idx.begin(), col_idx.end(), engine); + // add diagonal if it does not exist + auto it = std::find(col_idx.begin(), col_idx.begin() + nnz_in_row, row); + if (it == col_idx.begin() + nnz_in_row) { + col_idx[nnz_in_row - 1] = row; + } + std::for_each( + begin(col_idx), begin(col_idx) + nnz_in_row, [&](index_type col) { + data.nonzeros.emplace_back( + row, col, + detail::get_rand_value(value_dist, engine)); + }); + } + + data.ensure_row_major_order(); + + // convert to the correct matrix type + auto result = MatrixType::create(exec, std::forward(args)...); + result->read(data); + return result; +} + + +/** + * Generates a block CSR matrix having the same sparsity pattern as + * a given CSR matrix. + * + * @param exec Reference executor. + * @param csrmat The CSR matrix to use for the block sparsity pattern of the + * generated FBCSR matrix. + * @param block_size Block size of output Fbcsr matrix. + * @param row_diag_dominant If true, a row-diagonal-dominant Fbcsr matrix is + * generated. Note that in this case, the intput Csr + * matrix must have diagonal entries in all rows. + * @param rand_engine Random number engine to use, such as std::ranlux48. 
+ */ +template +std::unique_ptr> generate_fbcsr_from_csr( + const std::shared_ptr exec, + const matrix::Csr* const csrmat, const int block_size, + const bool row_diag_dominant, RandEngine&& rand_engine) +{ + const auto nbrows = static_cast(csrmat->get_size()[0]); + const auto nbcols = static_cast(csrmat->get_size()[1]); + const auto nbnz_temp = + static_cast(csrmat->get_num_stored_elements()); + const int bs2 = block_size * block_size; + + auto fmtx = matrix::Fbcsr::create( + exec, + dim<2>{static_cast(nbrows * block_size), + static_cast(nbcols * block_size)}, + nbnz_temp * bs2, block_size); + exec->copy(nbrows + 1, csrmat->get_const_row_ptrs(), fmtx->get_row_ptrs()); + exec->copy(nbnz_temp, csrmat->get_const_col_idxs(), fmtx->get_col_idxs()); + + // We assume diagonal blocks are present for the diagonally-dominant case + + const IndexType nbnz = fmtx->get_num_stored_elements() / bs2; + + const IndexType* const row_ptrs = fmtx->get_const_row_ptrs(); + const IndexType* const col_idxs = fmtx->get_const_col_idxs(); + const IndexType nnz = nbnz * bs2; + ValueType* const vals = fmtx->get_values(); + std::uniform_real_distribution> + off_diag_dist(-1.0, 1.0); + + for (IndexType ibrow = 0; ibrow < nbrows; ibrow++) { + if (row_diag_dominant) { + const IndexType nrownz = + (row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * block_size; + + std::uniform_real_distribution> + diag_dist(1.01 * nrownz, 2 * nrownz); + + for (IndexType ibz = row_ptrs[ibrow]; ibz < row_ptrs[ibrow + 1]; + ibz++) { + for (int i = 0; i < block_size * block_size; i++) { + vals[ibz * bs2 + i] = + gko::test::detail::get_rand_value( + off_diag_dist, rand_engine); + } + if (col_idxs[ibz] == ibrow) { + for (int i = 0; i < block_size; i++) { + vals[ibz * bs2 + i * block_size + i] = + static_cast(pow(-1, i)) * + gko::test::detail::get_rand_value( + diag_dist, rand_engine); + } + } + } + } else { + for (IndexType ibz = row_ptrs[ibrow]; ibz < row_ptrs[ibrow + 1]; + ibz++) { + for (int i = 0; i < bs2; i++) { + vals[ibz * bs2 + i] = + gko::test::detail::get_rand_value( + off_diag_dist, rand_engine); + } + } + } + } + + return fmtx; +} + + +/** + * Generates a random block CSR matrix. + * + * The block sparsity pattern of the generated matrix always has the diagonal + * entry in each block-row. + * + * @param exec Reference executor. + * @param nbrows The number of block-rows in the generated matrix. + * @param nbcols The number of block-columns in the generated matrix. + * @param mat_blk_sz Block size of the generated matrix. + * @param diag_dominant If true, a row-diagonal-dominant Fbcsr matrix is + * generated. + * @param unsort If true, the blocks of the generated matrix within each + * block-row are ordered randomly. Otherwise, blocks in each row + * are ordered by block-column index. + * @param engine Random number engine to use, such as std::ranlux48. + */ +template +std::unique_ptr> generate_random_fbcsr( + std::shared_ptr ref, const IndexType nbrows, + const IndexType nbcols, const int mat_blk_sz, const bool diag_dominant, + const bool unsort, RandEngine&& engine) +{ + using real_type = gko::remove_complex; + std::unique_ptr> rand_csr_ref = + diag_dominant + ? 
generate_random_matrix_with_diag< + matrix::Csr>( + nbrows, nbcols, + std::uniform_int_distribution(0, nbcols - 1), + std::normal_distribution(0.0, 1.0), + std::move(engine), ref) + : generate_random_matrix>( + nbrows, nbcols, + std::uniform_int_distribution(0, nbcols - 1), + std::normal_distribution(0.0, 1.0), + std::move(engine), ref); + if (unsort && rand_csr_ref->is_sorted_by_column_index()) { + unsort_matrix(rand_csr_ref.get(), engine); + } + return generate_fbcsr_from_csr(ref, rand_csr_ref.get(), mat_blk_sz, + diag_dominant, std::move(engine)); +} + + +} // namespace test +} // namespace gko + +#endif // GKO_CORE_TEST_UTILS_FB_MATRIX_GENERATOR_HPP_ diff --git a/core/test/utils/fb_matrix_generator_test.cpp b/core/test/utils/fb_matrix_generator_test.cpp new file mode 100644 index 00000000000..a37f370f8fd --- /dev/null +++ b/core/test/utils/fb_matrix_generator_test.cpp @@ -0,0 +1,175 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/test/utils/fb_matrix_generator.hpp" + + +#include +#include +#include +#include + + +#include + + +#include "accessor/block_col_major.hpp" +#include "core/base/utils.hpp" +#include "core/test/utils/matrix_generator.hpp" + + +namespace { + + +class BlockMatrixGenerator : public ::testing::Test { +protected: + using real_type = double; + using value_type = std::complex; + + BlockMatrixGenerator() + : exec(gko::ReferenceExecutor::create()), + mtx(gko::test::generate_random_matrix< + gko::matrix::Csr>( + nbrows, nbcols, std::normal_distribution(10, 5), + std::normal_distribution(20.0, 5.0), std::ranlux48(42), + exec)), + rbmtx(gko::test::generate_fbcsr_from_csr(exec, mtx.get(), blk_sz, + false, std::ranlux48(42))), + rbmtx_dd(gko::test::generate_fbcsr_from_csr(exec, mtx.get(), blk_sz, + true, std::ranlux48(42))), + cbmtx(gko::test::generate_random_fbcsr( + exec, nbrows, nbcols, blk_sz, true, false, std::ranlux48(42))) + {} + + const int nbrows = 100; + const int nbcols = nbrows; + const int blk_sz = 5; + std::shared_ptr exec; + std::unique_ptr> mtx; + std::unique_ptr> rbmtx; + std::unique_ptr> rbmtx_dd; + std::unique_ptr, int>> cbmtx; + + template + ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start, + InputIterator sample_end) + { + using std::pow; + ValueType res = 0; + ValueType num_elems = 0; + while (sample_start != sample_end) { + auto tmp = *(sample_start++); + res += pow(tmp - c, n); + num_elems += 1; + } + return res / num_elems; + } +}; + + +TEST_F(BlockMatrixGenerator, OutputHasCorrectSize) +{ + ASSERT_EQ(rbmtx->get_size(), gko::dim<2>(nbrows * blk_sz, nbcols * blk_sz)); + ASSERT_EQ(rbmtx_dd->get_size(), + gko::dim<2>(nbrows * blk_sz, nbcols * blk_sz)); + ASSERT_EQ(cbmtx->get_size(), gko::dim<2>(nbrows * blk_sz, nbcols * blk_sz)); + ASSERT_EQ(rbmtx->get_block_size(), blk_sz); + ASSERT_EQ(rbmtx_dd->get_block_size(), blk_sz); + ASSERT_EQ(cbmtx->get_block_size(), blk_sz); +} + + +TEST_F(BlockMatrixGenerator, OutputHasCorrectSparsityPattern) +{ + ASSERT_EQ(mtx->get_num_stored_elements(), + rbmtx->get_num_stored_elements() / blk_sz / blk_sz); + for (int irow = 0; irow < nbrows; irow++) { + const int start = mtx->get_const_row_ptrs()[irow]; + const int end = mtx->get_const_row_ptrs()[irow + 1]; + ASSERT_EQ(start, rbmtx->get_const_row_ptrs()[irow]); + ASSERT_EQ(end, rbmtx->get_const_row_ptrs()[irow + 1]); + for (int iz = start; iz < end; iz++) { + ASSERT_EQ(mtx->get_const_col_idxs()[iz], + rbmtx->get_const_col_idxs()[iz]); + } + } +} + + +TEST_F(BlockMatrixGenerator, ComplexOutputIsRowDiagonalDominantWhenRequested) +{ + using Dbv_t = + gko::acc::range>; + const auto nbnz = cbmtx->get_num_stored_blocks(); + const Dbv_t vals(gko::to_std_array(nbnz, blk_sz, blk_sz), + cbmtx->get_const_values()); + const int* const row_ptrs = cbmtx->get_const_row_ptrs(); + const int* const col_idxs = cbmtx->get_const_col_idxs(); + + for (int irow = 0; irow < nbrows; irow++) { + std::vector row_del_sum(blk_sz, 0.0); + std::vector diag_val(blk_sz, 0.0); + bool diagfound{false}; + for (int iz = row_ptrs[irow]; iz < row_ptrs[irow + 1]; iz++) { + if (col_idxs[iz] == irow) { + diagfound = true; + for (int i = 0; i < blk_sz; i++) { + for (int j = 0; j < blk_sz; j++) { + if (i == j) { + diag_val[i] = abs(vals(iz, i, i)); + } else { + row_del_sum[i] += abs(vals(iz, i, j)); + } + } + } + } else { + for (int i = 0; i < blk_sz; i++) { + for (int j = 0; j < blk_sz; j++) { + row_del_sum[i] += abs(vals(iz, i, j)); + } + } + 
} + } + std::vector diag_dom(blk_sz); + for (int i = 0; i < blk_sz; i++) { + diag_dom[i] = diag_val[i] - row_del_sum[i]; + } + + ASSERT_TRUE(diagfound); + for (int i = 0; i < blk_sz; i++) { + ASSERT_GT(diag_val[i], row_del_sum[i]); + } + } +} + + +} // namespace diff --git a/cuda/base/cusparse_block_bindings.hpp b/cuda/base/cusparse_block_bindings.hpp new file mode 100644 index 00000000000..32e9238e758 --- /dev/null +++ b/cuda/base/cusparse_block_bindings.hpp @@ -0,0 +1,496 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CUDA_BASE_CUSPARSE_BLOCK_BINDINGS_HPP_ +#define GKO_CUDA_BASE_CUSPARSE_BLOCK_BINDINGS_HPP_ + + +#include +#include + + +#include + + +#include "cuda/base/cusparse_bindings.hpp" +#include "cuda/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The CUSPARSE namespace. 
+ * + * @ingroup cusparse + */ +namespace cusparse { + + +/// Default storage layout within each small dense block +constexpr cusparseDirection_t blockDir = CUSPARSE_DIRECTION_COLUMN; + + +#define GKO_BIND_CUSPARSE32_BSRMV(ValueType, CusparseName) \ + inline void bsrmv(cusparseHandle_t handle, cusparseOperation_t transA, \ + int32 mb, int32 nb, int32 nnzb, const ValueType* alpha, \ + const cusparseMatDescr_t descrA, const ValueType* valA, \ + const int32* rowPtrA, const int32* colIndA, \ + int block_size, const ValueType* x, \ + const ValueType* beta, ValueType* y) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName( \ + handle, blockDir, transA, mb, nb, nnzb, as_culibs_type(alpha), \ + descrA, as_culibs_type(valA), rowPtrA, colIndA, block_size, \ + as_culibs_type(x), as_culibs_type(beta), as_culibs_type(y))); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_CUSPARSE64_BSRMV(ValueType, CusparseName) \ + inline void bsrmv(cusparseHandle_t handle, cusparseOperation_t transA, \ + int64 mb, int64 nb, int64 nnzb, const ValueType* alpha, \ + const cusparseMatDescr_t descrA, const ValueType* valA, \ + const int64* rowPtrA, const int64* colIndA, \ + int block_size, const ValueType* x, \ + const ValueType* beta, ValueType* y) \ + GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE32_BSRMV(float, cusparseSbsrmv); +GKO_BIND_CUSPARSE32_BSRMV(double, cusparseDbsrmv); +GKO_BIND_CUSPARSE32_BSRMV(std::complex, cusparseCbsrmv); +GKO_BIND_CUSPARSE32_BSRMV(std::complex, cusparseZbsrmv); +GKO_BIND_CUSPARSE64_BSRMV(float, cusparseSbsrmv); +GKO_BIND_CUSPARSE64_BSRMV(double, cusparseDbsrmv); +GKO_BIND_CUSPARSE64_BSRMV(std::complex, cusparseCbsrmv); +GKO_BIND_CUSPARSE64_BSRMV(std::complex, cusparseZbsrmv); +template +GKO_BIND_CUSPARSE32_BSRMV(ValueType, detail::not_implemented); +template +GKO_BIND_CUSPARSE64_BSRMV(ValueType, detail::not_implemented); + + +#undef GKO_BIND_CUSPARSE32_BSRMV +#undef GKO_BIND_CUSPARSE64_BSRMV + + +#define GKO_BIND_CUSPARSE32_BSRMM(ValueType, CusparseName) \ + inline void bsrmm(cusparseHandle_t handle, cusparseOperation_t transA, \ + cusparseOperation_t transB, int32 mb, int32 n, int32 kb, \ + int32 nnzb, const ValueType* alpha, \ + const cusparseMatDescr_t descrA, const ValueType* valA, \ + const int32* rowPtrA, const int32* colIndA, \ + int block_size, const ValueType* B, int32 ldb, \ + const ValueType* beta, ValueType* C, int32 ldc) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, blockDir, transA, transB, mb, n, kb, nnzb, \ + as_culibs_type(alpha), descrA, as_culibs_type(valA), \ + rowPtrA, colIndA, block_size, as_culibs_type(B), ldb, \ + as_culibs_type(beta), as_culibs_type(C), ldc)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_CUSPARSE64_BSRMM(ValueType, CusparseName) \ + inline void bsrmm( \ + cusparseHandle_t handle, cusparseOperation_t transA, \ + cusparseOperation_t transB, int64 mb, int64 n, int64 kb, int64 nnzb, \ + const ValueType* alpha, const cusparseMatDescr_t descrA, \ + const ValueType* valA, const int64* rowPtrA, const int64* colIndA, \ + int block_size, const ValueType* B, int64 ldb, const ValueType* beta, \ + ValueType* C, int64 ldc) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon 
warnings") + +GKO_BIND_CUSPARSE32_BSRMM(float, cusparseSbsrmm); +GKO_BIND_CUSPARSE32_BSRMM(double, cusparseDbsrmm); +GKO_BIND_CUSPARSE32_BSRMM(std::complex, cusparseCbsrmm); +GKO_BIND_CUSPARSE32_BSRMM(std::complex, cusparseZbsrmm); +GKO_BIND_CUSPARSE64_BSRMM(float, cusparseSbsrmm); +GKO_BIND_CUSPARSE64_BSRMM(double, cusparseDbsrmm); +GKO_BIND_CUSPARSE64_BSRMM(std::complex, cusparseCbsrmm); +GKO_BIND_CUSPARSE64_BSRMM(std::complex, cusparseZbsrmm); +template +GKO_BIND_CUSPARSE32_BSRMM(ValueType, detail::not_implemented); +template +GKO_BIND_CUSPARSE64_BSRMM(ValueType, detail::not_implemented); + + +#undef GKO_BIND_CUSPARSE32_BSRMM +#undef GKO_BIND_CUSPARSE64_BSRMM + + +template +inline int bsr_transpose_buffersize(cusparseHandle_t handle, IndexType mb, + IndexType nb, IndexType nnzb, + const ValueType* origValA, + const IndexType* origRowPtrA, + const IndexType* origColIndA, + int rowblocksize, + int colblocksize) GKO_NOT_IMPLEMENTED; + +template +inline void bsr_transpose(cusparseHandle_t handle, IndexType mb, IndexType nb, + IndexType nnzb, const ValueType* origValA, + const IndexType* origRowPtrA, + const IndexType* origColIndA, int rowblocksize, + int colblocksize, ValueType* TransValA, + IndexType* transRowIndA, IndexType* transColPtrA, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + void* pBuffer) GKO_NOT_IMPLEMENTED; + +// cuSparse does not transpose the blocks themselves, +// only the sparsity pattern +#define GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(ValueType, CusparseName) \ + template <> \ + inline int bsr_transpose_buffersize( \ + cusparseHandle_t handle, int32 mb, int32 nb, int32 nnzb, \ + const ValueType* origValA, const int32* origRowPtrA, \ + const int32* origColIndA, int rowblocksize, int colblocksize) \ + { \ + int pBufferSize = -1; \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName##_bufferSize( \ + handle, mb, nb, nnzb, as_culibs_type(origValA), origRowPtrA, \ + origColIndA, rowblocksize, colblocksize, &pBufferSize)); \ + return pBufferSize; \ + } \ + template <> \ + inline void bsr_transpose( \ + cusparseHandle_t handle, int32 mb, int32 nb, int32 nnzb, \ + const ValueType* origValA, const int32* origRowPtrA, \ + const int32* origColIndA, int rowblocksize, int colblocksize, \ + ValueType* transValA, int32* transRowIdxA, int32* transColPtrA, \ + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, \ + void* pBuffer) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, mb, nb, nnzb, as_culibs_type(origValA), \ + origRowPtrA, origColIndA, rowblocksize, colblocksize, \ + as_culibs_type(transValA), transRowIdxA, \ + transColPtrA, copyValues, idxBase, pBuffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(float, cusparseSgebsr2gebsc); +GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(double, cusparseDgebsr2gebsc); +GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(std::complex, cusparseCgebsr2gebsc); +GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(std::complex, cusparseZgebsr2gebsc); + +#undef GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32 + + +inline std::unique_ptr, + std::function> +create_bsr_trsm_info() +{ + bsrsm2Info_t info{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateBsrsm2Info(&info)); + return {info, [](bsrsm2Info_t info) { cusparseDestroyBsrsm2Info(info); }}; +} + + +inline std::unique_ptr, + std::function> +create_bilu0_info() +{ + bsrilu02Info_t info{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateBsrilu02Info(&info)); + return {info, + [](bsrilu02Info_t info) { 
cusparseDestroyBsrilu02Info(info); }}; +} + + +#define GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE(ValueType, CusparseName) \ + inline int bsrsm2_buffer_size( \ + cusparseHandle_t handle, cusparseOperation_t transA, \ + cusparseOperation_t transX, int32 mb, int32 n, int32 nnzb, \ + const cusparseMatDescr_t descr, ValueType* val, const int32* rowPtr, \ + const int32* colInd, int block_sz, bsrsm2Info_t factor_info) \ + { \ + int factor_work_size = -1; \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, blockDir, transA, transX, mb, n, nnzb, descr, \ + as_culibs_type(val), rowPtr, colInd, block_sz, \ + factor_info, &factor_work_size)); \ + return factor_work_size; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE(ValueType, CusparseName) \ + inline int64 bsrsm2_buffer_size( \ + cusparseHandle_t handle, cusparseOperation_t transA, \ + cusparseOperation_t transX, int64 mb, int64 n, int64 nnzb, \ + const cusparseMatDescr_t descr, ValueType* val, const int64* rowPtr, \ + const int64* colInd, int block_size, bsrsm2Info_t factor_info) \ + GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE(float, cusparseSbsrsm2_bufferSize); +GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE(double, cusparseDbsrsm2_bufferSize); +GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE(std::complex, + cusparseCbsrsm2_bufferSize); +GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE(std::complex, + cusparseZbsrsm2_bufferSize); +GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE(float, cusparseSbsrsm2_bufferSize); +GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE(double, cusparseDbsrsm2_bufferSize); +GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE(std::complex, + cusparseCbsrsm2_bufferSize); +GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE(std::complex, + cusparseZbsrsm2_bufferSize); +template +GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE(ValueType, detail::not_implemented); +template +GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE(ValueType, detail::not_implemented); +#undef GKO_BIND_CUSPARSE32_BSRSM_BUFFERSIZE +#undef GKO_BIND_CUSPARSE64_BSRSM_BUFFERSIZE + + +#define GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS(ValueType, CusparseName) \ + inline void bsrsm2_analysis( \ + cusparseHandle_t handle, cusparseOperation_t trans1, \ + cusparseOperation_t trans2, int32 mb, int32 n, int32 nnzb, \ + const cusparseMatDescr_t descr, const ValueType* val, \ + const int32* rowPtr, const int32* colInd, int block_size, \ + bsrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ + void* factor_work_vec) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, blockDir, trans1, trans2, mb, n, nnzb, descr, \ + as_culibs_type(val), rowPtr, colInd, block_size, \ + factor_info, policy, factor_work_vec)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS(ValueType, CusparseName) \ + inline void bsrsm2_analysis( \ + cusparseHandle_t handle, cusparseOperation_t trans1, \ + cusparseOperation_t trans2, size_type mb, size_type n, size_type nnzb, \ + const cusparseMatDescr_t descr, const ValueType* val, \ + const int64* rowPtr, const int64* colInd, int block_size, \ + bsrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ + void* factor_work_vec) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + 
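+
+// Illustrative sketch only (hypothetical, for exposition): the bsrsm2
+// wrappers defined here are meant to be chained in the usual three-phase
+// cuSPARSE workflow. Assuming `handle`, `descr`, the FBCSR arrays (`vals`,
+// `row_ptrs`, `col_idxs`), the sizes (`mb`, `n`, `nnzb`, `bs`), `policy`,
+// and the dense right-hand side/solution (`b`, `ldb`, `x`, `ldx`) are
+// already set up, a call sequence would look like:
+//
+//   auto info = create_bsr_trsm_info();
+//   const int bufsize =
+//       bsrsm2_buffer_size(handle, trans_a, trans_x, mb, n, nnzb, descr,
+//                          vals, row_ptrs, col_idxs, bs, info.get());
+//   // allocate `bufsize` bytes of device workspace `work`, then:
+//   bsrsm2_analysis(handle, trans_a, trans_x, mb, n, nnzb, descr, vals,
+//                   row_ptrs, col_idxs, bs, info.get(), policy, work);
+//   bsrsm2_solve(handle, trans_a, trans_x, mb, n, nnzb, &alpha, descr, vals,
+//                row_ptrs, col_idxs, bs, info.get(), b, ldb, x, ldx, policy,
+//                work);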
+GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS(float, cusparseSbsrsm2_analysis); +GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS(double, cusparseDbsrsm2_analysis); +GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS(std::complex, + cusparseCbsrsm2_analysis); +GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS(std::complex, + cusparseZbsrsm2_analysis); +GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS(float, cusparseSbsrsm2_analysis); +GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS(double, cusparseDbsrsm2_analysis); +GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS(std::complex, + cusparseCbsrsm2_analysis); +GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS(std::complex, + cusparseZbsrsm2_analysis); +template +GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS(ValueType, detail::not_implemented); +template +GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS(ValueType, detail::not_implemented); +#undef GKO_BIND_CUSPARSE32_BSRSM2_ANALYSIS +#undef GKO_BIND_CUSPARSE64_BSRSM2_ANALYSIS + + +#define GKO_BIND_CUSPARSE32_BSRSM2_SOLVE(ValueType, CusparseName) \ + inline void bsrsm2_solve( \ + cusparseHandle_t handle, cusparseOperation_t transA, \ + cusparseOperation_t transX, int32 mb, int32 n, int32 nnzb, \ + const ValueType* alpha, const cusparseMatDescr_t descrA, \ + const ValueType* valA, const int32* rowPtrA, const int32* colIndA, \ + int blockSizeA, bsrsm2Info_t factor_info, const ValueType* b, \ + int32 ldb, ValueType* x, int32 ldx, cusparseSolvePolicy_t policy, \ + void* factor_work_vec) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName( \ + handle, blockDir, transA, transX, mb, n, nnzb, \ + as_culibs_type(alpha), descrA, as_culibs_type(valA), rowPtrA, \ + colIndA, blockSizeA, factor_info, as_culibs_type(b), ldb, \ + as_culibs_type(x), ldx, policy, factor_work_vec)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define GKO_BIND_CUSPARSE64_BSRSM2_SOLVE(ValueType, CusparseName) \ + inline void bsrsm2_solve( \ + cusparseHandle_t handle, cusparseOperation_t trans1, \ + cusparseOperation_t trans2, size_type mb, size_type n, size_type nnzb, \ + const ValueType* alpha, const cusparseMatDescr_t descr, \ + const ValueType* val, const int64* rowPtr, const int64* colInd, \ + int block_size, bsrsm2Info_t factor_info, const ValueType* b, \ + int64 ldb, ValueType* x, int64 ldx, cusparseSolvePolicy_t policy, \ + void* factor_work_vec) GKO_NOT_IMPLEMENTED; \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE32_BSRSM2_SOLVE(float, cusparseSbsrsm2_solve); +GKO_BIND_CUSPARSE32_BSRSM2_SOLVE(double, cusparseDbsrsm2_solve); +GKO_BIND_CUSPARSE32_BSRSM2_SOLVE(std::complex, cusparseCbsrsm2_solve); +GKO_BIND_CUSPARSE32_BSRSM2_SOLVE(std::complex, cusparseZbsrsm2_solve); +GKO_BIND_CUSPARSE64_BSRSM2_SOLVE(float, cusparseSbsrsm2_solve); +GKO_BIND_CUSPARSE64_BSRSM2_SOLVE(double, cusparseDbsrsm2_solve); +GKO_BIND_CUSPARSE64_BSRSM2_SOLVE(std::complex, cusparseCbsrsm2_solve); +GKO_BIND_CUSPARSE64_BSRSM2_SOLVE(std::complex, cusparseZbsrsm2_solve); +template +GKO_BIND_CUSPARSE32_BSRSM2_SOLVE(ValueType, detail::not_implemented); +template +GKO_BIND_CUSPARSE64_BSRSM2_SOLVE(ValueType, detail::not_implemented); +#undef GKO_BIND_CUSPARSE32_BSRSM2_SOLVE +#undef GKO_BIND_CUSPARSE64_BSRSM2_SOLVE + + +template +int bilu0_buffer_size(cusparseHandle_t handle, IndexType mb, IndexType nnzb, + const cusparseMatDescr_t descr, const ValueType* vals, + const IndexType* row_ptrs, const IndexType* col_idxs, + int block_sz, bsrilu02Info_t info) GKO_NOT_IMPLEMENTED; + +#define 
GKO_BIND_CUSPARSE_BILU0_BUFFER_SIZE(ValueType, CusparseName) \ + template <> \ + inline int bilu0_buffer_size( \ + cusparseHandle_t handle, int32 mb, int32 nnzb, \ + const cusparseMatDescr_t descr, const ValueType* vals, \ + const int32* row_ptrs, const int32* col_idxs, int block_size, \ + bsrilu02Info_t info) \ + { \ + int tmp_buffer_sz{}; \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName( \ + handle, blockDir, mb, nnzb, descr, \ + as_culibs_type(const_cast(vals)), row_ptrs, col_idxs, \ + block_size, info, &tmp_buffer_sz)); \ + return tmp_buffer_sz; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_BILU0_BUFFER_SIZE(float, cusparseSbsrilu02_bufferSize); +GKO_BIND_CUSPARSE_BILU0_BUFFER_SIZE(double, cusparseDbsrilu02_bufferSize); +GKO_BIND_CUSPARSE_BILU0_BUFFER_SIZE(std::complex, + cusparseCbsrilu02_bufferSize); +GKO_BIND_CUSPARSE_BILU0_BUFFER_SIZE(std::complex, + cusparseZbsrilu02_bufferSize); + +#undef GKO_BIND_CUSPARSE_BILU0_BUFFER_SIZE + + +template +inline void bilu0_analysis(cusparseHandle_t handle, IndexType mb, + IndexType nnzb, const cusparseMatDescr_t descr, + ValueType* vals, const IndexType* row_ptrs, + const IndexType* col_idxs, int block_size, + bsrilu02Info_t info, cusparseSolvePolicy_t policy, + void* buffer) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_BILU0_ANALYSIS(ValueType, CusparseName) \ + template <> \ + inline void bilu0_analysis( \ + cusparseHandle_t handle, int32 mb, int32 nnzb, \ + const cusparseMatDescr_t descr, ValueType* vals, \ + const int32* row_ptrs, const int32* col_idxs, int block_size, \ + bsrilu02Info_t info, cusparseSolvePolicy_t policy, void* buffer) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName( \ + handle, blockDir, mb, nnzb, descr, as_culibs_type(vals), row_ptrs, \ + col_idxs, block_size, info, policy, buffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_BILU0_ANALYSIS(float, cusparseSbsrilu02_analysis); +GKO_BIND_CUSPARSE_BILU0_ANALYSIS(double, cusparseDbsrilu02_analysis); +GKO_BIND_CUSPARSE_BILU0_ANALYSIS(std::complex, + cusparseCbsrilu02_analysis); +GKO_BIND_CUSPARSE_BILU0_ANALYSIS(std::complex, + cusparseZbsrilu02_analysis); + +#undef GKO_BIND_CUSPARSE_BILU0_ANALYSIS + + +template +void bilu0(cusparseHandle_t handle, IndexType mb, IndexType nnzb, + const cusparseMatDescr_t descr, ValueType* vals, + const IndexType* row_ptrs, const IndexType* col_idxs, int block_size, + bsrilu02Info_t info, cusparseSolvePolicy_t policy, + void* buffer) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_BILU0(ValueType, CusparseName) \ + template <> \ + inline void bilu0( \ + cusparseHandle_t handle, int32 mb, int32 nnzb, \ + const cusparseMatDescr_t descr, ValueType* vals, \ + const int32* row_ptrs, const int32* col_idxs, int block_size, \ + bsrilu02Info_t info, cusparseSolvePolicy_t policy, void* buffer) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(CusparseName( \ + handle, blockDir, mb, nnzb, descr, as_culibs_type(vals), row_ptrs, \ + col_idxs, block_size, info, policy, buffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_CUSPARSE_BILU0(float, cusparseSbsrilu02); +GKO_BIND_CUSPARSE_BILU0(double, cusparseDbsrilu02); +GKO_BIND_CUSPARSE_BILU0(std::complex, cusparseCbsrilu02); +GKO_BIND_CUSPARSE_BILU0(std::complex, cusparseZbsrilu02); + +#undef GKO_BIND_CUSPARSE_BILU0 + + +} // namespace 
cusparse +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_BASE_CUSPARSE_BLOCK_BINDINGS_HPP_ diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index 30f7fa1ba96..d1d6285e839 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -376,7 +376,7 @@ void run_generic_kernel_row_reduction(syn::value_list, } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, - run_generic_kernel_row_reduction) + run_generic_kernel_row_reduction); template +#include #include #include #include diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index 2fb777b2cac..bb775cd7405 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -43,12 +43,41 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/block_sizes.hpp" +#include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" +#include "cuda/base/cublas_bindings.hpp" +#include "cuda/base/cusparse_bindings.hpp" +#include "cuda/base/cusparse_block_bindings.hpp" +#include "cuda/base/math.hpp" +#include "cuda/base/pointer_mode_guard.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/atomic.cuh" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/merging.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" namespace gko { namespace kernels { namespace cuda { + + +namespace csr_reuse { + + +constexpr int warps_in_block = 4; +constexpr int spmv_block_size = warps_in_block * config::warp_size; +constexpr int default_block_size{512}; + + +#include "common/cuda_hip/matrix/csr_kernels.hpp.inc" + + +} // namespace csr_reuse + + /** * @brief The fixed-size block compressed sparse row matrix format namespace. 
* @@ -57,22 +86,129 @@ namespace cuda { namespace fbcsr { +constexpr int default_block_size{512}; + + +#include "common/cuda_hip/components/uninitialized_array.hpp.inc" +#include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc" + + +namespace { + +template +void dense_transpose(std::shared_ptr exec, + const size_type nrows, const size_type ncols, + const size_type orig_stride, const ValueType* const orig, + const size_type trans_stride, ValueType* const trans) +{ + if (cublas::is_supported::value) { + auto handle = exec->get_cublas_handle(); + { + cublas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, nrows, ncols, &alpha, + orig, orig_stride, &beta, + static_cast(nullptr), nrows, trans, + trans_stride); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +} // namespace + + template void spmv(std::shared_ptr exec, - const matrix::Fbcsr* a, - const matrix::Dense* b, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + matrix::Dense* const c) +{ + if (cusparse::is_supported::value) { + auto handle = exec->get_cusparse_handle(); + cusparse::pointer_mode_guard pm_guard(handle); + const auto alpha = one(); + const auto beta = zero(); + auto descr = cusparse::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + assert(nrhs == c->get_size()[1]); + if (nrhs == 1) { + cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), &beta, c->get_values()); + } else { + auto trans_c = + Array(exec, c->get_size()[0] * c->get_size()[1]); + cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + &alpha, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), nrhs, &beta, + trans_c.get_data(), mb * bs); + dense_transpose(exec, nrhs, mb * bs, mb * bs, trans_c.get_data(), + c->get_stride(), c->get_values()); + } + cusparse::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Fbcsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) GKO_NOT_IMPLEMENTED; + const matrix::Dense* const alpha, + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + const matrix::Dense* const beta, + matrix::Dense* const c) +{ + if (cusparse::is_supported::value) { + auto handle = exec->get_cusparse_handle(); + const auto alphp = alpha->get_const_values(); + const auto betap = beta->get_const_values(); + auto descr = cusparse::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + assert(nrhs == c->get_size()[1]); + if (nrhs == 1) { + 
cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), betap, c->get_values()); + } else { + auto trans_c = + Array(exec, c->get_size()[0] * c->get_size()[1]); + dense_transpose(exec, mb * bs, nrhs, c->get_stride(), + c->get_values(), mb * bs, trans_c.get_data()); + cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), nrhs, betap, + trans_c.get_data(), mb * bs); + dense_transpose(exec, nrhs, mb * bs, mb * bs, trans_c.get_data(), + c->get_stride(), c->get_values()); + } + cusparse::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); @@ -103,10 +239,63 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); +namespace { + + +template +void transpose_blocks_impl(syn::value_list, + matrix::Fbcsr* const mat) +{ + constexpr int subwarp_size = config::warp_size; + const size_type nbnz = mat->get_num_stored_blocks(); + const size_type numthreads = nbnz * subwarp_size; + const size_type numblocks = ceildiv(numthreads, default_block_size); + const dim3 block_size{static_cast(default_block_size), 1, 1}; + const dim3 grid_dim{static_cast(numblocks), 1, 1}; + kernel::transpose_blocks + <<>>(nbnz, mat->get_values()); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks, + transpose_blocks_impl); + + +} // namespace + + template -void transpose(std::shared_ptr exec, - const matrix::Fbcsr* orig, - matrix::Fbcsr* trans) GKO_NOT_IMPLEMENTED; +void transpose(const std::shared_ptr exec, + const matrix::Fbcsr* const orig, + matrix::Fbcsr* const trans) +{ + if (cusparse::is_supported::value) { + const int bs = orig->get_block_size(); + const IndexType nnzb = + static_cast(orig->get_num_stored_blocks()); + cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; + cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; + const IndexType buffer_size = cusparse::bsr_transpose_buffersize( + exec->get_cusparse_handle(), orig->get_num_block_rows(), + orig->get_num_block_cols(), nnzb, orig->get_const_values(), + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs); + Array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + cusparse::bsr_transpose( + exec->get_cusparse_handle(), orig->get_num_block_rows(), + orig->get_num_block_cols(), nnzb, orig->get_const_values(), + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs, + trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(), + copyValues, idxBase, buffer); + + // transpose blocks + select_transpose_blocks( + fixedblock::compiled_kernels(), + [bs](int compiled_block_size) { return bs == compiled_block_size; }, + syn::value_list(), syn::type_list<>(), trans); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); @@ -116,7 +305,13 @@ template void conj_transpose(std::shared_ptr exec, const matrix::Fbcsr* orig, matrix::Fbcsr* trans) - GKO_NOT_IMPLEMENTED; +{ + const int grid_size = + ceildiv(trans->get_num_stored_elements(), default_block_size); + transpose(exec, orig, trans); + csr_reuse::conjugate_kernel<<>>( + trans->get_num_stored_elements(), as_cuda_type(trans->get_values())); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); @@ -125,8 +320,31 
@@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void calculate_max_nnz_per_row( std::shared_ptr exec, - const matrix::Fbcsr* source, - size_type* result) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const source, + size_type* const result) +{ + const auto num_b_rows = source->get_num_block_rows(); + const auto bs = source->get_block_size(); + + auto nnz_per_row = Array(exec, num_b_rows); + auto block_results = Array(exec, default_block_size); + auto d_result = Array(exec, 1); + + const auto grid_dim = ceildiv(num_b_rows, default_block_size); + csr_reuse::kernel::calculate_nnz_per_row<<>>( + num_b_rows, as_cuda_type(source->get_const_row_ptrs()), + nnz_per_row.get_data()); + + const auto n = ceildiv(num_b_rows, default_block_size); + const auto reduce_dim = n <= default_block_size ? n : default_block_size; + csr_reuse::kernel::reduce_max_nnz<<>>( + num_b_rows, nnz_per_row.get_const_data(), block_results.get_data()); + + csr_reuse::kernel::reduce_max_nnz<<<1, default_block_size>>>( + reduce_dim, block_results.get_const_data(), d_result.get_data()); + + *result = bs * exec->copy_val_to_host(d_result.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); @@ -145,8 +363,23 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void is_sorted_by_column_index( std::shared_ptr exec, - const matrix::Fbcsr* to_check, - bool* is_sorted) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const to_check, + bool* const is_sorted) +{ + *is_sorted = true; + auto gpu_array = Array(exec, 1); + // need to initialize the GPU value to true + exec->copy_from(exec->get_master().get(), 1, is_sorted, + gpu_array.get_data()); + auto block_size = default_block_size; + const auto num_brows = + static_cast(to_check->get_num_block_rows()); + const auto num_blocks = ceildiv(num_brows, block_size); + csr_reuse::kernel::check_unsorted<<>>( + to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), + num_brows, gpu_array.get_data()); + *is_sorted = exec->copy_val_to_host(gpu_array.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); diff --git a/cuda/test/matrix/fbcsr_kernels.cpp b/cuda/test/matrix/fbcsr_kernels.cpp index 860d6554584..8e34e93de81 100644 --- a/cuda/test/matrix/fbcsr_kernels.cpp +++ b/cuda/test/matrix/fbcsr_kernels.cpp @@ -33,28 +33,47 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include + + #include #include +#include "core/matrix/fbcsr_kernels.hpp" #include "core/test/matrix/fbcsr_sample.hpp" +#include "core/test/utils.hpp" +#include "core/test/utils/fb_matrix_generator.hpp" #include "cuda/test/utils.hpp" namespace { +template class Fbcsr : public ::testing::Test { protected: - using Mtx = gko::matrix::Fbcsr<>; + using value_type = T; + using index_type = int; + using real_type = gko::remove_complex; + using Mtx = gko::matrix::Fbcsr; + using Dense = gko::matrix::Dense; + + Fbcsr() : distb(), engine(42) {} void SetUp() { ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0); ref = gko::ReferenceExecutor::create(); cuda = gko::CudaExecutor::create(0, ref); + const index_type rand_brows = 100; + const index_type rand_bcols = 70; + const int block_size = 3; + rsorted_ref = gko::test::generate_random_fbcsr( + ref, rand_brows, rand_bcols, block_size, false, false, + std::ranlux48(43)); } void TearDown() @@ -67,18 +86,39 @@ class Fbcsr : public ::testing::Test { std::shared_ptr ref; std::shared_ptr cuda; - std::unique_ptr mtx; + std::unique_ptr rsorted_ref; + + std::normal_distribution> distb; + std::ranlux48 engine; + + value_type get_random_value() + { + return gko::test::detail::get_rand_value(distb, engine); + } + + void generate_sin(Dense* const x) + { + value_type* const xarr = x->get_values(); + for (index_type i = 0; i < x->get_size()[0] * x->get_size()[1]; i++) { + xarr[i] = + static_cast(2.0) * + std::sin(static_cast(i / 2.0) + get_random_value()); + } + } }; +TYPED_TEST_SUITE(Fbcsr, gko::test::ValueTypes, TypenameNameGenerator); -TEST_F(Fbcsr, CanWriteFromMatrixOnDevice) + +TYPED_TEST(Fbcsr, CanWriteFromMatrixOnDevice) { - using value_type = Mtx::value_type; - using index_type = Mtx::index_type; + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + using index_type = typename Mtx::index_type; using MatData = gko::matrix_data; - gko::testing::FbcsrSample sample(ref); + gko::testing::FbcsrSample sample(this->ref); auto refmat = sample.generate_fbcsr(); - auto cudamat = gko::clone(cuda, refmat); + auto cudamat = gko::clone(this->cuda, refmat); MatData refdata; MatData cudadata; @@ -89,4 +129,267 @@ TEST_F(Fbcsr, CanWriteFromMatrixOnDevice) } +TYPED_TEST(Fbcsr, TransposeIsEquivalentToRefSortedBS3) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + using index_type = typename Mtx::index_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + auto trans_ref_linop = this->rsorted_ref->transpose(); + std::unique_ptr trans_ref = + gko::as(std::move(trans_ref_linop)); + + auto trans_cuda_linop = rand_cuda->transpose(); + std::unique_ptr trans_cuda = + gko::as(std::move(trans_cuda_linop)); + + GKO_ASSERT_MTX_EQ_SPARSITY(trans_ref, trans_cuda); + GKO_ASSERT_MTX_NEAR(trans_ref, trans_cuda, 0.0); +} + + +TYPED_TEST(Fbcsr, TransposeIsEquivalentToRefSortedBS7) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + using index_type = typename Mtx::index_type; + auto rand_cuda = Mtx::create(this->cuda); + const index_type rand_brows = 50; + const index_type rand_bcols = 40; + const int block_size = 7; + auto rsorted_ref2 = + gko::test::generate_random_fbcsr( + this->ref, rand_brows, rand_bcols, block_size, false, false, + std::ranlux48(43)); + rand_cuda->copy_from(gko::lend(rsorted_ref2)); + + auto trans_ref_linop = rsorted_ref2->transpose(); + std::unique_ptr trans_ref = + 
gko::as(std::move(trans_ref_linop)); + auto trans_cuda_linop = rand_cuda->transpose(); + std::unique_ptr trans_cuda = + gko::as(std::move(trans_cuda_linop)); + + GKO_ASSERT_MTX_EQ_SPARSITY(trans_ref, trans_cuda); + GKO_ASSERT_MTX_NEAR(trans_ref, trans_cuda, 0.0); +} + + +TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) +{ + using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Dense; + using value_type = typename Mtx::value_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + auto x_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[1], 1)); + this->generate_sin(x_ref.get()); + auto x_cuda = Dense::create(this->cuda); + x_cuda->copy_from(x_ref.get()); + auto prod_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 1)); + auto prod_cuda = Dense::create(this->cuda, prod_ref->get_size()); + + rand_cuda->apply(x_cuda.get(), prod_cuda.get()); + this->rsorted_ref->apply(x_ref.get(), prod_ref.get()); + + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_cuda, 5 * tol); +} + + +TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) +{ + using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Dense; + using value_type = typename Mtx::value_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + auto x_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[1], 3)); + this->generate_sin(x_ref.get()); + auto x_cuda = Dense::create(this->cuda); + x_cuda->copy_from(x_ref.get()); + auto prod_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 3)); + auto prod_cuda = Dense::create(this->cuda, prod_ref->get_size()); + + rand_cuda->apply(x_cuda.get(), prod_cuda.get()); + this->rsorted_ref->apply(x_ref.get(), prod_ref.get()); + + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_cuda, 5 * tol); +} + + +TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted) +{ + using Mtx = typename TestFixture::Mtx; + using Dense = typename TestFixture::Dense; + using value_type = typename TestFixture::value_type; + using real_type = typename TestFixture::real_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + auto x_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[1], 1)); + this->generate_sin(x_ref.get()); + auto x_cuda = Dense::create(this->cuda); + x_cuda->copy_from(x_ref.get()); + auto prod_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 1)); + this->generate_sin(prod_ref.get()); + auto prod_cuda = Dense::create(this->cuda); + prod_cuda->copy_from(prod_ref.get()); + auto alpha_ref = Dense::create(this->ref, gko::dim<2>(1, 1)); + alpha_ref->get_values()[0] = + static_cast(2.4) + this->get_random_value(); + auto beta_ref = Dense::create(this->ref, gko::dim<2>(1, 1)); + beta_ref->get_values()[0] = -1.2; + auto alpha = Dense::create(this->cuda); + alpha->copy_from(alpha_ref.get()); + auto beta = Dense::create(this->cuda); + beta->copy_from(beta_ref.get()); + + rand_cuda->apply(alpha.get(), x_cuda.get(), beta.get(), prod_cuda.get()); + this->rsorted_ref->apply(alpha_ref.get(), x_ref.get(), beta_ref.get(), + prod_ref.get()); + + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_cuda, 5 * tol); +} + + +TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) +{ + using Mtx = typename TestFixture::Mtx; + using Dense = typename 
TestFixture::Dense; + using value_type = typename TestFixture::value_type; + using real_type = typename TestFixture::real_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + auto x_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[1], 3)); + this->generate_sin(x_ref.get()); + auto x_cuda = Dense::create(this->cuda); + x_cuda->copy_from(x_ref.get()); + auto prod_ref = Dense::create( + this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 3)); + this->generate_sin(prod_ref.get()); + auto prod_cuda = Dense::create(this->cuda); + prod_cuda->copy_from(prod_ref.get()); + auto alpha_ref = Dense::create(this->ref, gko::dim<2>(1, 1)); + alpha_ref->get_values()[0] = + static_cast(2.4) + this->get_random_value(); + auto beta_ref = Dense::create(this->ref, gko::dim<2>(1, 1)); + beta_ref->get_values()[0] = -1.2; + auto alpha = Dense::create(this->cuda); + alpha->copy_from(alpha_ref.get()); + auto beta = Dense::create(this->cuda); + beta->copy_from(beta_ref.get()); + + rand_cuda->apply(alpha.get(), x_cuda.get(), beta.get(), prod_cuda.get()); + this->rsorted_ref->apply(alpha_ref.get(), x_ref.get(), beta_ref.get(), + prod_ref.get()); + + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_cuda, 5 * tol); +} + + +TYPED_TEST(Fbcsr, ConjTransposeIsEquivalentToRefSortedBS3) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + using index_type = typename Mtx::index_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + auto trans_ref_linop = this->rsorted_ref->conj_transpose(); + std::unique_ptr trans_ref = + gko::as(std::move(trans_ref_linop)); + + auto trans_cuda_linop = rand_cuda->conj_transpose(); + std::unique_ptr trans_cuda = + gko::as(std::move(trans_cuda_linop)); + + GKO_ASSERT_MTX_EQ_SPARSITY(trans_ref, trans_cuda); + GKO_ASSERT_MTX_NEAR(trans_ref, trans_cuda, 0.0); +} + + +TYPED_TEST(Fbcsr, MaxNnzPerRowIsEquivalentToRefSortedBS3) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + using index_type = typename Mtx::index_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + gko::size_type ref_max_nnz{}, cuda_max_nnz{}; + + gko::kernels::cuda::fbcsr::calculate_max_nnz_per_row( + this->cuda, rand_cuda.get(), &cuda_max_nnz); + gko::kernels::reference::fbcsr::calculate_max_nnz_per_row( + this->ref, this->rsorted_ref.get(), &ref_max_nnz); + + ASSERT_EQ(ref_max_nnz, cuda_max_nnz); +} + + +TYPED_TEST(Fbcsr, RecognizeSortedMatrix) +{ + using Mtx = typename TestFixture::Mtx; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + + ASSERT_TRUE(rand_cuda->is_sorted_by_column_index()); +} + + +TYPED_TEST(Fbcsr, RecognizeUnsortedMatrix) +{ + using Mtx = typename TestFixture::Mtx; + using index_type = typename TestFixture::index_type; + auto mat = this->rsorted_ref->clone(); + index_type* const colinds = mat->get_col_idxs(); + std::swap(colinds[0], colinds[1]); + auto unsrt_cuda = Mtx::create(this->cuda); + unsrt_cuda->copy_from(gko::lend(mat)); + + ASSERT_FALSE(unsrt_cuda->is_sorted_by_column_index()); +} + + +TYPED_TEST(Fbcsr, InplaceAbsoluteMatrixIsEquivalentToRef) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + auto rand_ref = Mtx::create(this->ref); + rand_ref->copy_from(this->rsorted_ref.get()); + auto rand_cuda = Mtx::create(this->cuda); + 
rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + + rand_ref->compute_absolute_inplace(); + rand_cuda->compute_absolute_inplace(); + + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(rand_ref, rand_cuda, tol); +} + + +TYPED_TEST(Fbcsr, OutplaceAbsoluteMatrixIsEquivalentToRef) +{ + using Mtx = typename TestFixture::Mtx; + using value_type = typename Mtx::value_type; + auto rand_cuda = Mtx::create(this->cuda); + rand_cuda->copy_from(gko::lend(this->rsorted_ref)); + + auto abs_mtx = this->rsorted_ref->compute_absolute(); + auto dabs_mtx = rand_cuda->compute_absolute(); + + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(abs_mtx, dabs_mtx, tol); +} + + } // namespace diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 40e4268dccb..610f89673a9 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -382,7 +382,7 @@ void run_generic_kernel_row_reduction(syn::value_list, } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, - run_generic_kernel_row_reduction) + run_generic_kernel_row_reduction); template , } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_sized, - run_kernel_sized_impl) + run_kernel_sized_impl); template diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index 030da93c245..7815e55badc 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -146,7 +146,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, - run_kernel_reduction_sized_impl) + run_kernel_reduction_sized_impl); } // namespace @@ -358,7 +358,7 @@ void run_kernel_col_reduction_sized_impl( } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_col_reduction_sized, - run_kernel_col_reduction_sized_impl) + run_kernel_col_reduction_sized_impl); } // namespace diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index 3396a18dc9e..5bf5272ec73 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -47,6 +47,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "accessor/block_col_major.hpp" +#include "core/base/block_sizes.hpp" +#include "core/base/iterator_factory.hpp" +#include "core/base/utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "omp/components/format_conversion.hpp" + + namespace gko { namespace kernels { namespace omp { @@ -62,7 +70,38 @@ template void spmv(std::shared_ptr exec, const matrix::Fbcsr* const a, const matrix::Dense* const b, - matrix::Dense* const c) GKO_NOT_IMPLEMENTED; + matrix::Dense* const c) +{ + const int bs = a->get_block_size(); + const auto nvecs = static_cast(b->get_size()[1]); + const IndexType nbrows = a->get_num_block_rows(); + const size_type nbnz = a->get_num_stored_blocks(); + auto row_ptrs = a->get_const_row_ptrs(); + auto col_idxs = a->get_const_col_idxs(); + const acc::range> avalues{ + to_std_array(nbnz, bs, bs), a->get_const_values()}; + +#pragma omp parallel for + for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) { + for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs; + ++i) { + c->get_values()[i] = zero(); + } + for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1]; + ++inz) { + for (int ib = 0; ib < bs; ib++) { + const IndexType row = ibrow * bs + ib; + for (int jb = 0; jb < bs; jb++) { + const auto val = avalues(inz, ib, jb); + const auto col = col_idxs[inz] * bs + jb; + for (size_type j = 0; j < nvecs; ++j) { + c->at(row, j) += val * b->at(col, j); + } + } + } + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL); @@ -73,7 +112,39 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Fbcsr* const a, const matrix::Dense* const b, const matrix::Dense* const beta, - matrix::Dense* const c) GKO_NOT_IMPLEMENTED; + matrix::Dense* const c) +{ + const int bs = a->get_block_size(); + const auto nvecs = static_cast(b->get_size()[1]); + const IndexType nbrows = a->get_num_block_rows(); + const size_type nbnz = a->get_num_stored_blocks(); + auto row_ptrs = a->get_const_row_ptrs(); + auto col_idxs = a->get_const_col_idxs(); + auto valpha = alpha->at(0, 0); + auto vbeta = beta->at(0, 0); + const acc::range> avalues{ + to_std_array(nbnz, bs, bs), a->get_const_values()}; + +#pragma omp parallel for + for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) { + for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs; + ++i) + c->get_values()[i] *= vbeta; + + for (IndexType inz = row_ptrs[ibrow]; inz < row_ptrs[ibrow + 1]; + ++inz) { + for (int ib = 0; ib < bs; ib++) { + const IndexType row = ibrow * bs + ib; + for (int jb = 0; jb < bs; jb++) { + const auto val = avalues(inz, ib, jb); + const auto col = col_idxs[inz] * bs + jb; + for (size_type j = 0; j < nvecs; ++j) + c->at(row, j) += valpha * val * b->at(col, j); + } + } + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL); @@ -99,27 +170,72 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL); -template -inline void convert_fbcsr_to_csc( - size_type num_rows, const IndexType* const row_ptrs, - const IndexType* const col_idxs, const ValueType* const fbcsr_vals, - IndexType* const row_idxs, IndexType* const col_ptrs, - ValueType* const csc_vals, UnaryOperator op) GKO_NOT_IMPLEMENTED; +template +void convert_fbcsr_to_fbcsc(const IndexType num_blk_rows, const int blksz, + const IndexType* const row_ptrs, + const IndexType* const col_idxs, + const ValueType* const fbcsr_vals, + IndexType* const row_idxs, + IndexType* const col_ptrs, + ValueType* const csc_vals, 
UnaryOperator op) +{ + auto sizes = + gko::to_std_array(row_ptrs[num_blk_rows], blksz, blksz); + const acc::range> rvalues( + sizes, fbcsr_vals); + acc::range> cvalues(sizes, csc_vals); + for (IndexType brow = 0; brow < num_blk_rows; ++brow) { + for (auto i = row_ptrs[brow]; i < row_ptrs[brow + 1]; ++i) { + const auto dest_idx = col_ptrs[col_idxs[i]]; + col_ptrs[col_idxs[i]]++; + row_idxs[dest_idx] = brow; + for (int ib = 0; ib < blksz; ib++) { + for (int jb = 0; jb < blksz; jb++) { + cvalues(dest_idx, ib, jb) = + op(transpose_blocks ? rvalues(i, jb, ib) + : rvalues(i, ib, jb)); + } + } + } + } +} template void transpose_and_transform( - std::shared_ptr exec, matrix::Fbcsr* const trans, - const matrix::Fbcsr* const orig, - UnaryOperator op) GKO_NOT_IMPLEMENTED; + const matrix::Fbcsr* const orig, UnaryOperator op) +{ + const int bs = orig->get_block_size(); + auto trans_row_ptrs = trans->get_row_ptrs(); + auto orig_row_ptrs = orig->get_const_row_ptrs(); + auto trans_col_idxs = trans->get_col_idxs(); + auto orig_col_idxs = orig->get_const_col_idxs(); + auto trans_vals = trans->get_values(); + auto orig_vals = orig->get_const_values(); + + const IndexType nbcols = orig->get_num_block_cols(); + const IndexType nbrows = orig->get_num_block_rows(); + auto orig_nbnz = orig_row_ptrs[nbrows]; + + trans_row_ptrs[0] = 0; + convert_unsorted_idxs_to_ptrs(orig_col_idxs, orig_nbnz, trans_row_ptrs + 1, + nbcols); + + convert_fbcsr_to_fbcsc( + nbrows, bs, orig_row_ptrs, orig_col_idxs, orig_vals, trans_col_idxs, + trans_row_ptrs + 1, trans_vals, op); +} template void transpose(std::shared_ptr exec, const matrix::Fbcsr* const orig, matrix::Fbcsr* const trans) - GKO_NOT_IMPLEMENTED; +{ + transpose_and_transform(trans, orig, [](const ValueType x) { return x; }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL); @@ -129,7 +245,10 @@ template void conj_transpose(std::shared_ptr exec, const matrix::Fbcsr* const orig, matrix::Fbcsr* const trans) - GKO_NOT_IMPLEMENTED; +{ + transpose_and_transform(trans, orig, + [](const ValueType x) { return conj(x); }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL); @@ -139,7 +258,22 @@ template void calculate_max_nnz_per_row( std::shared_ptr exec, const matrix::Fbcsr* const source, - size_type* const result) GKO_NOT_IMPLEMENTED; + size_type* const result) +{ + const auto num_rows = source->get_size()[0]; + const auto row_ptrs = source->get_const_row_ptrs(); + const int bs = source->get_block_size(); + IndexType max_nnz = 0; + +#pragma omp parallel for reduction(max : max_nnz) + for (size_type i = 0; i < num_rows; i++) { + const size_type ibrow = i / bs; + max_nnz = + std::max((row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * bs, max_nnz); + } + + *result = max_nnz; +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); @@ -149,7 +283,19 @@ template void calculate_nonzeros_per_row( std::shared_ptr exec, const matrix::Fbcsr* const source, - Array* const result) GKO_NOT_IMPLEMENTED; + Array* const result) +{ + const auto row_ptrs = source->get_const_row_ptrs(); + auto row_nnz_val = result->get_data(); + const int bs = source->get_block_size(); + assert(result->get_num_elems() == source->get_size()[0]); + +#pragma omp parallel for + for (size_type i = 0; i < result->get_num_elems(); i++) { + const size_type ibrow = i / bs; + row_nnz_val[i] = (row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * bs; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( 
GKO_DECLARE_FBCSR_CALCULATE_NONZEROS_PER_ROW_KERNEL); @@ -159,16 +305,81 @@ template void is_sorted_by_column_index( std::shared_ptr exec, const matrix::Fbcsr* const to_check, - bool* const is_sorted) GKO_NOT_IMPLEMENTED; + bool* const is_sorted) +{ + const auto row_ptrs = to_check->get_const_row_ptrs(); + const auto col_idxs = to_check->get_const_col_idxs(); + const auto size = to_check->get_num_block_rows(); + bool local_is_sorted = true; +#pragma omp parallel for reduction(&& : local_is_sorted) + for (size_type i = 0; i < size; ++i) { + // Skip comparison if any thread detects that it is not sorted + if (local_is_sorted) { + for (auto idx = row_ptrs[i] + 1; idx < row_ptrs[i + 1]; ++idx) { + if (col_idxs[idx - 1] > col_idxs[idx]) { + local_is_sorted = false; + break; + } + } + } + } + *is_sorted = local_is_sorted; +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); +namespace { + +template +void sort_by_column_index_impl( + syn::value_list, + matrix::Fbcsr* const to_sort) +{ + auto row_ptrs = to_sort->get_const_row_ptrs(); + auto col_idxs = to_sort->get_col_idxs(); + auto values = to_sort->get_values(); + const auto nbrows = to_sort->get_num_block_rows(); + constexpr int bs2 = mat_blk_sz * mat_blk_sz; +#pragma omp parallel for + for (IndexType irow = 0; irow < nbrows; ++irow) { + IndexType* const brow_col_idxs = col_idxs + row_ptrs[irow]; + ValueType* const brow_vals = values + row_ptrs[irow] * bs2; + const IndexType nbnz_brow = row_ptrs[irow + 1] - row_ptrs[irow]; + + std::vector col_permute(nbnz_brow); + std::iota(col_permute.begin(), col_permute.end(), 0); + auto helper = detail::IteratorFactory( + brow_col_idxs, col_permute.data(), nbnz_brow); + std::sort(helper.begin(), helper.end()); + + std::vector oldvalues(nbnz_brow * bs2); + std::copy(brow_vals, brow_vals + nbnz_brow * bs2, oldvalues.begin()); + for (IndexType ibz = 0; ibz < nbnz_brow; ibz++) { + for (int i = 0; i < bs2; i++) { + brow_vals[ibz * bs2 + i] = + oldvalues[col_permute[ibz] * bs2 + i]; + } + } + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_sort_col_idx, + sort_by_column_index_impl); + +} // namespace + template void sort_by_column_index(const std::shared_ptr exec, matrix::Fbcsr* const to_sort) - GKO_NOT_IMPLEMENTED; +{ + const int bs = to_sort->get_block_size(); + select_sort_col_idx( + fixedblock::compiled_kernels(), + [bs](int compiled_block_size) { return bs == compiled_block_size; }, + syn::value_list(), syn::type_list<>(), to_sort); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX); @@ -178,7 +389,34 @@ template void extract_diagonal(std::shared_ptr exec, const matrix::Fbcsr* const orig, matrix::Diagonal* const diag) - GKO_NOT_IMPLEMENTED; +{ + const auto row_ptrs = orig->get_const_row_ptrs(); + const auto col_idxs = orig->get_const_col_idxs(); + const auto values = orig->get_const_values(); + const int bs = orig->get_block_size(); + const IndexType nbrows = orig->get_num_block_rows(); + const IndexType nbdim_min = + std::min(orig->get_num_block_rows(), orig->get_num_block_cols()); + auto diag_values = diag->get_values(); + + assert(diag->get_size()[0] == nbdim_min * bs); + + const acc::range> vblocks( + gko::to_std_array(row_ptrs[nbrows], bs, bs), values); + +#pragma omp parallel for + for (IndexType ibrow = 0; ibrow < nbdim_min; ++ibrow) { + for (IndexType idx = row_ptrs[ibrow]; idx < row_ptrs[ibrow + 1]; + ++idx) { + if (col_idxs[idx] == ibrow) { + for (int ib = 0; ib < bs; ib++) { + diag_values[ibrow * bs + ib] = 
vblocks(idx, ib, ib); + } + break; + } + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL); diff --git a/omp/test/matrix/fbcsr_kernels.cpp b/omp/test/matrix/fbcsr_kernels.cpp index 4fe014a46cb..49d61a4c26a 100644 --- a/omp/test/matrix/fbcsr_kernels.cpp +++ b/omp/test/matrix/fbcsr_kernels.cpp @@ -33,14 +33,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include +#include +#include + + #include +#include #include +#include +#include +#include +#include "core/matrix/fbcsr_kernels.hpp" #include "core/test/matrix/fbcsr_sample.hpp" #include "core/test/utils.hpp" +#include "core/test/utils/fb_matrix_generator.hpp" namespace { @@ -48,7 +60,15 @@ namespace { class Fbcsr : public ::testing::Test { protected: - using Mtx = gko::matrix::Fbcsr<>; + using real_type = double; + using index_type = int; + using Arr = gko::Array; + using Mtx = gko::matrix::Fbcsr; + using Vec = gko::matrix::Dense; + using ComplexVec = gko::matrix::Dense>; + using ComplexMtx = gko::matrix::Fbcsr>; + + Fbcsr() : rand_engine(42) {} void SetUp() { @@ -63,10 +83,85 @@ class Fbcsr : public ::testing::Test { } } - std::shared_ptr ref; + + template + std::unique_ptr gen_mtx(int num_rows, int num_cols, + int min_nnz_row) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(min_nnz_row, num_cols), + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); + } + + void set_up_apply_data(int num_vectors = 1) + { + mtx = gko::test::generate_random_fbcsr( + ref, num_brows, num_bcols, blk_sz, false, false, rand_engine); + complex_mtx = gko::test::generate_random_fbcsr>( + ref, num_brows, num_bcols, blk_sz, false, false, rand_engine); + square_mtx = gko::test::generate_random_fbcsr( + ref, num_brows, num_brows, blk_sz, false, false, rand_engine); + dmtx = Mtx::create(omp); + dmtx->copy_from(mtx.get()); + complex_dmtx = ComplexMtx::create(omp); + complex_dmtx->copy_from(complex_mtx.get()); + square_dmtx = Mtx::create(omp); + square_dmtx->copy_from(square_mtx.get()); + expected = gen_mtx(num_brows * blk_sz, num_vectors, 1); + y = gen_mtx(num_bcols * blk_sz, num_vectors, 1); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + dresult = Vec::create(omp); + dresult->copy_from(expected.get()); + dy = Vec::create(omp); + dy->copy_from(y.get()); + dalpha = Vec::create(omp); + dalpha->copy_from(alpha.get()); + dbeta = Vec::create(omp); + dbeta->copy_from(beta.get()); + } + + struct matrix_pair { + std::unique_ptr ref; + std::unique_ptr omp; + }; + + matrix_pair gen_unsorted_mtx() + { + constexpr int min_nnz_per_row{2}; + auto local_mtx_ref = gko::test::generate_random_fbcsr( + ref, num_brows, num_bcols, blk_sz, false, true, rand_engine); + + auto local_mtx_omp = Mtx::create(omp); + local_mtx_omp->copy_from(local_mtx_ref.get()); + + return matrix_pair{std::move(local_mtx_ref), std::move(local_mtx_omp)}; + } + + std::shared_ptr ref; std::shared_ptr omp; + const index_type num_brows = 112; + const index_type num_bcols = 31; + const int blk_sz = 3; + std::ranlux48 rand_engine; + std::unique_ptr mtx; + std::unique_ptr complex_mtx; + std::unique_ptr square_mtx; + std::unique_ptr expected; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + + std::unique_ptr dmtx; + std::unique_ptr complex_dmtx; + std::unique_ptr square_dmtx; + std::unique_ptr dresult; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; }; @@ -88,4 +183,240 @@ TEST_F(Fbcsr, 
CanWriteFromMatrixOnDevice) } +TEST_F(Fbcsr, SimpleApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Fbcsr, SimpleApplyToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(3); + + mtx->apply(y.get(), expected.get()); + dmtx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Fbcsr, SimpleApplyToDenseMatrixIsEquivalentToRefUnsorted) +{ + set_up_apply_data(3); + auto pair = gen_unsorted_mtx(); + + pair.ref->apply(y.get(), expected.get()); + pair.omp->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Fbcsr, AdvancedApplyToDenseMatrixIsEquivalentToRef) +{ + set_up_apply_data(3); + + mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Fbcsr, ApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(3); + auto complex_b = gen_mtx(num_bcols * blk_sz, 3, 1); + auto dcomplex_b = ComplexVec::create(omp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(num_brows * blk_sz, 3, 1); + auto dcomplex_x = ComplexVec::create(omp); + dcomplex_x->copy_from(complex_x.get()); + + mtx->apply(complex_b.get(), complex_x.get()); + dmtx->apply(dcomplex_b.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); +} + + +TEST_F(Fbcsr, AdvancedApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(3); + auto complex_b = gen_mtx(num_bcols * blk_sz, 3, 1); + auto dcomplex_b = ComplexVec::create(omp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(num_brows * blk_sz, 3, 1); + auto dcomplex_x = ComplexVec::create(omp); + dcomplex_x->copy_from(complex_x.get()); + + mtx->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); + dmtx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); +} + + +TEST_F(Fbcsr, TransposeIsEquivalentToRef) +{ + set_up_apply_data(); + + auto trans = gko::as(mtx->transpose()); + auto d_trans = gko::as(dmtx->transpose()); + + GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0); + ASSERT_TRUE(d_trans->is_sorted_by_column_index()); +} + + +TEST_F(Fbcsr, ConjugateTransposeIsEquivalentToRef) +{ + set_up_apply_data(); + + auto trans = gko::as(complex_mtx->conj_transpose()); + auto d_trans = gko::as(complex_dmtx->conj_transpose()); + + GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0); + ASSERT_TRUE(d_trans->is_sorted_by_column_index()); +} + + +TEST_F(Fbcsr, CalculatesNonzerosPerRow) +{ + set_up_apply_data(); + gko::Array row_nnz(ref, mtx->get_size()[0]); + gko::Array drow_nnz(omp, dmtx->get_size()[0]); + + gko::kernels::reference::fbcsr::calculate_nonzeros_per_row(ref, mtx.get(), + &row_nnz); + gko::kernels::omp::fbcsr::calculate_nonzeros_per_row(omp, dmtx.get(), + &drow_nnz); + + GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz); +} + + +TEST_F(Fbcsr, RecognizeSortedMatrix) +{ + set_up_apply_data(); + bool is_sorted_omp{}; + + is_sorted_omp = dmtx->is_sorted_by_column_index(); + + ASSERT_TRUE(is_sorted_omp); +} + + +TEST_F(Fbcsr, RecognizeUnsortedMatrix) +{ + auto uns_mtx = gen_unsorted_mtx(); + bool is_sorted_omp{}; + + is_sorted_omp = uns_mtx.omp->is_sorted_by_column_index(); + + ASSERT_FALSE(is_sorted_omp); +} + + +TEST_F(Fbcsr, SortSortedMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + 
mtx->sort_by_column_index(); + dmtx->sort_by_column_index(); + + GKO_ASSERT_MTX_NEAR(mtx, dmtx, 0); + ASSERT_TRUE(dmtx->is_sorted_by_column_index()); +} + + +TEST_F(Fbcsr, SortUnsortedMatrixIsEquivalentToRef) +{ + auto uns_mtx = gen_unsorted_mtx(); + + uns_mtx.ref->sort_by_column_index(); + uns_mtx.omp->sort_by_column_index(); + + GKO_ASSERT_MTX_NEAR(uns_mtx.ref, uns_mtx.omp, 0); + ASSERT_TRUE(uns_mtx.omp->is_sorted_by_column_index()); +} + + +TEST_F(Fbcsr, ExtractDiagonalIsEquivalentToRef) +{ + set_up_apply_data(); + + auto diag = mtx->extract_diagonal(); + auto ddiag = dmtx->extract_diagonal(); + + GKO_ASSERT_MTX_NEAR(diag.get(), ddiag.get(), 0); +} + + +TEST_F(Fbcsr, InplaceAbsoluteMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + mtx->compute_absolute_inplace(); + dmtx->compute_absolute_inplace(); + + GKO_ASSERT_MTX_NEAR(mtx, dmtx, 1e-14); +} + + +TEST_F(Fbcsr, OutplaceAbsoluteMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + auto abs_mtx = mtx->compute_absolute(); + auto dabs_mtx = dmtx->compute_absolute(); + + GKO_ASSERT_MTX_NEAR(abs_mtx, dabs_mtx, 1e-14); +} + + +TEST_F(Fbcsr, InplaceAbsoluteComplexMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + complex_mtx->compute_absolute_inplace(); + complex_dmtx->compute_absolute_inplace(); + + GKO_ASSERT_MTX_NEAR(complex_mtx, complex_dmtx, 1e-14); +} + + +TEST_F(Fbcsr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + auto abs_mtx = complex_mtx->compute_absolute(); + auto dabs_mtx = complex_dmtx->compute_absolute(); + + GKO_ASSERT_MTX_NEAR(abs_mtx, dabs_mtx, 1e-14); +} + + +TEST_F(Fbcsr, MaxNnzPerRowIsEquivalentToRefSortedBS3) +{ + auto mtx_ref = gko::test::generate_random_fbcsr( + ref, num_brows, num_bcols, blk_sz, false, false, rand_engine); + auto rand_omp = Mtx::create(omp); + rand_omp->copy_from(gko::lend(mtx_ref)); + gko::size_type ref_max_nnz{}, omp_max_nnz{}; + + gko::kernels::omp::fbcsr::calculate_max_nnz_per_row( + this->omp, rand_omp.get(), &omp_max_nnz); + gko::kernels::reference::fbcsr::calculate_max_nnz_per_row( + this->ref, mtx_ref.get(), &ref_max_nnz); + + ASSERT_EQ(ref_max_nnz, omp_max_nnz); +} + + } // namespace diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index a7fbed91a78..feab61f3805 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -46,11 +46,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "accessor/block_col_major.hpp" -#include "accessor/range.hpp" #include "core/base/allocator.hpp" +#include "core/base/block_sizes.hpp" #include "core/base/iterator_factory.hpp" +#include "core/base/utils.hpp" #include "core/components/prefix_sum.hpp" #include "core/matrix/fbcsr_builder.hpp" +#include "core/synthesizer/implementation_selection.hpp" #include "reference/components/format_conversion.hpp" @@ -77,11 +79,8 @@ void spmv(const std::shared_ptr, const size_type nbnz = a->get_num_stored_blocks(); auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto vals = a->get_const_values(); const acc::range> avalues{ - std::array{nbnz, static_cast(bs), - static_cast(bs)}, - vals}; + to_std_array(nbnz, bs, bs), a->get_const_values()}; for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) { for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs; @@ -118,16 +117,13 @@ void advanced_spmv(const std::shared_ptr, const int bs = a->get_block_size(); const auto nvecs = static_cast(b->get_size()[1]); const IndexType nbrows = a->get_num_block_rows(); + const size_type nbnz = a->get_num_stored_blocks(); auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto vals = a->get_const_values(); auto valpha = alpha->at(0, 0); auto vbeta = beta->at(0, 0); const acc::range> avalues{ - std::array{a->get_num_stored_blocks(), - static_cast(bs), - static_cast(bs)}, - vals}; + to_std_array(nbnz, bs, bs), a->get_const_values()}; for (IndexType ibrow = 0; ibrow < nbrows; ++ibrow) { for (IndexType i = ibrow * bs * nvecs; i < (ibrow + 1) * bs * nvecs; @@ -414,8 +410,12 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX); +namespace { + + template -static void sort_by_column_index_impl( +void sort_by_column_index_impl( + syn::value_list, matrix::Fbcsr* const to_sort) { auto row_ptrs = to_sort->get_const_row_ptrs(); @@ -423,10 +423,10 @@ static void sort_by_column_index_impl( auto values = to_sort->get_values(); const auto nbrows = to_sort->get_num_block_rows(); constexpr int bs2 = mat_blk_sz * mat_blk_sz; - for (IndexType i = 0; i < nbrows; ++i) { - IndexType* const brow_col_idxs = col_idxs + row_ptrs[i]; - ValueType* const brow_vals = values + row_ptrs[i] * bs2; - const IndexType nbnz_brow = row_ptrs[i + 1] - row_ptrs[i]; + for (IndexType irow = 0; irow < nbrows; ++irow) { + IndexType* const brow_col_idxs = col_idxs + row_ptrs[irow]; + ValueType* const brow_vals = values + row_ptrs[irow] * bs2; + const IndexType nbnz_brow = row_ptrs[irow + 1] - row_ptrs[irow]; std::vector col_permute(nbnz_brow); std::iota(col_permute.begin(), col_permute.end(), 0); @@ -445,20 +445,22 @@ static void sort_by_column_index_impl( } } +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_sort_col_idx, + sort_by_column_index_impl); + + +} // namespace + + template void sort_by_column_index(const std::shared_ptr exec, matrix::Fbcsr* const to_sort) { const int bs = to_sort->get_block_size(); - if (bs == 2) { - sort_by_column_index_impl<2>(to_sort); - } else if (bs == 3) { - sort_by_column_index_impl<3>(to_sort); - } else if (bs == 4) { - sort_by_column_index_impl<4>(to_sort); - } else { - GKO_NOT_IMPLEMENTED; - } + select_sort_col_idx( + fixedblock::compiled_kernels(), + [bs](int compiled_block_size) { return bs == compiled_block_size; }, + syn::value_list(), syn::type_list<>(), to_sort); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp 
index 19330b476dc..8897959055a 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -54,6 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/fbcsr_kernels.hpp" #include "core/test/matrix/fbcsr_sample.hpp" #include "core/test/utils.hpp" +#include "core/test/utils/value_generator.hpp" namespace { @@ -139,18 +140,16 @@ TYPED_TEST_SUITE(Fbcsr, gko::test::ValueIndexTypes, PairTypenameNameGenerator); template -constexpr typename std::enable_if_t::value, T> -get_some_number() -{ - return static_cast(1.2); -} - -template -constexpr typename std::enable_if_t::value, T> -get_some_number() +std::unique_ptr> get_some_vectors( + std::shared_ptr exec, const size_t nrows, + const size_t nrhs) { using RT = gko::remove_complex; - return {static_cast(1.2), static_cast(3.4)}; + std::ranlux48 engine(39); + std::normal_distribution dist(0.0, 5.0); + std::uniform_int_distribution<> nnzdist(1, nrhs); + return gko::test::generate_random_matrix>( + nrows, nrhs, nnzdist, dist, engine, exec); } @@ -161,11 +160,7 @@ TYPED_TEST(Fbcsr, AppliesToDenseVector) using index_type = typename TestFixture::index_type; const index_type nrows = this->mtx2->get_size()[0]; const index_type ncols = this->mtx2->get_size()[1]; - auto x = Vec::create(this->exec, gko::dim<2>{(gko::size_type)ncols, 1}); - T* const xvals = x->get_values(); - for (index_type i = 0; i < ncols; i++) { - xvals[i] = std::sin(static_cast(static_cast((i + 1) ^ 2))); - } + auto x = get_some_vectors(this->exec, ncols, 1); auto y = Vec::create(this->exec, gko::dim<2>{(gko::size_type)nrows, 1}); auto yref = Vec::create(this->exec, gko::dim<2>{(gko::size_type)nrows, 1}); using Csr = typename TestFixture::Csr; @@ -190,13 +185,7 @@ TYPED_TEST(Fbcsr, AppliesToDenseMatrix) const gko::size_type nrows = this->mtx2->get_size()[0]; const gko::size_type ncols = this->mtx2->get_size()[1]; const gko::size_type nvecs = 3; - auto x = Vec::create(this->exec, gko::dim<2>{ncols, nvecs}); - for (index_type i = 0; i < ncols; i++) { - for (index_type j = 0; j < nvecs; j++) { - x->at(i, j) = (static_cast(3.0 * i) + get_some_number()) / - static_cast(j + 1.0); - } - } + auto x = get_some_vectors(this->exec, ncols, nvecs); auto y = Vec::create(this->exec, gko::dim<2>{nrows, nvecs}); auto yref = Vec::create(this->exec, gko::dim<2>{nrows, nvecs}); @@ -209,6 +198,28 @@ TYPED_TEST(Fbcsr, AppliesToDenseMatrix) } +TYPED_TEST(Fbcsr, AppliesToDenseComplexMatrix) +{ + using T = typename TestFixture::value_type; + using CT = typename gko::to_complex; + using CVec = gko::matrix::Dense; + using index_type = typename TestFixture::index_type; + const gko::size_type nrows = this->mtx2->get_size()[0]; + const gko::size_type ncols = this->mtx2->get_size()[1]; + const gko::size_type nvecs = 3; + auto x = get_some_vectors(this->exec, ncols, nvecs); + auto y = CVec::create(this->exec, gko::dim<2>{nrows, nvecs}); + auto yref = CVec::create(this->exec, gko::dim<2>{nrows, nvecs}); + + this->mtx2->apply(x.get(), y.get()); + this->ref2csrmtx->apply(x.get(), yref.get()); + + const double tolerance = + std::numeric_limits>::epsilon(); + GKO_ASSERT_MTX_NEAR(y, yref, tolerance); +} + + TYPED_TEST(Fbcsr, AppliesLinearCombinationToDenseVector) { using Vec = typename TestFixture::Vec; @@ -220,15 +231,8 @@ TYPED_TEST(Fbcsr, AppliesLinearCombinationToDenseVector) auto beta = gko::initialize({betav}, this->exec); const gko::size_type nrows = this->mtx2->get_size()[0]; const gko::size_type ncols = this->mtx2->get_size()[1]; - 
auto x = Vec::create(this->exec, gko::dim<2>{ncols, 1});
-    auto y = Vec::create(this->exec, gko::dim<2>{nrows, 1});
-    for (index_type i = 0; i < ncols; i++) {
-        x->at(i, 0) = (i + 1.0) * (i + 1.0);
-    }
-    for (index_type i = 0; i < nrows; i++) {
-        y->at(i, 0) = static_cast<T>(std::sin(2 * 3.14 * (i + 0.1) / nrows)) +
-                      get_some_number<T>();
-    }
+    auto x = get_some_vectors<T>(this->exec, ncols, 1);
+    auto y = get_some_vectors<T>(this->exec, nrows, 1);
     auto yref = y->clone();

     this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());
@@ -252,21 +256,8 @@ TYPED_TEST(Fbcsr, AppliesLinearCombinationToDenseMatrix)
     const gko::size_type nrows = this->mtx2->get_size()[0];
     const gko::size_type ncols = this->mtx2->get_size()[1];
     const gko::size_type nvecs = 3;
-    auto x = Vec::create(this->exec, gko::dim<2>{ncols, nvecs});
-    auto y = Vec::create(this->exec, gko::dim<2>{nrows, nvecs});
-    for (index_type i = 0; i < ncols; i++) {
-        for (index_type j = 0; j < nvecs; j++) {
-            x->at(i, j) =
-                std::log(static_cast<T>(0.1 + static_cast<T>((i + 1) ^ 2)));
-        }
-    }
-    for (index_type i = 0; i < nrows; i++) {
-        for (index_type j = 0; j < nvecs; j++) {
-            y->at(i, j) =
-                static_cast<T>(std::sin(2 * 3.14 * (i + j + 0.1) / nrows)) +
-                get_some_number<T>();
-        }
-    }
+    auto x = get_some_vectors<T>(this->exec, ncols, nvecs);
+    auto y = get_some_vectors<T>(this->exec, nrows, nvecs);
     auto yref = y->clone();

     this->mtx2->apply(alpha.get(), x.get(), beta.get(), y.get());