diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp
index 3979caa905c..8c7f45e5174 100644
--- a/dpcpp/base/helper.hpp
+++ b/dpcpp/base/helper.hpp
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <CL/sycl.hpp>
 
 
+#include "core/base/types.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
@@ -142,7 +143,7 @@ bool validate(sycl::queue *queue, unsigned workgroup_size,
 
 
 template <typename IterArr, typename Validate>
-ConfigSetType get_first_cfg(IterArr &arr, Validate verify)
+std::uint32_t get_first_cfg(IterArr &arr, Validate verify)
 {
     for (auto &cfg : arr) {
         if (verify(cfg)) {
diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp
index 330fa297e58..07cdb5b38aa 100644
--- a/dpcpp/components/prefix_sum.dp.cpp
+++ b/dpcpp/components/prefix_sum.dp.cpp
@@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <CL/sycl.hpp>
 
 
+#include "core/base/types.hpp"
 #include "dpcpp/base/helper.hpp"
 #include "dpcpp/components/prefix_sum.dp.hpp"
@@ -52,7 +53,7 @@ namespace components {
 using BlockCfg = ConfigSet<11>;
 
 constexpr auto block_cfg_list =
-    ::gko::syn::value_list<ConfigSetType, BlockCfg::encode(512),
-                           BlockCfg::encode(256), BlockCfg::encode(128)>();
+    ::gko::syn::value_list<std::uint32_t, BlockCfg::encode(512),
+                           BlockCfg::encode(256), BlockCfg::encode(128)>();
 
 GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum)
@@ -73,8 +74,8 @@ void prefix_sum(std::shared_ptr<const DpcppExecutor> exec, IndexType *counts,
     if (num_entries > 0) {
         auto queue = exec->get_queue();
         constexpr auto block_cfg_array = as_array(block_cfg_list);
-        const ConfigSetType cfg =
-            get_first_cfg(block_cfg_array, [&queue](ConfigSetType cfg) {
+        const std::uint32_t cfg =
+            get_first_cfg(block_cfg_array, [&queue](std::uint32_t cfg) {
                 return validate(queue, BlockCfg::decode<0>(cfg), 16);
             });
         const auto wg_size = BlockCfg::decode<0>(cfg);
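
Note on the pattern this diff keeps repeating: each kernel configuration is a (workgroup size, subgroup size) pair bit-packed into one integer via ConfigSet, and get_first_cfg returns the first packed candidate that validate() accepts for the device behind the queue. The change itself is purely a spelling change of that integer type, from the project-local alias ConfigSetType to plain std::uint32_t. A minimal standalone sketch of the idea, assuming an 11/7-bit layout as in ConfigSet<11, 7> (the names encode/decode_wg/decode_sg are illustrative, not Ginkgo's API):

    #include <array>
    #include <cstdint>
    #include <iostream>

    // Pack (workgroup size, subgroup size) into one std::uint32_t:
    // bits [7, 18) hold the workgroup size, bits [0, 7) the subgroup size.
    constexpr std::uint32_t encode(std::uint32_t wg, std::uint32_t sg)
    {
        return (wg << 7) | sg;
    }
    constexpr std::uint32_t decode_wg(std::uint32_t cfg) { return cfg >> 7; }
    constexpr std::uint32_t decode_sg(std::uint32_t cfg) { return cfg & 0x7f; }

    // get_first_cfg: return the first candidate the device accepts.
    template <typename Array, typename Validate>
    std::uint32_t get_first_cfg(const Array &arr, Validate verify)
    {
        for (auto cfg : arr) {
            if (verify(cfg)) {
                return cfg;
            }
        }
        return 0;  // no supported configuration
    }

    int main()
    {
        constexpr std::array<std::uint32_t, 3> cfgs{
            encode(512, 32), encode(256, 32), encode(256, 8)};
        // Stand-in for validate(queue, wg, sg): pretend the device only
        // supports workgroups of at most 256 work-items.
        auto cfg = get_first_cfg(
            cfgs, [](std::uint32_t c) { return decode_wg(c) <= 256; });
        std::cout << decode_wg(cfg) << "/" << decode_sg(cfg) << "\n";  // 256/32
    }
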
diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp
index fd9ff2ac263..f76f85135eb 100644
--- a/dpcpp/components/prefix_sum.dp.hpp
+++ b/dpcpp/components/prefix_sum.dp.hpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <CL/sycl.hpp>
 
 
+#include "core/base/types.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/components/cooperative_groups.dp.hpp"
@@ -125,7 +126,7 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element,
  * @note To calculate the prefix sum over an array of size bigger than
  *       `block_size`, `finalize_prefix_sum` has to be used as well.
  */
-template <ConfigSetType block_size, typename ValueType>
+template <std::uint32_t block_size, typename ValueType>
 void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
                       ValueType *__restrict__ block_sum,
                       sycl::nd_item<3> item_ct1,
@@ -178,7 +179,7 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
     }
 }
 
-template <ConfigSetType block_size, typename ValueType>
+template <std::uint32_t block_size, typename ValueType>
 void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                       sycl::queue *stream, size_type num_elements,
                       ValueType *elements, ValueType *block_sum)
@@ -214,7 +215,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory,
 *
 * @note To calculate a prefix sum, first `start_prefix_sum` has to be called.
 */
-template <ConfigSetType block_size, typename ValueType>
+template <std::uint32_t block_size, typename ValueType>
 void finalize_prefix_sum(size_type num_elements,
                          ValueType *__restrict__ elements,
                          const ValueType *__restrict__ block_sum,
@@ -231,7 +232,7 @@ void finalize_prefix_sum(size_type num_elements,
     }
 }
 
-template <ConfigSetType block_size, typename ValueType>
+template <std::uint32_t block_size, typename ValueType>
 void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                          sycl::queue *stream, size_type num_elements,
                          ValueType *elements, const ValueType *block_sum)
diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
index d3e925ee4ba..9c2387a7113 100644
--- a/dpcpp/components/reduction.dp.hpp
+++ b/dpcpp/components/reduction.dp.hpp
@@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "core/base/types.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
@@ -63,7 +64,7 @@ namespace dpcpp {
 constexpr int default_block_size = 256;
 using KCFG_1D = ConfigSet<11, 7>;
 constexpr auto kcfg_1d_list =
-    syn::value_list<ConfigSetType, KCFG_1D::encode(512, 64),
-                    KCFG_1D::encode(512, 32), KCFG_1D::encode(512, 16),
-                    KCFG_1D::encode(256, 32), KCFG_1D::encode(256, 16),
-                    KCFG_1D::encode(256, 8)>();
+    syn::value_list<std::uint32_t, KCFG_1D::encode(512, 64),
+                    KCFG_1D::encode(512, 32), KCFG_1D::encode(512, 16),
+                    KCFG_1D::encode(256, 32), KCFG_1D::encode(256, 16),
+                    KCFG_1D::encode(256, 8)>();
@@ -201,7 +202,7 @@ void reduce_array(size_type size, const ValueType *__restrict__ source,
 * `source` of any size. Has to be called a second time on `result` to reduce
 * an array larger than `block_size`.
 */
-template <ConfigSetType cfg, typename ValueType>
+template <std::uint32_t cfg, typename ValueType>
 void reduce_add_array(
     size_type size, const ValueType *__restrict__ source,
     ValueType *__restrict__ result, sycl::nd_item<3> item_ct1,
@@ -216,7 +217,7 @@ void reduce_add_array(
     }
 }
 
-template <ConfigSetType cfg, typename ValueType>
+template <std::uint32_t cfg, typename ValueType>
 void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                       sycl::queue *stream, size_type size,
                       const ValueType *source, ValueType *result)
@@ -263,8 +264,8 @@ ValueType reduce_add_array(std::shared_ptr<const DpcppExecutor> exec,
     ValueType answer = zero<ValueType>();
     auto queue = exec->get_queue();
     constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-    const ConfigSetType cfg =
-        get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
            return validate(queue, KCFG_1D::decode<0>(cfg),
                            KCFG_1D::decode<1>(cfg));
        });
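
For context on the start_prefix_sum/finalize_prefix_sum pair whose template parameters change above: as the @note comments describe, the first pass computes a prefix sum within each block and records each block's total, and the second pass adds the accumulated totals of all preceding blocks back onto every element. A plain C++ reference of that two-pass scheme (an illustrative sketch, not the SYCL kernels):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    void start_prefix_sum(std::vector<int> &elements,
                          std::vector<int> &block_sum, std::size_t block_size)
    {
        for (std::size_t b = 0; b * block_size < elements.size(); ++b) {
            int running = 0;
            for (std::size_t i = b * block_size;
                 i < std::min((b + 1) * block_size, elements.size()); ++i) {
                int v = elements[i];
                elements[i] = running;  // exclusive scan within the block
                running += v;
            }
            block_sum[b] = running;  // block total, used by the finalize pass
        }
    }

    void finalize_prefix_sum(std::vector<int> &elements,
                             const std::vector<int> &block_sum,
                             std::size_t block_size)
    {
        int offset = 0;
        for (std::size_t b = 0; b * block_size < elements.size(); ++b) {
            for (std::size_t i = b * block_size;
                 i < std::min((b + 1) * block_size, elements.size()); ++i) {
                elements[i] += offset;  // shift block by preceding totals
            }
            offset += block_sum[b];
        }
    }

    int main()
    {
        std::vector<int> v{1, 2, 3, 4, 5};
        std::vector<int> block_sum(3);
        start_prefix_sum(v, block_sum, 2);     // v: 0 1 | 0 3 | 0
        finalize_prefix_sum(v, block_sum, 2);  // v: 0 1 3 6 10
    }
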
#include "core/matrix/dense_kernels.hpp" -#include - - #include #include @@ -70,12 +67,14 @@ namespace dpcpp { */ namespace dense { + using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list = - syn::value_list(); +constexpr auto subgroup_list = syn::value_list(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); constexpr auto default_block_size = 256; @@ -119,6 +118,7 @@ void scale(size_type num_rows, size_type num_cols, size_type num_alpha_cols, GKO_ENABLE_DEFAULT_HOST(scale, scale) + template void add_scaled(size_type num_rows, size_type num_cols, size_type num_alpha_cols, const ValueType *__restrict__ alpha, @@ -157,7 +157,7 @@ void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha, GKO_ENABLE_DEFAULT_HOST(add_scaled_diag, add_scaled_diag) -template void compute_partial_reduce( size_type num_rows, OutType *__restrict__ work, CallableGetValue get_value, @@ -191,7 +191,7 @@ void compute_partial_reduce( } -template void finalize_reduce_computation( size_type size, const ValueType *work, ValueType *result, @@ -220,7 +220,7 @@ void finalize_reduce_computation( } -template +template void compute_partial_dot( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, const ValueType *__restrict__ y, size_type stride_y, @@ -230,13 +230,13 @@ void compute_partial_dot( compute_partial_reduce( num_rows, work, [x, stride_x, y, stride_y](size_type i) { - return x[i * stride_x] * conj(y[i * stride_y]); + return x[i * stride_x] * y[i * stride_y]; }, [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, tmp_work); } -template +template void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, @@ -244,7 +244,6 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *work) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - std::cout << "partial " << cfg << std::endl; stream->submit([&](sycl::handler &cgh) { sycl::accessor, 0, sycl::access::mode::read_write, @@ -267,8 +266,54 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, kcfg_1d_list) -template -void finalize_dot_computation( +template +void compute_partial_conj_dot( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + const ValueType *__restrict__ y, size_type stride_y, + ValueType *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + compute_partial_reduce( + num_rows, work, + [x, stride_x, y, stride_y](size_type i) { + return conj(x[i * stride_x]) * y[i * stride_y]; + }, + [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, + tmp_work); +} + +template +void compute_partial_conj_dot(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, const ValueType *x, + size_type stride_x, const ValueType *y, + size_type stride_y, ValueType *work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compute_partial_conj_dot( + num_rows, x, stride_x, y, stride_y, work, item_ct1, + (UninitializedArray *) + tmp_work_acc_ct1.get_pointer()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_conj_dot, + compute_partial_conj_dot) 
+GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_conj_dot_call,
+                               compute_partial_conj_dot, kcfg_1d_list)
+
+
+template <std::uint32_t cfg, typename ValueType>
+void finalize_sum_reduce_computation(
     size_type size, const ValueType *work, ValueType *result,
     sycl::nd_item<3> item_ct1,
     UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
@@ -279,14 +324,13 @@ void finalize_dot_computation(
         [](const ValueType &x) { return x; }, item_ct1, tmp_work);
 }
 
-template <ConfigSetType cfg, typename ValueType>
-void finalize_dot_computation(dim3 grid, dim3 block,
-                              size_t dynamic_shared_memory, sycl::queue *stream,
-                              size_type size, const ValueType *work,
-                              ValueType *result)
+template <std::uint32_t cfg, typename ValueType>
+void finalize_sum_reduce_computation(dim3 grid, dim3 block,
+                                     size_t dynamic_shared_memory,
+                                     sycl::queue *stream, size_type size,
+                                     const ValueType *work, ValueType *result)
 {
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
-    std::cout << "finalize " << cfg << std::endl;
     stream->submit([&](sycl::handler &cgh) {
         sycl::accessor<UninitializedArray<ValueType, wg_size>, 0,
                        sycl::access::mode::read_write,
@@ -295,7 +339,7 @@ void finalize_dot_computation(dim3 grid, dim3 block,
 
         cgh.parallel_for(sycl_nd_range(grid, block),
                          [=](sycl::nd_item<3> item_ct1) {
-                             finalize_dot_computation<cfg>(
+                             finalize_sum_reduce_computation<cfg>(
                                  size, work, result, item_ct1,
                                  (UninitializedArray<ValueType, wg_size> *)
                                      tmp_work_acc_ct1.get_pointer());
@@ -303,13 +347,13 @@ void finalize_dot_computation(dim3 grid, dim3 block,
     });
 }
 
-GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_dot_computation,
-                                           finalize_dot_computation)
-GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_dot_computation_call,
-                               finalize_dot_computation, kcfg_1d_list)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sum_reduce_computation,
+                                           finalize_sum_reduce_computation)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sum_reduce_computation_call,
+                               finalize_sum_reduce_computation, kcfg_1d_list)
 
 
-template <ConfigSetType cfg, typename ValueType>
+template <std::uint32_t cfg, typename ValueType>
 void compute_partial_norm2(
     size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
     remove_complex<ValueType> *__restrict__ work, sycl::nd_item<3> item_ct1,
@@ -324,7 +368,7 @@ void compute_partial_norm2(
     tmp_work);
 }
 
-template <ConfigSetType cfg, typename ValueType>
+template <std::uint32_t cfg, typename ValueType>
 void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                            sycl::queue *stream, size_type num_rows,
                            const ValueType *x, size_type stride_x,
@@ -353,8 +397,8 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call,
                                compute_partial_norm2, kcfg_1d_list)
 
 
-template <ConfigSetType cfg, typename ValueType>
-void finalize_norm2_computation(
+template <std::uint32_t cfg, typename ValueType>
+void finalize_sqrt_reduce_computation(
     size_type size, const ValueType *work, ValueType *result,
     sycl::nd_item<3> item_ct1,
     UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
@@ -362,14 +406,14 @@ void finalize_norm2_computation(
     finalize_reduce_computation<cfg>(
         size, work, result,
         [](const ValueType &x, const ValueType &y) { return x + y; },
-        [](const ValueType &x) { return sqrt(x); }, item_ct1, tmp_work);
+        [](const ValueType &x) { return std::sqrt(x); }, item_ct1, tmp_work);
 }
 
-template <ConfigSetType cfg, typename ValueType>
-void finalize_norm2_computation(dim3 grid, dim3 block,
-                                size_t dynamic_shared_memory,
-                                sycl::queue *stream, size_type size,
-                                const ValueType *work, ValueType *result)
+template <std::uint32_t cfg, typename ValueType>
+void finalize_sqrt_reduce_computation(dim3 grid, dim3 block,
+                                      size_t dynamic_shared_memory,
+                                      sycl::queue *stream, size_type size,
+                                      const ValueType *work, ValueType *result)
 {
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
     stream->submit([&](sycl::handler &cgh) {
@@ -381,7 +425,7 @@ void finalize_norm2_computation(dim3 grid, dim3 block,
 
         cgh.parallel_for(sycl_nd_range(grid, block),
                          [=](sycl::nd_item<3> item_ct1) {
-                             finalize_norm2_computation<cfg>(
+                             finalize_sqrt_reduce_computation<cfg>(
                                  size, work, result, item_ct1,
                                 (UninitializedArray<ValueType, wg_size> *)
                                     tmp_work_acc_ct1.get_pointer());
@@ -389,13 +433,13 @@ void finalize_norm2_computation(dim3 grid, dim3 block,
     });
 }
 
-GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_norm2_computation,
-                                           finalize_norm2_computation)
-GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_norm2_computation_call,
-                               finalize_norm2_computation, kcfg_1d_list)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sqrt_reduce_computation,
+                                           finalize_sqrt_reduce_computation)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sqrt_reduce_computation_call,
+                               finalize_sqrt_reduce_computation, kcfg_1d_list)
 
 
-template <typename ValueType, typename IndexType>
+template <std::uint32_t cfg, typename ValueType, typename IndexType>
 void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride,
                  const size_type *__restrict__ row_ptrs,
                  const ValueType *__restrict__ source,
@@ -418,10 +462,12 @@ void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride,
     }
 }
 
-GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo)
+GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_coo, fill_in_coo)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_coo, fill_in_coo)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_coo_call, fill_in_coo, kcfg_1d_list)
 
 
-template <ConfigSetType cfg, typename ValueType, typename IndexType>
+template <std::uint32_t cfg, typename ValueType, typename IndexType>
 void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride,
                        const ValueType *__restrict__ work,
                        IndexType *__restrict__ result,
@@ -451,7 +497,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row,
                                kcfg_1d_list)
 
 
-template <typename ValueType, typename IndexType>
+template <std::uint32_t cfg, typename ValueType, typename IndexType>
 void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride,
                  const ValueType *__restrict__ source,
                  IndexType *__restrict__ row_ptrs,
@@ -472,10 +518,12 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride,
     }
 }
 
-GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr)
+GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_csr, fill_in_csr)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_csr, fill_in_csr)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_csr_call, fill_in_csr, kcfg_1d_list)
 
 
-template <typename ValueType, typename IndexType>
+template <std::uint32_t cfg, typename ValueType, typename IndexType>
 void fill_in_ell(size_type num_rows, size_type num_cols,
                  size_type source_stride, const ValueType *__restrict__ source,
                  size_type max_nnz_per_row, size_type result_stride,
@@ -505,10 +553,12 @@ void fill_in_ell(size_type num_rows, size_type num_cols,
     }
 }
 
-GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell)
+GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_ell, fill_in_ell)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_ell, fill_in_ell)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_ell_call, fill_in_ell, kcfg_1d_list)
 
 
-template <ConfigSetType cfg>
+template <std::uint32_t cfg>
 void calculate_slice_lengths(size_type num_rows, size_type slice_size,
                              int slice_num, size_type stride_factor,
                              const size_type *__restrict__ nnz_per_row,
                              size_type *__restrict__ slice_lengths,
                              size_type *__restrict__ slice_sets,
                              sycl::nd_item<3> item_ct1)
 {
-    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
+    constexpr auto sg_size = cfg;
     const auto sliceid = item_ct1.get_group(2);
     const auto tid_in_warp = item_ct1.get_local_id(2);
@@ -548,10 +598,10 @@ GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths)
 GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths,
                                            calculate_slice_lengths)
 GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call,
-                               calculate_slice_lengths, kcfg_1d_list)
+                               calculate_slice_lengths, subgroup_list)
 
 
-template <typename ValueType, typename IndexType>
+template <std::uint32_t cfg, typename ValueType, typename IndexType>
 void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size,
                    size_type stride, const ValueType *__restrict__ source,
                    size_type *__restrict__ slice_lengths,
@@ -584,9 +634,12 @@ void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size,
     }
 }
 
-GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp)
+GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_sellp, fill_in_sellp)
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_sellp, fill_in_sellp)
+GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_sellp_call, fill_in_sellp, kcfg_1d_list)
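
The macro trio (GKO_ENABLE_DEFAULT_HOST_CONFIG / GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION / GKO_ENABLE_DEFAULT_CONFIG_CALL) that replaces GKO_ENABLE_DEFAULT_HOST for fill_in_coo/csr/ell/sellp turns the runtime-selected encoded config into a compile-time template argument of the kernel. Roughly, the generated dispatch amounts to a recursive compile-time search over the candidate list; a hand-written approximation of the mechanism (a sketch, not the real macro expansion):

    #include <cstdint>
    #include <stdexcept>

    template <std::uint32_t cfg>
    void kernel() { /* cfg is a compile-time constant here */ }

    // End of list: nothing matched the runtime value.
    inline void kernel_call(std::uint32_t) { throw std::runtime_error("no cfg"); }

    // Peel one candidate off the compile-time list per recursion step.
    template <std::uint32_t First, std::uint32_t... Rest>
    void kernel_call(std::uint32_t cfg)
    {
        if (cfg == First) {
            kernel<First>();  // instantiate for the matching config
        } else {
            kernel_call<Rest...>(cfg);
        }
    }

    int main()
    {
        // runtime value picked by get_first_cfg, dispatched to a
        // compile-time instantiation:
        kernel_call<512, 256, 128>(256);
    }
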
 
-
-template <ConfigSetType cfg>
+
+template <std::uint32_t cfg>
 void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row,
                     size_type *__restrict__ result, sycl::nd_item<3> item_ct1,
                     uint8_t *dpct_local)
@@ -603,7 +656,7 @@ void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row,
     }
 }
 
-template <ConfigSetType cfg>
+template <std::uint32_t cfg>
 void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                     sycl::queue *stream, size_type size,
                     const size_type *nnz_per_row, size_type *result)
@@ -626,7 +679,8 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz);
 GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz,
                                kcfg_1d_list)
 
-template <ConfigSetType cfg>
+
+template <std::uint32_t cfg>
 void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size,
                               size_type stride_factor,
                               const size_type *__restrict__ nnz_per_row,
@@ -665,7 +719,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call,
                                reduce_max_nnz_per_slice, kcfg_1d_list)
 
 
-template <ConfigSetType cfg>
+template <std::uint32_t cfg>
 void reduce_total_cols(size_type num_slices,
                        const size_type *__restrict__ max_nnz_per_slice,
                        size_type *__restrict__ result,
@@ -682,7 +736,7 @@ void reduce_total_cols(size_type num_slices,
     }
 }
 
-template <ConfigSetType cfg>
+template <std::uint32_t cfg>
 void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                        sycl::queue *stream, size_type num_slices,
                        const size_type *max_nnz_per_slice, size_type *result)
@@ -979,10 +1033,10 @@ void scale(std::shared_ptr<const DpcppExecutor> exec,
            const matrix::Dense<ValueType> *alpha, matrix::Dense<ValueType> *x)
 {
     if (0) {
-        // oneapi::mkl::blas::row_major::scal(
-        //     *exec->get_queue(), x->get_size()[0] * x->get_size()[1],
-        //     exec->copy_val_to_host(alpha->get_const_values()),
-        //     x->get_values(), x->get_stride());
+        oneapi::mkl::blas::row_major::scal(
+            *exec->get_queue(), x->get_size()[0] * x->get_size()[1],
+            exec->copy_val_to_host(alpha->get_const_values()), x->get_values(),
+            x->get_stride());
     } else {
         // TODO: tune this parameter
         constexpr auto block_size = default_block_size;
@@ -1005,11 +1059,11 @@ void add_scaled(std::shared_ptr<const DpcppExecutor> exec,
                 const matrix::Dense<ValueType> *x, matrix::Dense<ValueType> *y)
 {
     if (0) {
-        // oneapi::mkl::blas::row_major::axpy(
-        //     *exec->get_queue(), x->get_size()[0],
-        //     exec->copy_val_to_host(alpha->get_const_values()),
-        //     x->get_const_values(), x->get_stride(), y->get_values(),
-        //     y->get_stride());
+        oneapi::mkl::blas::row_major::axpy(
+            *exec->get_queue(), x->get_size()[0],
+            exec->copy_val_to_host(alpha->get_const_values()),
+            x->get_const_values(), x->get_stride(), y->get_values(),
+            y->get_stride());
     } else {
         // TODO: tune this parameter
         constexpr auto block_size = default_block_size;
@@ -1048,10 +1102,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
 namespace {
 
 
-#define GKO_BIND_DOT(ValueType, Func)                                       \
-    void dot(::cl::sycl::queue &exec_queue, std::int64_t n,                 \
-             const ValueType *x, std::int64_t incx, const ValueType *y,     \
-             std::int64_t incy, ValueType *result)                          \
+#define GKO_BIND_DOT(ValueType, Name, Func)                                 \
+    void Name(::cl::sycl::queue &exec_queue, std::int64_t n,                \
+              const ValueType *x, std::int64_t incx, const ValueType *y,    \
+              std::int64_t incy, ValueType *result)                         \
     {                                                                       \
         Func(exec_queue, n, x, incx, y, incy, result);                      \
     }                                                                       \
@@ -1059,10 +1113,15 @@ namespace {
                   "This assert is used to counter the false positive extra " \
extra " \ "semi-colon warnings") -GKO_BIND_DOT(float, oneapi::mkl::blas::row_major::dot); -GKO_BIND_DOT(double, oneapi::mkl::blas::row_major::dot); -GKO_BIND_DOT(std::complex, oneapi::mkl::blas::row_major::dotc); -GKO_BIND_DOT(std::complex, oneapi::mkl::blas::row_major::dotc); +GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +GKO_BIND_DOT(float, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, conj_dot, oneapi::mkl::blas::row_major::dotc); +GKO_BIND_DOT(std::complex, conj_dot, + oneapi::mkl::blas::row_major::dotc); } // namespace @@ -1089,15 +1148,13 @@ void compute_dot(std::shared_ptr exec, constexpr auto work_per_thread = 32; auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); const auto wg_size = KCFG_1D::decode<0>(cfg); const auto sg_size = KCFG_1D::decode<1>(cfg); - std::cout << "dot " << cfg << " " << wg_size << " " << sg_size - << std::endl; const auto work_per_block = work_per_thread * wg_size; const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); const dim3 block_dim{sg_size, 1, wg_size / sg_size}; @@ -1108,7 +1165,7 @@ void compute_dot(std::shared_ptr exec, cfg, grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_dot_computation_call( + kernel::finalize_sum_reduce_computation_call( cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); } @@ -1122,7 +1179,47 @@ template void compute_conj_dot(std::shared_ptr exec, const matrix::Dense *x, const matrix::Dense *y, - matrix::Dense *result) GKO_NOT_IMPLEMENTED; + matrix::Dense *result) +{ + if (0) { + // TODO: write a custom kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + conj_dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); + } + } else { + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_conj_dot_call( + cfg, grid_dim, block_dim, 0, 
@@ -1089,15 +1148,13 @@ void compute_dot(std::shared_ptr<const DpcppExecutor> exec,
         constexpr auto work_per_thread = 32;
         auto queue = exec->get_queue();
         constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-        const ConfigSetType cfg =
-            get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+        const std::uint32_t cfg =
+            get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
                 return validate(queue, KCFG_1D::decode<0>(cfg),
                                 KCFG_1D::decode<1>(cfg));
             });
         const auto wg_size = KCFG_1D::decode<0>(cfg);
         const auto sg_size = KCFG_1D::decode<1>(cfg);
-        std::cout << "dot " << cfg << " " << wg_size << " " << sg_size
-                  << std::endl;
         const auto work_per_block = work_per_thread * wg_size;
         const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
         const dim3 block_dim{sg_size, 1, wg_size / sg_size};
@@ -1108,7 +1165,7 @@ void compute_dot(std::shared_ptr<const DpcppExecutor> exec,
                 cfg, grid_dim, block_dim, 0, exec->get_queue(),
                 x->get_size()[0], x->get_const_values() + col, x->get_stride(),
                 y->get_const_values() + col, y->get_stride(), work.get_data());
-            kernel::finalize_dot_computation_call(
+            kernel::finalize_sum_reduce_computation_call(
                 cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x,
                 work.get_const_data(), result->get_values() + col);
         }
@@ -1122,7 +1179,47 @@ template <typename ValueType>
 void compute_conj_dot(std::shared_ptr<const DpcppExecutor> exec,
                       const matrix::Dense<ValueType> *x,
                       const matrix::Dense<ValueType> *y,
-                      matrix::Dense<ValueType> *result) GKO_NOT_IMPLEMENTED;
+                      matrix::Dense<ValueType> *result)
+{
+    if (0) {
+        // TODO: write a custom kernel which does this more efficiently
+        for (size_type col = 0; col < x->get_size()[1]; ++col) {
+            conj_dot(*exec->get_queue(), x->get_size()[0],
+                     x->get_const_values() + col, x->get_stride(),
+                     y->get_const_values() + col, y->get_stride(),
+                     result->get_values() + col);
+        }
+    } else {
+        // TODO: these are tuning parameters obtained experimentally, once
+        // we decide how to handle this uniformly, they should be modified
+        // appropriately
+        constexpr auto work_per_thread = 32;
+        auto queue = exec->get_queue();
+        constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
+        const std::uint32_t cfg =
+            get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
+                return validate(queue, KCFG_1D::decode<0>(cfg),
+                                KCFG_1D::decode<1>(cfg));
+            });
+        const auto wg_size = KCFG_1D::decode<0>(cfg);
+        const auto sg_size = KCFG_1D::decode<1>(cfg);
+
+        const auto work_per_block = work_per_thread * wg_size;
+        const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
+        const dim3 block_dim{sg_size, 1, wg_size / sg_size};
+        Array<ValueType> work(exec, grid_dim.x);
+        // TODO: write a kernel which does this more efficiently
+        for (size_type col = 0; col < x->get_size()[1]; ++col) {
+            kernel::compute_partial_conj_dot_call(
+                cfg, grid_dim, block_dim, 0, exec->get_queue(),
+                x->get_size()[0], x->get_const_values() + col, x->get_stride(),
+                y->get_const_values() + col, y->get_stride(), work.get_data());
+            kernel::finalize_sum_reduce_computation_call(
+                cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x,
+                work.get_const_data(), result->get_values() + col);
+        }
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
@@ -1147,8 +1244,8 @@ void compute_norm2(std::shared_ptr<const DpcppExecutor> exec,
         constexpr auto work_per_thread = 32;
         auto queue = exec->get_queue();
         constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-        const ConfigSetType cfg =
-            get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+        const std::uint32_t cfg =
+            get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
                 return validate(queue, KCFG_1D::decode<0>(cfg),
                                 KCFG_1D::decode<1>(cfg));
             });
@@ -1165,7 +1262,7 @@ void compute_norm2(std::shared_ptr<const DpcppExecutor> exec,
                 cfg, grid_dim, block_dim, 0, exec->get_queue(),
                 x->get_size()[0], x->get_const_values() + col, x->get_stride(),
                 work.get_data());
-            kernel::finalize_norm2_computation_call(
+            kernel::finalize_sqrt_reduce_computation_call(
                 cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x,
                 work.get_const_data(), result->get_values() + col);
         }
@@ -1194,12 +1291,21 @@ void convert_to_coo(std::shared_ptr<const DpcppExecutor> exec,
 
     components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows);
 
-    size_type grid_dim = ceildiv(num_rows, default_block_size);
+    auto queue = exec->get_queue();
+    constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
+            return validate(queue, KCFG_1D::decode<0>(cfg),
+                            KCFG_1D::decode<1>(cfg));
+        });
+    const auto wg_size = KCFG_1D::decode<0>(cfg);
+    const auto sg_size = KCFG_1D::decode<1>(cfg);
+    size_type grid_dim = ceildiv(num_rows, wg_size);
 
-    kernel::fill_in_coo(grid_dim, default_block_size, 0, exec->get_queue(),
-                        num_rows, num_cols, stride,
-                        nnz_prefix_sum.get_const_data(),
-                        source->get_const_values(), row_idxs, col_idxs, values);
+    kernel::fill_in_coo_call(
+        cfg, grid_dim, wg_size, 0, exec->get_queue(), num_rows, num_cols,
+        stride, nnz_prefix_sum.get_const_data(), source->get_const_values(),
+        row_idxs, col_idxs, values);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -1213,8 +1319,8 @@ void convert_to_csr(std::shared_ptr<const DpcppExecutor> exec,
 {
     auto queue = exec->get_queue();
     constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-    const ConfigSetType cfg =
-        get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
            return validate(queue, KCFG_1D::decode<0>(cfg),
                            KCFG_1D::decode<1>(cfg));
         });
@@ -1241,9 +1347,10 @@ void convert_to_csr(std::shared_ptr<const DpcppExecutor> exec,
 
     size_type grid_dim = ceildiv(num_rows, wg_size);
 
-    kernel::fill_in_csr(grid_dim, wg_size, 0, exec->get_queue(), num_rows,
-                        num_cols, stride, source->get_const_values(), row_ptrs,
-                        col_idxs, values);
+    kernel::fill_in_csr_call(cfg, grid_dim, default_block_size, 0,
+                             exec->get_queue(), num_rows, num_cols, stride,
+                             source->get_const_values(), row_ptrs, col_idxs,
+                             values);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -1265,11 +1372,20 @@ void convert_to_ell(std::shared_ptr<const DpcppExecutor> exec,
     auto source_stride = source->get_stride();
     auto result_stride = result->get_stride();
 
-    auto grid_dim = ceildiv(result_stride, default_block_size);
-    kernel::fill_in_ell(grid_dim, default_block_size, 0, exec->get_queue(),
-                        num_rows, num_cols, source_stride,
-                        source->get_const_values(),
-                        max_nnz_per_row, result_stride, col_ptrs, values);
+    auto queue = exec->get_queue();
+    constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
+            return validate(queue, KCFG_1D::decode<0>(cfg),
+                            KCFG_1D::decode<1>(cfg));
+        });
+    const auto wg_size = KCFG_1D::decode<0>(cfg);
+    const auto sg_size = KCFG_1D::decode<1>(cfg);
+    auto grid_dim = ceildiv(result_stride, wg_size);
+    kernel::fill_in_ell_call(cfg, grid_dim, wg_size, 0, exec->get_queue(),
+                             num_rows, num_cols, source_stride,
+                             source->get_const_values(), max_nnz_per_row,
+                             result_stride, col_ptrs, values);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -1293,8 +1409,8 @@ void convert_to_sellp(std::shared_ptr<const DpcppExecutor> exec,
 {
     auto queue = exec->get_queue();
     constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-    const ConfigSetType cfg =
-        get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
            return validate(queue, KCFG_1D::decode<0>(cfg),
                            KCFG_1D::decode<1>(cfg));
        });
@@ -1319,24 +1435,25 @@ void convert_to_sellp(std::shared_ptr<const DpcppExecutor> exec,
     const int slice_num = ceildiv(num_rows, slice_size);
 
     auto nnz_per_row = Array<size_type>(exec, num_rows);
-
     calculate_nonzeros_per_row(exec, source, &nnz_per_row);
 
     auto grid_dim = slice_num;
 
     if (grid_dim > 0) {
         kernel::calculate_slice_lengths_call(
-            cfg, grid_dim, sg_size, 0, exec->get_queue(), num_rows, slice_size,
-            slice_num, stride_factor, nnz_per_row.get_const_data(),
+            sg_size, grid_dim, sg_size, 0, exec->get_queue(), num_rows,
+            slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(),
             slice_lengths, slice_sets);
     }
 
+    components::prefix_sum(exec, slice_sets, slice_num + 1);
+
     grid_dim = ceildiv(num_rows, wg_size);
     if (grid_dim > 0) {
-        kernel::fill_in_sellp(grid_dim, wg_size, 0, exec->get_queue(), num_rows,
-                              num_cols, slice_size, stride,
-                              source->get_const_values(), slice_lengths,
-                              slice_sets, col_idxs, vals);
+        kernel::fill_in_sellp_call(cfg, grid_dim, wg_size, 0, exec->get_queue(),
+                                   num_rows, num_cols, slice_size, stride,
+                                   source->get_const_values(), slice_lengths,
+                                   slice_sets, col_idxs, vals);
     }
 }
@@ -1380,14 +1497,12 @@ void calculate_max_nnz_per_row(std::shared_ptr<const DpcppExecutor> exec,
     calculate_nonzeros_per_row(exec, source, &nnz_per_row);
     auto queue = exec->get_queue();
     constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-    const ConfigSetType cfg =
-        get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
            return validate(queue, KCFG_1D::decode<0>(cfg),
                            KCFG_1D::decode<1>(cfg));
        });
     const auto wg_size = KCFG_1D::decode<0>(cfg);
-    std::cout << "wg_size " << wg_size << "sg_size " << KCFG_1D::decode<1>(cfg)
-              << std::endl;
     const auto n = ceildiv(num_rows, wg_size);
     const size_type grid_dim = (n <= wg_size) ? n : wg_size;
@@ -1417,8 +1532,8 @@ void calculate_nonzeros_per_row(std::shared_ptr<const DpcppExecutor> exec,
 {
     auto queue = exec->get_queue();
     constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-    const ConfigSetType cfg =
-        get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
            return validate(queue, KCFG_1D::decode<0>(cfg),
                            KCFG_1D::decode<1>(cfg));
        });
@@ -1463,8 +1578,8 @@ void calculate_total_cols(std::shared_ptr<const DpcppExecutor> exec,
     auto max_nnz_per_slice = Array<size_type>(exec, slice_num);
     auto queue = exec->get_queue();
     constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
-    const ConfigSetType cfg =
-        get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) {
+    const std::uint32_t cfg =
+        get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) {
            return validate(queue, KCFG_1D::decode<0>(cfg),
                            KCFG_1D::decode<1>(cfg));
        });
@@ -1502,27 +1617,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
 template <typename ValueType>
 void transpose(std::shared_ptr<const DpcppExecutor> exec,
                const matrix::Dense<ValueType> *orig,
-               matrix::Dense<ValueType> *trans)
-{
-    // if (cublas::is_supported<ValueType>::value) {
-    //     auto handle = exec->get_cublas_handle();
-    //     {
-    //         cublas::pointer_mode_guard pm_guard(handle);
-    //         auto alpha = one<ValueType>();
-    //         auto beta = zero<ValueType>();
-    //         cublas::geam(
-    //             handle, oneapi::mkl::transpose::trans,
-    //             oneapi::mkl::transpose::nontrans, orig->get_size()[0],
-    //             orig->get_size()[1], &alpha, orig->get_const_values(),
-    //             orig->get_stride(), &beta,
-    //             static_cast<ValueType *>(nullptr), trans->get_size()[1],
-    //             trans->get_values(), trans->get_stride());
-    //     }
-    // } else {
-    //     GKO_NOT_IMPLEMENTED;
-    // }
-    GKO_NOT_IMPLEMENTED;
-};
+               matrix::Dense<ValueType> *trans) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
@@ -1530,27 +1625,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 template <typename ValueType>
 void conj_transpose(std::shared_ptr<const DpcppExecutor> exec,
                     const matrix::Dense<ValueType> *orig,
-                    matrix::Dense<ValueType> *trans)
-{
-    // if (cublas::is_supported<ValueType>::value) {
-    //     auto handle = exec->get_cublas_handle();
-    //     {
-    //         cublas::pointer_mode_guard pm_guard(handle);
-    //         auto alpha = one<ValueType>();
-    //         auto beta = zero<ValueType>();
-    //         cublas::geam(
-    //             handle, oneapi::mkl::transpose::conjtrans,
-    //             oneapi::mkl::transpose::nontrans, orig->get_size()[0],
-    //             orig->get_size()[1], &alpha, orig->get_const_values(),
-    //             orig->get_stride(), &beta,
-    //             static_cast<ValueType *>(nullptr), trans->get_size()[1],
-    //             trans->get_values(), trans->get_stride());
-    //     }
-    // } else {
-    //     GKO_NOT_IMPLEMENTED;
-    // }
-    GKO_NOT_IMPLEMENTED;
-}
+                    matrix::Dense<ValueType> *trans) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
diff --git a/dpcpp/test/components/prefix_sum.cpp b/dpcpp/test/components/prefix_sum.cpp
index 2ae72880443..402192d0b77 100644
--- a/dpcpp/test/components/prefix_sum.cpp
+++ b/dpcpp/test/components/prefix_sum.cpp
@@ -44,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
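
Background for the SELL-P bookkeeping touched in convert_to_sellp and calculate_total_cols above: each slice of slice_size rows is padded to its longest row (rounded up to a multiple of stride_factor), slice_sets is the prefix sum of those padded lengths (computed by the components::prefix_sum call added above), and total_cols is their sum. A plain C++ reference of that computation (illustrative only):

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        const std::size_t slice_size = 2, stride_factor = 1;
        const std::vector<std::size_t> nnz_per_row{3, 1, 4, 4, 2};

        const auto slice_num =
            (nnz_per_row.size() + slice_size - 1) / slice_size;
        std::vector<std::size_t> slice_sets(slice_num + 1, 0);
        std::size_t total_cols = 0;
        for (std::size_t s = 0; s < slice_num; ++s) {
            std::size_t len = 0;  // longest row in this slice
            for (std::size_t r = s * slice_size;
                 r < std::min((s + 1) * slice_size, nnz_per_row.size()); ++r) {
                len = std::max(len, nnz_per_row[r]);
            }
            // round up to a multiple of stride_factor, as the kernels do
            len = (len + stride_factor - 1) / stride_factor * stride_factor;
            slice_sets[s + 1] = slice_sets[s] + len;  // prefix sum
            total_cols += len;
        }
        std::cout << "total_cols = " << total_cols << "\n";  // 3 + 4 + 2 = 9
    }
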
#include "core/components/fill_array.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/test/utils.hpp" namespace { @@ -64,9 +65,12 @@ class Dense : public ::testing::Test { using vtype = double; #endif // GINKGO_DPCPP_SINGLE_MODE using Mtx = gko::matrix::Dense; + using MixedMtx = gko::matrix::Dense>; using NormVector = gko::matrix::Dense>; using Arr = gko::Array; - // using ComplexMtx = gko::matrix::Dense>; + using ComplexMtx = gko::matrix::Dense>; + using MixedComplexMtx = + gko::matrix::Dense>>; Dense() : rand_engine(15) {} @@ -116,15 +120,16 @@ class Dense : public ::testing::Test { void set_up_apply_data() { x = gen_mtx(65, 25); - // c_x = gen_mtx(65, 25); + c_x = gen_mtx(65, 25); y = gen_mtx(25, 35); expected = gen_mtx(65, 35); alpha = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); + square = gen_mtx(x->get_size()[0], x->get_size()[0]); dx = Mtx::create(dpcpp); dx->copy_from(x.get()); - // dc_x = ComplexMtx::create(dpcpp); - // dc_x->copy_from(c_x.get()); + dc_x = ComplexMtx::create(dpcpp); + dc_x->copy_from(c_x.get()); dy = Mtx::create(dpcpp); dy->copy_from(y.get()); dresult = Mtx::create(dpcpp); @@ -133,6 +138,8 @@ class Dense : public ::testing::Test { dalpha->copy_from(alpha.get()); dbeta = Mtx::create(dpcpp); dbeta->copy_from(beta.get()); + dsquare = Mtx::create(dpcpp); + dsquare->copy_from(square.get()); std::vector tmp(x->get_size()[0], 0); auto rng = std::default_random_engine{}; @@ -141,14 +148,25 @@ class Dense : public ::testing::Test { std::vector tmp2(x->get_size()[1], 0); std::iota(tmp2.begin(), tmp2.end(), 0); std::shuffle(tmp2.begin(), tmp2.end(), rng); + std::vector tmp3(x->get_size()[0] / 10); + std::uniform_int_distribution row_dist(0, x->get_size()[0] - 1); + for (auto &i : tmp3) { + i = row_dist(rng); + } rpermute_idxs = std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); - drpermute_idxs = - std::unique_ptr(new Arr{dpcpp, tmp.begin(), tmp.end()}); cpermute_idxs = std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); - dcpermute_idxs = - std::unique_ptr(new Arr{dpcpp, tmp2.begin(), tmp2.end()}); + rgather_idxs = + std::unique_ptr(new Arr{ref, tmp3.begin(), tmp3.end()}); + } + + template + std::unique_ptr convert(InputType &&input) + { + auto result = ConvertedType::create(input->get_executor()); + input->convert_to(result.get()); + return result; } std::shared_ptr ref; @@ -157,21 +175,22 @@ class Dense : public ::testing::Test { std::ranlux48 rand_engine; std::unique_ptr x; - // std::unique_ptr c_x; + std::unique_ptr c_x; std::unique_ptr y; std::unique_ptr alpha; std::unique_ptr beta; std::unique_ptr expected; + std::unique_ptr square; std::unique_ptr dresult; std::unique_ptr dx; - // std::unique_ptr dc_x; + std::unique_ptr dc_x; std::unique_ptr dy; std::unique_ptr dalpha; std::unique_ptr dbeta; + std::unique_ptr dsquare; std::unique_ptr rpermute_idxs; - std::unique_ptr drpermute_idxs; std::unique_ptr cpermute_idxs; - std::unique_ptr dcpermute_idxs; + std::unique_ptr rgather_idxs; }; @@ -201,7 +220,7 @@ TEST_F(Dense, DpcppStridedFillIsEquivalentToRef) dx->fill(42); result->copy_from(dx.get()); - GKO_ASSERT_MTX_NEAR(result, x, r::value); + GKO_ASSERT_MTX_NEAR(result, x, r::value); } @@ -318,6 +337,28 @@ TEST_F(Dense, MultipleVectorDpcppComputeDotIsEquivalentToRef) } +TEST_F(Dense, SingleVectorDpcppComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->compute_conj_dot(y.get(), expected.get()); + dx->compute_conj_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, 
+
+
+TEST_F(Dense, MultipleVectorDpcppComputeConjDotIsEquivalentToRef)
+{
+    set_up_vector_data(20);
+
+    x->compute_conj_dot(y.get(), expected.get());
+    dx->compute_conj_dot(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, r<vtype>::value);
+}
+
+
 TEST_F(Dense, DpcppComputeNorm2IsEquivalentToRef)
 {
     set_up_vector_data(20);
@@ -343,6 +384,23 @@ TEST_F(Dense, SimpleApplyIsEquivalentToRef)
 }
 
 
+#if !GINKGO_DPCPP_SINGLE_MODE
+
+
+TEST_F(Dense, SimpleApplyMixedIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    x->apply(convert<MixedMtx>(y).get(), convert<MixedMtx>(expected).get());
+    dx->apply(convert<MixedMtx>(dy).get(), convert<MixedMtx>(dresult).get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-7);
+}
+
+
+#endif  // !GINKGO_DPCPP_SINGLE_MODE
+
+
 TEST_F(Dense, AdvancedApplyIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -354,38 +412,142 @@ TEST_F(Dense, AdvancedApplyIsEquivalentToRef)
 }
 
 
-// TEST_F(Dense, ApplyToComplexIsEquivalentToRef)
-// {
-//     set_up_apply_data();
-//     auto complex_b = gen_mtx<ComplexMtx>(25, 1);
-//     auto dcomplex_b = ComplexMtx::create(dpcpp);
-//     dcomplex_b->copy_from(complex_b.get());
-//     auto complex_x = gen_mtx<ComplexMtx>(65, 1);
-//     auto dcomplex_x = ComplexMtx::create(dpcpp);
-//     dcomplex_x->copy_from(complex_x.get());
+#if !GINKGO_DPCPP_SINGLE_MODE
 
-//     x->apply(complex_b.get(), complex_x.get());
-//     dx->apply(dcomplex_b.get(), dcomplex_x.get());
 
-//     GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14);
-// }
+TEST_F(Dense, AdvancedApplyMixedIsEquivalentToRef)
+{
+    set_up_apply_data();
 
+    x->apply(convert<MixedMtx>(alpha).get(), convert<MixedMtx>(y).get(),
+             convert<MixedMtx>(beta).get(), convert<MixedMtx>(expected).get());
+    dx->apply(convert<MixedMtx>(dalpha).get(), convert<MixedMtx>(dy).get(),
+              convert<MixedMtx>(dbeta).get(),
+              convert<MixedMtx>(dresult).get());
 
-// TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef)
-// {
-//     set_up_apply_data();
-//     auto complex_b = gen_mtx<ComplexMtx>(25, 1);
-//     auto dcomplex_b = ComplexMtx::create(dpcpp);
-//     dcomplex_b->copy_from(complex_b.get());
-//     auto complex_x = gen_mtx<ComplexMtx>(65, 1);
-//     auto dcomplex_x = ComplexMtx::create(dpcpp);
-//     dcomplex_x->copy_from(complex_x.get());
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-7);
+}
 
-//     x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get());
-//     dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get());
 
-//     GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14);
-// }
+#endif  // !GINKGO_DPCPP_SINGLE_MODE
+
+
+TEST_F(Dense, ApplyToComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto complex_b = gen_mtx<ComplexMtx>(25, 1);
+    auto dcomplex_b = ComplexMtx::create(dpcpp);
+    dcomplex_b->copy_from(complex_b.get());
+    auto complex_x = gen_mtx<ComplexMtx>(65, 1);
+    auto dcomplex_x = ComplexMtx::create(dpcpp);
+    dcomplex_x->copy_from(complex_x.get());
+
+    x->apply(complex_b.get(), complex_x.get());
+    dx->apply(dcomplex_b.get(), dcomplex_x.get());
+
+    GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, r<vtype>::value);
+}
+
+
+#if !GINKGO_DPCPP_SINGLE_MODE
+
+
+TEST_F(Dense, ApplyToMixedComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto complex_b = gen_mtx<MixedComplexMtx>(25, 1);
+    auto dcomplex_b = MixedComplexMtx::create(dpcpp);
+    dcomplex_b->copy_from(complex_b.get());
+    auto complex_x = gen_mtx<MixedComplexMtx>(65, 1);
+    auto dcomplex_x = MixedComplexMtx::create(dpcpp);
+    dcomplex_x->copy_from(complex_x.get());
+
+    x->apply(complex_b.get(), complex_x.get());
+    dx->apply(dcomplex_b.get(), dcomplex_x.get());
+
+    GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7);
+}
+
+
+#endif  // !GINKGO_DPCPP_SINGLE_MODE
+
+
+TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto complex_b = gen_mtx<ComplexMtx>(25, 1);
+    auto dcomplex_b = ComplexMtx::create(dpcpp);
+    dcomplex_b->copy_from(complex_b.get());
+    auto complex_x = gen_mtx<ComplexMtx>(65, 1);
+    auto dcomplex_x = ComplexMtx::create(dpcpp);
+    dcomplex_x->copy_from(complex_x.get());
+
+    x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get());
+    dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get());
+
+    GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, r<vtype>::value);
+}
+
+
+#if !GINKGO_DPCPP_SINGLE_MODE
+
+
+TEST_F(Dense, AdvancedApplyToMixedComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto complex_b = gen_mtx<MixedComplexMtx>(25, 1);
+    auto dcomplex_b = MixedComplexMtx::create(dpcpp);
+    dcomplex_b->copy_from(complex_b.get());
+    auto complex_x = gen_mtx<MixedComplexMtx>(65, 1);
+    auto dcomplex_x = MixedComplexMtx::create(dpcpp);
+    dcomplex_x->copy_from(complex_x.get());
+
+    x->apply(convert<MixedMtx>(alpha).get(), complex_b.get(),
+             convert<MixedMtx>(beta).get(), complex_x.get());
+    dx->apply(convert<MixedMtx>(dalpha).get(), dcomplex_b.get(),
+              convert<MixedMtx>(dbeta).get(), dcomplex_x.get());
+
+    GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7);
+}
+
+
+#endif  // !GINKGO_DPCPP_SINGLE_MODE
+
+
+TEST_F(Dense, ComputeDotComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto complex_b = gen_mtx<ComplexMtx>(1234, 2);
+    auto dcomplex_b = ComplexMtx::create(dpcpp);
+    dcomplex_b->copy_from(complex_b.get());
+    auto complex_x = gen_mtx<ComplexMtx>(1234, 2);
+    auto dcomplex_x = ComplexMtx::create(dpcpp);
+    dcomplex_x->copy_from(complex_x.get());
+    auto result = ComplexMtx::create(ref, gko::dim<2>{1, 2});
+    auto dresult = ComplexMtx::create(dpcpp, gko::dim<2>{1, 2});
+
+    complex_b->compute_dot(complex_x.get(), result.get());
+    dcomplex_b->compute_dot(dcomplex_x.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(result, dresult, r<vtype>::value);
+}
+
+
+TEST_F(Dense, ComputeConjDotComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto complex_b = gen_mtx<ComplexMtx>(1234, 2);
+    auto dcomplex_b = ComplexMtx::create(dpcpp);
+    dcomplex_b->copy_from(complex_b.get());
+    auto complex_x = gen_mtx<ComplexMtx>(1234, 2);
+    auto dcomplex_x = ComplexMtx::create(dpcpp);
+    dcomplex_x->copy_from(complex_x.get());
+    auto result = ComplexMtx::create(ref, gko::dim<2>{1, 2});
+    auto dresult = ComplexMtx::create(dpcpp, gko::dim<2>{1, 2});
+
+    complex_b->compute_conj_dot(complex_x.get(), result.get());
+    dcomplex_b->compute_conj_dot(dcomplex_x.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(result, dresult, r<vtype>::value);
+}
 
 
 // TEST_F(Dense, IsTransposable)
@@ -494,42 +656,42 @@ TEST_F(Dense, MoveToEllIsEquivalentToRef)
 }
 
 
-// TEST_F(Dense, ConvertToSellpIsEquivalentToRef)
-// {
-//     set_up_apply_data();
-//     auto sellp_mtx = gko::matrix::Sellp<>::create(ref);
-//     auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp);
+TEST_F(Dense, ConvertToSellpIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto sellp_mtx = gko::matrix::Sellp<vtype>::create(ref);
+    auto dsellp_mtx = gko::matrix::Sellp<vtype>::create(dpcpp);
 
-//     x->convert_to(sellp_mtx.get());
-//     dx->convert_to(dsellp_mtx.get());
+    x->convert_to(sellp_mtx.get());
+    dx->convert_to(dsellp_mtx.get());
 
-//     GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-6);
-// }
+    GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r<vtype>::value);
+}
 
 
-// TEST_F(Dense, MoveToSellpIsEquivalentToRef)
-// {
-//     set_up_apply_data();
-//     auto sellp_mtx = gko::matrix::Sellp<>::create(ref);
-//     auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp);
+TEST_F(Dense, MoveToSellpIsEquivalentToRef)
+{
+    set_up_apply_data();
+    auto sellp_mtx = gko::matrix::Sellp<vtype>::create(ref);
+    auto dsellp_mtx = gko::matrix::Sellp<vtype>::create(dpcpp);
 
-//     x->move_to(sellp_mtx.get());
+    x->move_to(sellp_mtx.get());
+    dx->move_to(dsellp_mtx.get());
 
-//     GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-6);
-// }
+    GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r<vtype>::value);
+}
 
 
-// TEST_F(Dense, ConvertsEmptyToSellp)
-// {
-//     auto dempty_mtx = Mtx::create(dpcpp);
-//     auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp);
+TEST_F(Dense, ConvertsEmptyToSellp)
+{
+    auto dempty_mtx = Mtx::create(dpcpp);
+    auto dsellp_mtx = gko::matrix::Sellp<vtype>::create(dpcpp);
 
-//     dempty_mtx->convert_to(dsellp_mtx.get());
+    dempty_mtx->convert_to(dsellp_mtx.get());
 
-//     ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()),
-//     0); ASSERT_FALSE(dsellp_mtx->get_size());
-// }
+    ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0);
+    ASSERT_FALSE(dsellp_mtx->get_size());
+}
 
 
 TEST_F(Dense, CountNNZIsEquivalentToRef)
@@ -595,12 +757,63 @@ TEST_F(Dense, CalculateTotalColsIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, CanGatherRows)
+{
+    set_up_apply_data();
+
+    auto r_gather = x->row_gather(rgather_idxs.get());
+    auto dr_gather = dx->row_gather(rgather_idxs.get());
+
+    GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0);
+}
+
+
+TEST_F(Dense, CanGatherRowsIntoDense)
+{
+    set_up_apply_data();
+    auto gather_size =
+        gko::dim<2>{rgather_idxs->get_num_elems(), x->get_size()[1]};
+    auto r_gather = Mtx::create(ref, gather_size);
+    // test make_temporary_clone and non-default stride
+    auto dr_gather = Mtx::create(ref, gather_size, x->get_size()[1] + 2);
+
+    x->row_gather(rgather_idxs.get(), r_gather.get());
+    dx->row_gather(rgather_idxs.get(), dr_gather.get());
+
+    GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0);
+}
+
+
+TEST_F(Dense, IsPermutable)
+{
+    set_up_apply_data();
+
+    auto permuted = square->permute(rpermute_idxs.get());
+    auto dpermuted = dsquare->permute(rpermute_idxs.get());
+
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(permuted.get()),
+                        static_cast<Mtx *>(dpermuted.get()), 0);
+}
+
+
+TEST_F(Dense, IsInversePermutable)
+{
+    set_up_apply_data();
+
+    auto permuted = square->inverse_permute(rpermute_idxs.get());
+    auto dpermuted = dsquare->inverse_permute(rpermute_idxs.get());
+
+    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(permuted.get()),
+                        static_cast<Mtx *>(dpermuted.get()), 0);
+}
+
+
 TEST_F(Dense, IsRowPermutable)
 {
     set_up_apply_data();
 
     auto r_permute = x->row_permute(rpermute_idxs.get());
-    auto dr_permute = dx->row_permute(drpermute_idxs.get());
+    auto dr_permute = dx->row_permute(rpermute_idxs.get());
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(r_permute.get()),
                         static_cast<Mtx *>(dr_permute.get()), 0);
@@ -612,7 +825,7 @@ TEST_F(Dense, IsColPermutable)
     set_up_apply_data();
 
     auto c_permute = x->column_permute(cpermute_idxs.get());
-    auto dc_permute = dx->column_permute(dcpermute_idxs.get());
+    auto dc_permute = dx->column_permute(cpermute_idxs.get());
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(c_permute.get()),
                         static_cast<Mtx *>(dc_permute.get()), 0);
@@ -624,7 +837,7 @@ TEST_F(Dense, IsInverseRowPermutable)
     set_up_apply_data();
 
     auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get());
-    auto d_inverse_r_permute = dx->inverse_row_permute(drpermute_idxs.get());
+    auto d_inverse_r_permute = dx->inverse_row_permute(rpermute_idxs.get());
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_r_permute.get()),
                        static_cast<Mtx *>(d_inverse_r_permute.get()), 0);
@@ -636,14 +849,14 @@ TEST_F(Dense, IsInverseColPermutable)
     set_up_apply_data();
 
     auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get());
-    auto d_inverse_c_permute = dx->inverse_column_permute(dcpermute_idxs.get());
+    auto d_inverse_c_permute = dx->inverse_column_permute(cpermute_idxs.get());
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_c_permute.get()),
                         static_cast<Mtx *>(d_inverse_c_permute.get()), 0);
 }
 
 
-TEST_F(Dense, ExtractDiagonalIsEquivalentToRef)
+TEST_F(Dense, ExtractDiagonalOnTallSkinnyIsEquivalentToRef)
 {
     set_up_apply_data();
 
@@ -654,6 +867,17 @@ TEST_F(Dense, ExtractDiagonalOnTallSkinnyIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, ExtractDiagonalOnShortFatIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto diag = y->extract_diagonal();
+    auto ddiag = dy->extract_diagonal();
+
+    GKO_ASSERT_MTX_NEAR(diag.get(), ddiag.get(), 0);
+}
+
+
 TEST_F(Dense, InplaceAbsoluteMatrixIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -676,4 +900,76 @@ TEST_F(Dense, OutplaceAbsoluteMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, MakeComplexIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto complex_x = x->make_complex();
+    auto dcomplex_x = dx->make_complex();
+
+    GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0);
+}
+
+
+TEST_F(Dense, MakeComplexWithGivenResultIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto complex_x = ComplexMtx::create(ref, x->get_size());
+    x->make_complex(complex_x.get());
+    auto dcomplex_x = ComplexMtx::create(dpcpp, x->get_size());
+    dx->make_complex(dcomplex_x.get());
+
+    GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0);
+}
+
+
+TEST_F(Dense, GetRealIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto real_x = x->get_real();
+    auto dreal_x = dx->get_real();
+
+    GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0);
+}
+
+
+TEST_F(Dense, GetRealWithGivenResultIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto real_x = Mtx::create(ref, x->get_size());
+    x->get_real(real_x.get());
+    auto dreal_x = Mtx::create(dpcpp, dx->get_size());
+    dx->get_real(dreal_x.get());
+
+    GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0);
+}
+
+
+TEST_F(Dense, GetImagIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto imag_x = x->get_imag();
+    auto dimag_x = dx->get_imag();
+
+    GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0);
+}
+
+
+TEST_F(Dense, GetImagWithGivenResultIsEquivalentToRef)
+{
+    set_up_apply_data();
+
+    auto imag_x = Mtx::create(ref, x->get_size());
+    x->get_imag(imag_x.get());
+    auto dimag_x = Mtx::create(dpcpp, dx->get_size());
+    dx->get_imag(dimag_x.get());
+
+    GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0);
+}
+
+
 }  // namespace
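
The make_complex/get_real/get_imag tests above compare with tolerance 0, i.e. these operations are exact componentwise maps: make_complex pairs every real entry with a zero imaginary part, and get_real/get_imag project back. Reference semantics in scalar form (an illustrative sketch only, not Ginkgo's implementation):

    #include <cassert>
    #include <complex>
    #include <cstddef>
    #include <vector>

    int main()
    {
        std::vector<double> x{1.0, -2.5, 3.0};

        // make_complex: real -> complex with zero imaginary part
        std::vector<std::complex<double>> cx;
        for (auto v : x) {
            cx.emplace_back(v, 0.0);
        }

        // get_real / get_imag: exact componentwise projections
        for (std::size_t i = 0; i < x.size(); ++i) {
            assert(cx[i].real() == x[i]);
            assert(cx[i].imag() == 0.0);
        }
        return 0;
    }
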