From 19cbd4e0374e589e021bbb7b0da678b7d1f56896 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 17 Dec 2020 02:58:25 +0100
Subject: [PATCH 01/14] add ELL mixed-precision support

---
 benchmark/utils/formats.hpp              | 30 ++++++++++--
 common/matrix/ell_kernels.hpp.inc        | 62 ++++++++++++++----------
 core/device_hooks/common_kernels.inc.cpp | 17 ++++---
 core/matrix/ell_kernels.hpp              | 39 +++++++++------
 cuda/matrix/ell_kernels.cu               | 42 +++++++++-------
 include/ginkgo/core/base/types.hpp       | 51 +++++++++++++++++++
 omp/matrix/ell_kernels.cpp               | 35 +++++++------
 reference/matrix/ell_kernels.cpp         | 34 +++++++------
 8 files changed, 212 insertions(+), 98 deletions(-)
diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
index d25be4657ec..6b344aec06d 100644
--- a/benchmark/utils/formats.hpp
+++ b/benchmark/utils/formats.hpp
@@ -59,7 +59,8 @@ namespace formats {
 
 
 std::string available_format =
-    "coo, csr, ell, sellp, hybrid, hybrid0, hybrid25, hybrid33, hybrid40, "
+    "coo, csr, ell, fell, sellp, hybrid, hybrid0, hybrid25, hybrid33, "
+    "hybrid40, "
     "hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, "
     "hybridminstorage"
 #ifdef HAS_CUDA
@@ -90,6 +91,9 @@ std::string format_description =
     "csrm: Ginkgo's CSR implementation with merge_path strategy.\n"
     "ell: Ellpack format according to Bell and Garland: Efficient Sparse "
     "Matrix-Vector Multiplication on CUDA.\n"
+    "fell: float Ellpack format according to Bell and Garland: Efficient "
+    "Sparse "
+    "Matrix-Vector Multiplication on CUDA.\n"
     "sellp: Sliced Ellpack uses a default block size of 32.\n"
     "hybrid: Hybrid uses ell and coo to represent the matrix.\n"
     "hybrid0, hybrid25, hybrid33, hybrid40, hybrid60, hybrid80: Hybrid uses "
@@ -204,6 +208,23 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"csrc", READ_MATRIX(csr, std::make_shared<csr::classical>())},
         {"coo", read_matrix_from_data<gko::matrix::Coo<etype>>},
         {"ell", read_matrix_from_data<gko::matrix::Ell<etype>>},
+        {"fell",
+         [](std::shared_ptr<const gko::Executor> exec,
+            const gko::matrix_data<> &data) {
+             gko::matrix_data<float> conv_data;
+             conv_data.size = data.size;
+             conv_data.nonzeros.resize(data.nonzeros.size());
+             auto it = conv_data.nonzeros.begin();
+             for (auto &el : data.nonzeros) {
+                 it->row = el.row;
+                 it->column = el.column;
+                 it->value = el.value;
+                 ++it;
+             }
+             auto mat = gko::matrix::Ell<float>::create(std::move(exec));
+             mat->read(conv_data);
+             return mat;
+         }},
 #ifdef HAS_CUDA
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         {"cusp_csr", read_matrix_from_data<cusp_csr>},
@@ -212,8 +233,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"cusp_hybrid", read_matrix_from_data<cusp_hybrid>},
         {"cusp_coo", read_matrix_from_data<cusp_coo>},
         {"cusp_ell", read_matrix_from_data<cusp_ell>},
-#else // CUDA_VERSION >= 11000
-        // cusp_csr, cusp_coo use the generic ones from CUDA 11
+#else  // CUDA_VERSION >= 11000
+       // cusp_csr, cusp_coo use the generic ones from CUDA 11
         {"cusp_csr", read_matrix_from_data<cusp_gcsr>},
         {"cusp_coo", read_matrix_from_data<cusp_gcoo>},
 #endif
@@ -260,7 +281,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"hybridminstorage",
          READ_MATRIX(hybrid,
                      std::make_shared<hybrid::minimal_storage_limit>())},
-        {"sellp", read_matrix_from_data<gko::matrix::Sellp<etype>>}};
+        {"sellp", read_matrix_from_data<gko::matrix::Sellp<etype>>}
+};
 // clang-format on
 
 
diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc
index 00eb2d6871d..2f155142efd 100644
--- a/common/matrix/ell_kernels.hpp.inc
+++ b/common/matrix/ell_kernels.hpp.inc
@@ -34,22 +34,25 @@ namespace kernel {
 namespace {
 
 
-template <int num_thread_per_worker, bool atomic, typename ValueType,
+template <int num_thread_per_worker, bool atomic, typename InputValueType,
+          typename MatrixValueType, typename OutputValueType,
           typename IndexType, typename Closure>
 __device__ void spmv_kernel(
     const size_type num_rows, const int num_worker_per_row,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col,
     const size_type stride, const size_type num_stored_elements_per_row,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride, Closure op)
+    const InputValueType *__restrict__ b, const size_type b_stride,
+    OutputValueType *__restrict__ c, const size_type c_stride, Closure op)
 {
     const auto tidx = thread::get_thread_id_flat();
     const auto column_id = blockIdx.y;
+    using compute_type =
+        decltype(InputValueType{} + MatrixValueType{} + OutputValueType{});
     if (num_thread_per_worker == 1) {
         // Specialize the num_thread_per_worker = 1. It doesn't need the shared
         // memory, __syncthreads, and atomic_add
         if (tidx < num_rows) {
-            ValueType temp = zero<ValueType>();
+            auto temp = zero<compute_type>();
             for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
                 const auto ind = tidx + idx * stride;
                 const auto col_idx = col[ind];
@@ -68,14 +71,14 @@ __device__ void spmv_kernel(
             const auto x = tidx % num_rows;
             const auto worker_id = tidx / num_rows;
             const auto step_size = num_worker_per_row * num_thread_per_worker;
-            __shared__ UninitializedArray<ValueType, default_block_size /
-                                                         num_thread_per_worker>
+            __shared__ UninitializedArray<
+                compute_type, default_block_size / num_thread_per_worker>
                 storage;
             if (idx_in_worker == 0) {
                 storage[threadIdx.x] = 0;
             }
             __syncthreads();
-            ValueType temp = zero<ValueType>();
+            auto temp = zero<compute_type>();
             for (size_type idx =
                      worker_id * num_thread_per_worker + idx_in_worker;
                  idx < num_stored_elements_per_row; idx += step_size) {
@@ -102,35 +105,41 @@ __device__ void spmv_kernel(
 }
 
 
-template <int num_thread_per_worker, bool atomic = false, typename ValueType,
-          typename IndexType>
+template <int num_thread_per_worker, bool atomic = false,
+          typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
     const size_type num_rows, const int num_worker_per_row,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col,
     const size_type stride, const size_type num_stored_elements_per_row,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride)
+    const InputValueType *__restrict__ b, const size_type b_stride,
+    OutputValueType *__restrict__ c, const size_type c_stride)
 {
+    using compute_type =
+        decltype(InputValueType{} + MatrixValueType{} + OutputValueType{});
     spmv_kernel<num_thread_per_worker, atomic>(
         num_rows, num_worker_per_row, val, col, stride,
         num_stored_elements_per_row, b, b_stride, c, c_stride,
-        [](const ValueType &x, const ValueType &y) { return x; });
+        [](const compute_type &x, const OutputValueType &y) { return x; });
 }
 
 
-template <int num_thread_per_worker, bool atomic = false, typename ValueType,
-          typename IndexType>
+template <int num_thread_per_worker, bool atomic = false,
+          typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
     const size_type num_rows, const int num_worker_per_row,
-    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
-    const IndexType *__restrict__ col, const size_type stride,
-    const size_type num_stored_elements_per_row,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    const ValueType *__restrict__ beta, ValueType *__restrict__ c,
+    const MatrixValueType *__restrict__ alpha,
+    const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col,
+    const size_type stride, const size_type num_stored_elements_per_row,
+    const InputValueType *__restrict__ b, const size_type b_stride,
+    const OutputValueType *__restrict__ beta, OutputValueType *__restrict__ c,
     const size_type c_stride)
 {
-    const ValueType alpha_val = alpha[0];
-    const ValueType beta_val = beta[0];
+    using compute_type =
+        decltype(InputValueType{} + MatrixValueType{} + OutputValueType{});
+    const compute_type alpha_val = alpha[0];
+    const compute_type beta_val = beta[0];
     // Because the atomic operation changes the values of c during computation,
     // it can not do the right alpha * a * b + beta * c operation.
     // Thus, the cuda kernel only computes alpha * a * b when it uses atomic
@@ -139,15 +148,16 @@ __global__ __launch_bounds__(default_block_size) void spmv(
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, b_stride, c, c_stride,
-            [&alpha_val](const ValueType &x, const ValueType &y) {
+            [&alpha_val](const compute_type &x, const OutputValueType &y) {
                 return alpha_val * x;
             });
     } else {
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, b_stride, c, c_stride,
-            [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
-                return alpha_val * x + beta_val * y;
+            [&alpha_val, &beta_val](const compute_type &x,
+                                    const OutputValueType &y) {
+                return alpha_val * x + beta_val * compute_type{y};
             });
     }
 }
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 877051c3e04..42121074df4 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -931,15 +931,20 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 namespace ell {
 
 
-template <typename ValueType, typename IndexType>
-GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType)
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
+GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, OutputValueType,
+                            IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
-template <typename ValueType, typename IndexType>
-GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType)
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
+GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType,
+                                     OutputValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 template <typename ValueType, typename IndexType>
diff --git a/core/matrix/ell_kernels.hpp b/core/matrix/ell_kernels.hpp
index 7a1ec261d15..7cbe54b3720 100644
--- a/core/matrix/ell_kernels.hpp
+++ b/core/matrix/ell_kernels.hpp
@@ -46,18 +46,21 @@ namespace gko {
 namespace kernels {
 
 
-#define GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType)  \
-    void spmv(std::shared_ptr<const DefaultExecutor> exec, \
-              const matrix::Ell<ValueType, IndexType> *a,  \
-              const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
-
-#define GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType)  \
-    void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec, \
-                       const matrix::Dense<ValueType> *alpha,       \
-                       const matrix::Ell<ValueType, IndexType> *a,  \
-                       const matrix::Dense<ValueType> *b,           \
-                       const matrix::Dense<ValueType> *beta,        \
-                       matrix::Dense<ValueType> *c)
+#define GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, \
+                                    OutputValueType, IndexType)      \
+    void spmv(std::shared_ptr<const DefaultExecutor> exec,           \
+              const matrix::Ell<MatrixValueType, IndexType> *a,      \
+              const matrix::Dense<InputValueType> *b,                \
+              matrix::Dense<OutputValueType> *c)
+
+#define GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType, \
+                                             OutputValueType, IndexType)      \
+    void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,           \
+                       const matrix::Dense<MatrixValueType> *alpha,           \
+                       const matrix::Ell<MatrixValueType, IndexType> *a,      \
+                       const matrix::Dense<InputValueType> *b,                \
+                       const matrix::Dense<OutputValueType> *beta,            \
+                       matrix::Dense<OutputValueType> *c)
 
 #define GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
     void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,     \
@@ -87,10 +90,14 @@ namespace kernels {
                           matrix::Diagonal<ValueType> *diag)
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                         \
-    template <typename ValueType, typename IndexType>                        \
-    GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType);                       \
-    template <typename ValueType, typename IndexType>                        \
-    GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType);              \
+    template <typename InputValueType, typename MatrixValueType,             \
+              typename OutputValueType, typename IndexType>                  \
+    GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType,             \
+                                OutputValueType, IndexType);                 \
+    template <typename InputValueType, typename MatrixValueType,             \
+              typename OutputValueType, typename IndexType>                  \
+    GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType,    \
+                                         OutputValueType, IndexType);        \
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType);           \
     template <typename ValueType, typename IndexType>                        \
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 9bc47f71e63..df223dfd7cf 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -108,13 +108,14 @@ using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
 namespace {
 
 
-template <int info, typename ValueType, typename IndexType>
+template <int info, typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   matrix::Dense<ValueType> *c,
-                   const matrix::Dense<ValueType> *alpha = nullptr,
-                   const matrix::Dense<ValueType> *beta = nullptr)
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   matrix::Dense<OutputValueType> *c,
+                   const matrix::Dense<MatrixValueType> *alpha = nullptr,
+                   const matrix::Dense<OutputValueType> *beta = nullptr)
 {
     const auto nrows = a->get_size()[0];
     constexpr int num_thread_per_worker =
@@ -194,10 +195,12 @@ std::array<int, 3> compute_thread_worker_and_atomicity(
 }  // namespace
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void spmv(std::shared_ptr<const CudaExecutor> exec,
-          const matrix::Ell<ValueType, IndexType> *a,
-          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+          const matrix::Ell<MatrixValueType, IndexType> *a,
+          const matrix::Dense<InputValueType> *b,
+          matrix::Dense<OutputValueType> *c)
 {
     const auto data = compute_thread_worker_and_atomicity(exec, a);
     const int num_thread_per_worker = std::get<0>(data);
@@ -212,7 +215,8 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
     const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
         components::fill_array(exec, c->get_values(),
-                               c->get_num_stored_elements(), zero<ValueType>());
+                               c->get_num_stored_elements(),
+                               zero<OutputValueType>());
     }
     select_abstract_spmv(
         compiled_kernels(),
@@ -221,16 +225,18 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
         c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
-                   const matrix::Dense<ValueType> *alpha,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   const matrix::Dense<ValueType> *beta,
-                   matrix::Dense<ValueType> *c)
+                   const matrix::Dense<MatrixValueType> *alpha,
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   const matrix::Dense<OutputValueType> *beta,
+                   matrix::Dense<OutputValueType> *c)
 {
     const auto data = compute_thread_worker_and_atomicity(exec, a);
     const int num_thread_per_worker = std::get<0>(data);
@@ -253,7 +259,7 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
         alpha, beta);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 2c77aaef826..2f6da1e9e42 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -524,6 +524,57 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
 #endif
 
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \
+    template _macro(float, float, float, int32);                    \
+    template _macro(float, float, double, int32);                   \
+    template _macro(float, double, float, int32);                   \
+    template _macro(float, double, double, int32);                  \
+    template _macro(double, float, float, int32);                   \
+    template _macro(double, float, double, int32);                  \
+    template _macro(double, double, float, int32);                  \
+    template _macro(double, double, double, int32);                 \
+    template _macro(std::complex<float>, std::complex<float>,       \
+                    std::complex<float>, int32);                    \
+    template _macro(std::complex<float>, std::complex<float>,       \
+                    std::complex<double>, int32);                   \
+    template _macro(std::complex<float>, std::complex<double>,      \
+                    std::complex<float>, int32);                    \
+    template _macro(std::complex<float>, std::complex<double>,      \
+                    std::complex<double>, int32);                   \
+    template _macro(std::complex<double>, std::complex<float>,      \
+                    std::complex<float>, int32);                    \
+    template _macro(std::complex<double>, std::complex<float>,      \
+                    std::complex<double>, int32);                   \
+    template _macro(std::complex<double>, std::complex<double>,     \
+                    std::complex<float>, int32);                    \
+    template _macro(std::complex<double>, std::complex<double>,     \
+                    std::complex<double>, int32);                   \
+    template _macro(float, float, float, int64);                    \
+    template _macro(float, float, double, int64);                   \
+    template _macro(float, double, float, int64);                   \
+    template _macro(float, double, double, int64);                  \
+    template _macro(double, float, float, int64);                   \
+    template _macro(double, float, double, int64);                  \
+    template _macro(double, double, float, int64);                  \
+    template _macro(double, double, double, int64);                 \
+    template _macro(std::complex<float>, std::complex<float>,       \
+                    std::complex<float>, int64);                    \
+    template _macro(std::complex<float>, std::complex<float>,       \
+                    std::complex<double>, int64);                   \
+    template _macro(std::complex<float>, std::complex<double>,      \
+                    std::complex<float>, int64);                    \
+    template _macro(std::complex<float>, std::complex<double>,      \
+                    std::complex<double>, int64);                   \
+    template _macro(std::complex<double>, std::complex<float>,      \
+                    std::complex<float>, int64);                    \
+    template _macro(std::complex<double>, std::complex<float>,      \
+                    std::complex<double>, int64);                   \
+    template _macro(std::complex<double>, std::complex<double>,     \
+                    std::complex<float>, int64);                    \
+    template _macro(std::complex<double>, std::complex<double>,     \
+                    std::complex<double>, int64)
+
+
 /**
  * Instantiates a template for each value type conversion pair compiled by
  * Ginkgo.
diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp
index a590357a1c2..621e1cfe940 100644
--- a/omp/matrix/ell_kernels.cpp
+++ b/omp/matrix/ell_kernels.cpp
@@ -56,38 +56,44 @@ namespace omp {
 namespace ell {
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void spmv(std::shared_ptr<const OmpExecutor> exec,
-          const matrix::Ell<ValueType, IndexType> *a,
-          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+          const matrix::Ell<MatrixValueType, IndexType> *a,
+          const matrix::Dense<InputValueType> *b,
+          matrix::Dense<OutputValueType> *c)
 {
     auto num_stored_elements_per_row = a->get_num_stored_elements_per_row();
 
 #pragma omp parallel for
     for (size_type row = 0; row < a->get_size()[0]; row++) {
         for (size_type j = 0; j < c->get_size()[1]; j++) {
-            c->at(row, j) = zero<ValueType>();
+            c->at(row, j) = zero<OutputValueType>();
         }
         for (size_type i = 0; i < num_stored_elements_per_row; i++) {
             auto val = a->val_at(row, i);
             auto col = a->col_at(row, i);
+
             for (size_type j = 0; j < c->get_size()[1]; j++) {
-                c->at(row, j) += val * b->at(col, j);
+                c->at(row, j) +=
+                    OutputValueType(val * MatrixValueType(b->at(col, j)));
             }
         }
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
-                   const matrix::Dense<ValueType> *alpha,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   const matrix::Dense<ValueType> *beta,
-                   matrix::Dense<ValueType> *c)
+                   const matrix::Dense<MatrixValueType> *alpha,
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   const matrix::Dense<OutputValueType> *beta,
+                   matrix::Dense<OutputValueType> *c)
 {
     auto num_stored_elements_per_row = a->get_num_stored_elements_per_row();
     auto alpha_val = alpha->at(0, 0);
@@ -102,13 +108,14 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
             auto val = a->val_at(row, i);
             auto col = a->col_at(row, i);
             for (size_type j = 0; j < c->get_size()[1]; j++) {
-                c->at(row, j) += alpha_val * val * b->at(col, j);
+                c->at(row, j) += OutputValueType(
+                    alpha_val * val * MatrixValueType(b->at(col, j)));
             }
         }
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp
index fdbaff5dfc5..fee0dd5f7fc 100644
--- a/reference/matrix/ell_kernels.cpp
+++ b/reference/matrix/ell_kernels.cpp
@@ -50,37 +50,42 @@ namespace reference {
 namespace ell {
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void spmv(std::shared_ptr<const ReferenceExecutor> exec,
-          const matrix::Ell<ValueType, IndexType> *a,
-          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+          const matrix::Ell<MatrixValueType, IndexType> *a,
+          const matrix::Dense<InputValueType> *b,
+          matrix::Dense<OutputValueType> *c)
 {
     auto num_stored_elements_per_row = a->get_num_stored_elements_per_row();
 
     for (size_type row = 0; row < a->get_size()[0]; row++) {
         for (size_type j = 0; j < c->get_size()[1]; j++) {
-            c->at(row, j) = zero<ValueType>();
+            c->at(row, j) = zero<OutputValueType>();
         }
         for (size_type i = 0; i < num_stored_elements_per_row; i++) {
             auto val = a->val_at(row, i);
             auto col = a->col_at(row, i);
             for (size_type j = 0; j < c->get_size()[1]; j++) {
-                c->at(row, j) += val * b->at(col, j);
+                c->at(row, j) +=
+                    OutputValueType(val * MatrixValueType(b->at(col, j)));
             }
         }
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
-                   const matrix::Dense<ValueType> *alpha,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   const matrix::Dense<ValueType> *beta,
-                   matrix::Dense<ValueType> *c)
+                   const matrix::Dense<MatrixValueType> *alpha,
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   const matrix::Dense<OutputValueType> *beta,
+                   matrix::Dense<OutputValueType> *c)
 {
     auto num_stored_elements_per_row = a->get_num_stored_elements_per_row();
     auto alpha_val = alpha->at(0, 0);
@@ -94,13 +99,14 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
             auto val = a->val_at(row, i);
             auto col = a->col_at(row, i);
             for (size_type j = 0; j < c->get_size()[1]; j++) {
-                c->at(row, j) += alpha_val * val * b->at(col, j);
+                c->at(row, j) += OutputValueType(
+                    alpha_val * val * MatrixValueType(b->at(col, j)));
             }
         }
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 

From 89ffe458a572e9ca257a994d7f291b0413d69159 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <goebel.fritz@googlemail.com>
Date: Fri, 12 Mar 2021 16:40:50 +0100
Subject: [PATCH 02/14] fix benchmark formats, make ell spmv use accessor

---
 benchmark/utils/formats.hpp       |  8 +--
 common/matrix/ell_kernels.hpp.inc | 65 ++++++++++------------
 cuda/matrix/ell_kernels.cu        | 49 +++++++++++++----
 hip/matrix/ell_kernels.hip.cpp    | 89 ++++++++++++++++++++++---------
 omp/matrix/ell_kernels.cpp        | 48 +++++++++++++----
 reference/matrix/ell_kernels.cpp  | 49 +++++++++++++----
 6 files changed, 210 insertions(+), 98 deletions(-)

diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
index 6b344aec06d..ed20f91119e 100644
--- a/benchmark/utils/formats.hpp
+++ b/benchmark/utils/formats.hpp
@@ -208,10 +208,10 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"csrc", READ_MATRIX(csr, std::make_shared<csr::classical>())},
         {"coo", read_matrix_from_data<gko::matrix::Coo<etype>>},
         {"ell", read_matrix_from_data<gko::matrix::Ell<etype>>},
-        {"fell",
+        {"ell_mixed",
          [](std::shared_ptr<const gko::Executor> exec,
-            const gko::matrix_data<> &data) {
-             gko::matrix_data<float> conv_data;
+            const gko::matrix_data<etype> &data) {
+             gko::matrix_data<gko::next_precision<etype>> conv_data;
              conv_data.size = data.size;
              conv_data.nonzeros.resize(data.nonzeros.size());
              auto it = conv_data.nonzeros.begin();
@@ -221,7 +221,7 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
                  it->value = el.value;
                  ++it;
              }
-             auto mat = gko::matrix::Ell<float>::create(std::move(exec));
+             auto mat = gko::matrix::Ell<gko::next_precision<etype>>::create(std::move(exec));
              mat->read(conv_data);
              return mat;
          }},
diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc
index 2f155142efd..79ed4ed979b 100644
--- a/common/matrix/ell_kernels.hpp.inc
+++ b/common/matrix/ell_kernels.hpp.inc
@@ -34,32 +34,30 @@ namespace kernel {
 namespace {
 
 
-template <int num_thread_per_worker, bool atomic, typename InputValueType,
-          typename MatrixValueType, typename OutputValueType,
-          typename IndexType, typename Closure>
+template <int num_thread_per_worker, bool atomic, typename b_accessor,
+          typename a_accessor, typename OutputValueType, typename IndexType,
+          typename Closure>
 __device__ void spmv_kernel(
     const size_type num_rows, const int num_worker_per_row,
-    const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col,
+    acc::range<a_accessor> val, const IndexType *__restrict__ col,
     const size_type stride, const size_type num_stored_elements_per_row,
-    const InputValueType *__restrict__ b, const size_type b_stride,
+    acc::range<b_accessor> b, const size_type b_stride,
     OutputValueType *__restrict__ c, const size_type c_stride, Closure op)
 {
     const auto tidx = thread::get_thread_id_flat();
-    const auto column_id = blockIdx.y;
-    using compute_type =
-        decltype(InputValueType{} + MatrixValueType{} + OutputValueType{});
+    const decltype(tidx) column_id = blockIdx.y;
     if (num_thread_per_worker == 1) {
         // Specialize the num_thread_per_worker = 1. It doesn't need the shared
         // memory, __syncthreads, and atomic_add
         if (tidx < num_rows) {
-            auto temp = zero<compute_type>();
+            auto temp = zero<OutputValueType>();
             for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
                 const auto ind = tidx + idx * stride;
                 const auto col_idx = col[ind];
                 if (col_idx < idx) {
                     break;
                 } else {
-                    temp += val[ind] * b[col_idx * b_stride + column_id];
+                    temp += val(ind) * b(col_idx, column_id);
                 }
             }
             const auto c_ind = tidx * c_stride + column_id;
@@ -72,13 +70,13 @@ __device__ void spmv_kernel(
             const auto worker_id = tidx / num_rows;
             const auto step_size = num_worker_per_row * num_thread_per_worker;
             __shared__ UninitializedArray<
-                compute_type, default_block_size / num_thread_per_worker>
+                OutputValueType, default_block_size / num_thread_per_worker>
                 storage;
             if (idx_in_worker == 0) {
                 storage[threadIdx.x] = 0;
             }
             __syncthreads();
-            auto temp = zero<compute_type>();
+            auto temp = zero<OutputValueType>();
             for (size_type idx =
                      worker_id * num_thread_per_worker + idx_in_worker;
                  idx < num_stored_elements_per_row; idx += step_size) {
@@ -87,7 +85,7 @@ __device__ void spmv_kernel(
                 if (col_idx < idx) {
                     break;
                 } else {
-                    temp += val[ind] * b[col_idx * b_stride + column_id];
+                    temp += val(ind) * b(col_idx, column_id);
                 }
             }
             atomic_add(&storage[threadIdx.x], temp);
@@ -105,41 +103,34 @@ __device__ void spmv_kernel(
 }
 
 
-template <int num_thread_per_worker, bool atomic = false,
-          typename InputValueType, typename MatrixValueType,
-          typename OutputValueType, typename IndexType>
+template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
+          typename a_accessor, typename OutputValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
     const size_type num_rows, const int num_worker_per_row,
-    const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col,
+    acc::range<a_accessor> val, const IndexType *__restrict__ col,
     const size_type stride, const size_type num_stored_elements_per_row,
-    const InputValueType *__restrict__ b, const size_type b_stride,
+    acc::range<b_accessor> b, const size_type b_stride,
     OutputValueType *__restrict__ c, const size_type c_stride)
 {
-    using compute_type =
-        decltype(InputValueType{} + MatrixValueType{} + OutputValueType{});
     spmv_kernel<num_thread_per_worker, atomic>(
         num_rows, num_worker_per_row, val, col, stride,
         num_stored_elements_per_row, b, b_stride, c, c_stride,
-        [](const compute_type &x, const OutputValueType &y) { return x; });
+        [](const OutputValueType &x, const OutputValueType &y) { return x; });
 }
 
 
-template <int num_thread_per_worker, bool atomic = false,
-          typename InputValueType, typename MatrixValueType,
-          typename OutputValueType, typename IndexType>
+template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
+          typename a_accessor, typename OutputValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
     const size_type num_rows, const int num_worker_per_row,
-    const MatrixValueType *__restrict__ alpha,
-    const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col,
-    const size_type stride, const size_type num_stored_elements_per_row,
-    const InputValueType *__restrict__ b, const size_type b_stride,
-    const OutputValueType *__restrict__ beta, OutputValueType *__restrict__ c,
-    const size_type c_stride)
+    acc::range<a_accessor> alpha, acc::range<a_accessor> val,
+    const IndexType *__restrict__ col, const size_type stride,
+    const size_type num_stored_elements_per_row, acc::range<b_accessor> b,
+    const size_type b_stride, const OutputValueType *__restrict__ beta,
+    OutputValueType *__restrict__ c, const size_type c_stride)
 {
-    using compute_type =
-        decltype(InputValueType{} + MatrixValueType{} + OutputValueType{});
-    const compute_type alpha_val = alpha[0];
-    const compute_type beta_val = beta[0];
+    const OutputValueType alpha_val = alpha(0);
+    const OutputValueType beta_val = beta[0];
     // Because the atomic operation changes the values of c during computation,
     // it can not do the right alpha * a * b + beta * c operation.
     // Thus, the cuda kernel only computes alpha * a * b when it uses atomic
@@ -148,16 +139,16 @@ __global__ __launch_bounds__(default_block_size) void spmv(
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, b_stride, c, c_stride,
-            [&alpha_val](const compute_type &x, const OutputValueType &y) {
+            [&alpha_val](const OutputValueType &x, const OutputValueType &y) {
                 return alpha_val * x;
             });
     } else {
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, b_stride, c, c_stride,
-            [&alpha_val, &beta_val](const compute_type &x,
+            [&alpha_val, &beta_val](const OutputValueType &x,
                                     const OutputValueType &y) {
-                return alpha_val * x + beta_val * compute_type{y};
+                return alpha_val * x + beta_val * y;
             });
     }
 }
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index df223dfd7cf..33db65c4c1c 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "accessor/reduced_row_major.hpp"
 #include "core/components/fill_array.hpp"
 #include "core/components/prefix_sum.hpp"
 #include "core/matrix/dense_kernels.hpp"
@@ -107,6 +108,17 @@ using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
 
 namespace {
 
+template <int dim, typename Type1, typename Type2>
+GKO_INLINE auto as_cuda_accessor(
+    const acc::range<acc::reduced_row_major<dim, Type1, Type2>> &acc)
+{
+    return acc::range<
+        acc::reduced_row_major<dim, cuda_type<Type1>, cuda_type<Type2>>>(
+        acc.get_accessor().get_size(),
+        as_cuda_type(acc.get_accessor().get_stored_data()),
+        acc.get_accessor().get_stride());
+}
+
 
 template <int info, typename InputValueType, typename MatrixValueType,
           typename OutputValueType, typename IndexType>
@@ -117,7 +129,16 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
                    const matrix::Dense<MatrixValueType> *alpha = nullptr,
                    const matrix::Dense<OutputValueType> *beta = nullptr)
 {
+    using a_accessor =
+        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+    using b_accessor =
+        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+
     const auto nrows = a->get_size()[0];
+    const auto stride = a->get_stride();
+    const auto num_stored_elements_per_row =
+        a->get_num_stored_elements_per_row();
+
     constexpr int num_thread_per_worker =
         (info == 0) ? max_thread_per_worker : info;
     constexpr bool atomic = (info == 0);
@@ -125,23 +146,31 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
                           num_thread_per_worker, 1);
     const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
                          b->get_size()[1], 1);
+
+    const auto a_vals = gko::acc::range<a_accessor>(
+        std::array<long unsigned int, 1>{
+            {num_stored_elements_per_row * stride}},
+        a->get_const_values());
+    const auto b_vals = gko::acc::range<b_accessor>(
+        std::array<long unsigned int, 2>{{nrows, b->get_stride()}},
+        b->get_const_values());
+
     if (alpha == nullptr && beta == nullptr) {
         kernel::spmv<num_thread_per_worker, atomic>
             <<<grid_size, block_size, 0, 0>>>(
-                nrows, num_worker_per_row, as_cuda_type(a->get_const_values()),
-                a->get_const_col_idxs(), a->get_stride(),
-                a->get_num_stored_elements_per_row(),
-                as_cuda_type(b->get_const_values()), b->get_stride(),
+                nrows, num_worker_per_row, as_cuda_accessor(a_vals),
+                a->get_const_col_idxs(), stride, num_stored_elements_per_row,
+                as_cuda_accessor(b_vals), b->get_stride(),
                 as_cuda_type(c->get_values()), c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
+        const auto alpha_val = gko::acc::range<a_accessor>(
+            std::array<long unsigned int, 1>{1}, alpha->get_const_values());
         kernel::spmv<num_thread_per_worker, atomic>
             <<<grid_size, block_size, 0, 0>>>(
-                nrows, num_worker_per_row,
-                as_cuda_type(alpha->get_const_values()),
-                as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
-                a->get_stride(), a->get_num_stored_elements_per_row(),
-                as_cuda_type(b->get_const_values()), b->get_stride(),
-                as_cuda_type(beta->get_const_values()),
+                nrows, num_worker_per_row, as_cuda_accessor(alpha_val),
+                as_cuda_accessor(a_vals), a->get_const_col_idxs(), stride,
+                num_stored_elements_per_row, as_cuda_accessor(b_vals),
+                b->get_stride(), as_cuda_type(beta->get_const_values()),
                 as_cuda_type(c->get_values()), c->get_stride());
     } else {
         GKO_KERNEL_NOT_FOUND;
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index 2e7a4eeeb50..8bcadd02370 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "accessor/reduced_row_major.hpp"
 #include "core/components/fill_array.hpp"
 #include "core/components/prefix_sum.hpp"
 #include "core/matrix/dense_kernels.hpp"
@@ -111,15 +112,37 @@ using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
 namespace {
 
 
-template <int info, typename ValueType, typename IndexType>
+template <int dim, typename Type1, typename Type2>
+GKO_INLINE auto as_hip_accessor(
+    const acc::range<acc::reduced_row_major<dim, Type1, Type2>> &acc)
+{
+    return acc::range<
+        acc::reduced_row_major<dim, hip_type<Type1>, hip_type<Type2>>>(
+        acc.get_accessor().get_size(),
+        as_hip_type(acc.get_accessor().get_stored_data()),
+        acc.get_accessor().get_stride());
+}
+
+
+template <int info, typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   matrix::Dense<ValueType> *c,
-                   const matrix::Dense<ValueType> *alpha = nullptr,
-                   const matrix::Dense<ValueType> *beta = nullptr)
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   matrix::Dense<OutputValueType> *c,
+                   const matrix::Dense<MatrixValueType> *alpha = nullptr,
+                   const matrix::Dense<OutputValueType> *beta = nullptr)
 {
+    using a_accessor =
+        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+    using b_accessor =
+        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+
     const auto nrows = a->get_size()[0];
+    const auto stride = a->get_stride();
+    const auto num_stored_elements_per_row =
+        a->get_num_stored_elements_per_row();
+
     constexpr int num_thread_per_worker =
         (info == 0) ? max_thread_per_worker : info;
     constexpr bool atomic = (info == 0);
@@ -127,22 +150,31 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
                           num_thread_per_worker, 1);
     const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
                          b->get_size()[1], 1);
+
+    const auto a_vals = gko::acc::range<a_accessor>(
+        std::array<long unsigned int, 1>{
+            {num_stored_elements_per_row * stride}},
+        a->get_const_values());
+    const auto b_vals = gko::acc::range<b_accessor>(
+        std::array<long unsigned int, 2>{{nrows, b->get_stride()}},
+        b->get_const_values());
+
     if (alpha == nullptr && beta == nullptr) {
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(kernel::spmv<num_thread_per_worker, atomic>),
             dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row,
-            as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
-            a->get_stride(), a->get_num_stored_elements_per_row(),
-            as_hip_type(b->get_const_values()), b->get_stride(),
-            as_hip_type(c->get_values()), c->get_stride());
+            as_hip_accessor(a_vals), a->get_const_col_idxs(), stride,
+            num_stored_elements_per_row, as_hip_accessor(b_vals),
+            b->get_stride(), as_hip_type(c->get_values()), c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
+        const auto alpha_val = gko::acc::range<a_accessor>(
+            std::array<long unsigned int, 1>{1}, alpha->get_const_values());
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(kernel::spmv<num_thread_per_worker, atomic>),
             dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row,
-            as_hip_type(alpha->get_const_values()),
-            as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
-            a->get_stride(), a->get_num_stored_elements_per_row(),
-            as_hip_type(b->get_const_values()), b->get_stride(),
+            as_hip_accessor(alpha_val), as_hip_accessor(a_vals),
+            a->get_const_col_idxs(), stride, num_stored_elements_per_row,
+            as_hip_accessor(b_vals), b->get_stride(),
             as_hip_type(beta->get_const_values()), as_hip_type(c->get_values()),
             c->get_stride());
     } else {
@@ -197,10 +229,12 @@ std::array<int, 3> compute_thread_worker_and_atomicity(
 }  // namespace
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void spmv(std::shared_ptr<const HipExecutor> exec,
-          const matrix::Ell<ValueType, IndexType> *a,
-          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+          const matrix::Ell<MatrixValueType, IndexType> *a,
+          const matrix::Dense<InputValueType> *b,
+          matrix::Dense<OutputValueType> *c)
 {
     const auto data = compute_thread_worker_and_atomicity(exec, a);
     const int num_thread_per_worker = std::get<0>(data);
@@ -215,7 +249,8 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
     const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
         components::fill_array(exec, c->get_values(),
-                               c->get_num_stored_elements(), zero<ValueType>());
+                               c->get_num_stored_elements(),
+                               zero<OutputValueType>());
     }
     select_abstract_spmv(
         compiled_kernels(),
@@ -224,16 +259,18 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
         c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
-                   const matrix::Dense<ValueType> *alpha,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   const matrix::Dense<ValueType> *beta,
-                   matrix::Dense<ValueType> *c)
+                   const matrix::Dense<MatrixValueType> *alpha,
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   const matrix::Dense<OutputValueType> *beta,
+                   matrix::Dense<OutputValueType> *c)
 {
     const auto data = compute_thread_worker_and_atomicity(exec, a);
     const int num_thread_per_worker = std::get<0>(data);
@@ -256,7 +293,7 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
         alpha, beta);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp
index 621e1cfe940..76e7eea80ed 100644
--- a/omp/matrix/ell_kernels.cpp
+++ b/omp/matrix/ell_kernels.cpp
@@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "accessor/reduced_row_major.hpp"
 #include "omp/components/format_conversion.hpp"
 
 
@@ -63,7 +64,21 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
           const matrix::Dense<InputValueType> *b,
           matrix::Dense<OutputValueType> *c)
 {
-    auto num_stored_elements_per_row = a->get_num_stored_elements_per_row();
+    using a_accessor =
+        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+    using b_accessor =
+        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+
+    const auto num_stored_elements_per_row =
+        a->get_num_stored_elements_per_row();
+    const auto stride = a->get_stride();
+    const auto num_rows = b->get_size()[0];
+    const auto a_vals = gko::acc::range<a_accessor>(
+        std::array<long unsigned int, 1>{num_stored_elements_per_row * stride},
+        a->get_const_values());
+    const auto b_vals = gko::acc::range<b_accessor>(
+        std::array<long unsigned int, 2>{num_rows, b->get_stride()},
+        b->get_const_values());
 
 #pragma omp parallel for
     for (size_type row = 0; row < a->get_size()[0]; row++) {
@@ -71,12 +86,10 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
             c->at(row, j) = zero<OutputValueType>();
         }
         for (size_type i = 0; i < num_stored_elements_per_row; i++) {
-            auto val = a->val_at(row, i);
+            auto val = a_vals(row + i * stride);
             auto col = a->col_at(row, i);
-
             for (size_type j = 0; j < c->get_size()[1]; j++) {
-                c->at(row, j) +=
-                    OutputValueType(val * MatrixValueType(b->at(col, j)));
+                c->at(row, j) += val * b_vals(col, j);
             }
         }
     }
@@ -95,9 +108,23 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
                    const matrix::Dense<OutputValueType> *beta,
                    matrix::Dense<OutputValueType> *c)
 {
-    auto num_stored_elements_per_row = a->get_num_stored_elements_per_row();
-    auto alpha_val = alpha->at(0, 0);
-    auto beta_val = beta->at(0, 0);
+    using a_accessor =
+        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+    using b_accessor =
+        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+
+    const auto num_stored_elements_per_row =
+        a->get_num_stored_elements_per_row();
+    const auto stride = a->get_stride();
+    const auto num_rows = b->get_size()[0];
+    const auto a_vals = gko::acc::range<a_accessor>(
+        std::array<long unsigned int, 1>{num_stored_elements_per_row * stride},
+        a->get_const_values());
+    const auto b_vals = gko::acc::range<b_accessor>(
+        std::array<long unsigned int, 2>{num_rows, b->get_stride()},
+        b->get_const_values());
+    const auto alpha_val = OutputValueType(alpha->at(0, 0));
+    const auto beta_val = beta->at(0, 0);
 
 #pragma omp parallel for
     for (size_type row = 0; row < a->get_size()[0]; row++) {
@@ -105,11 +132,10 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
             c->at(row, j) *= beta_val;
         }
         for (size_type i = 0; i < num_stored_elements_per_row; i++) {
-            auto val = a->val_at(row, i);
+            auto val = a_vals(row + i * stride);
             auto col = a->col_at(row, i);
             for (size_type j = 0; j < c->get_size()[1]; j++) {
-                c->at(row, j) += OutputValueType(
-                    alpha_val * val * MatrixValueType(b->at(col, j)));
+                c->at(row, j) += alpha_val * val * b_vals(col, j);
             }
         }
     }
diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp
index fee0dd5f7fc..ba6d687d18a 100644
--- a/reference/matrix/ell_kernels.cpp
+++ b/reference/matrix/ell_kernels.cpp
@@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "accessor/reduced_row_major.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace reference {
@@ -57,18 +60,31 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
           const matrix::Dense<InputValueType> *b,
           matrix::Dense<OutputValueType> *c)
 {
-    auto num_stored_elements_per_row = a->get_num_stored_elements_per_row();
+    using a_accessor =
+        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+    using b_accessor =
+        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+
+    const auto num_stored_elements_per_row =
+        a->get_num_stored_elements_per_row();
+    const auto stride = a->get_stride();
+    const auto num_rows = b->get_size()[0];
+    const auto a_vals = gko::acc::range<a_accessor>(
+        std::array<long unsigned int, 1>{num_stored_elements_per_row * stride},
+        a->get_const_values());
+    const auto b_vals = gko::acc::range<b_accessor>(
+        std::array<long unsigned int, 2>{num_rows, b->get_stride()},
+        b->get_const_values());
 
     for (size_type row = 0; row < a->get_size()[0]; row++) {
         for (size_type j = 0; j < c->get_size()[1]; j++) {
             c->at(row, j) = zero<OutputValueType>();
         }
         for (size_type i = 0; i < num_stored_elements_per_row; i++) {
-            auto val = a->val_at(row, i);
+            auto val = a_vals(row + i * stride);
             auto col = a->col_at(row, i);
             for (size_type j = 0; j < c->get_size()[1]; j++) {
-                c->at(row, j) +=
-                    OutputValueType(val * MatrixValueType(b->at(col, j)));
+                c->at(row, j) += val * b_vals(col, j);
             }
         }
     }
@@ -87,20 +103,33 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
                    const matrix::Dense<OutputValueType> *beta,
                    matrix::Dense<OutputValueType> *c)
 {
-    auto num_stored_elements_per_row = a->get_num_stored_elements_per_row();
-    auto alpha_val = alpha->at(0, 0);
-    auto beta_val = beta->at(0, 0);
+    using a_accessor =
+        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+    using b_accessor =
+        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+
+    const auto num_stored_elements_per_row =
+        a->get_num_stored_elements_per_row();
+    const auto stride = a->get_stride();
+    const auto num_rows = b->get_size()[0];
+    const auto a_vals = gko::acc::range<a_accessor>(
+        std::array<long unsigned int, 1>{num_stored_elements_per_row * stride},
+        a->get_const_values());
+    const auto b_vals = gko::acc::range<b_accessor>(
+        std::array<long unsigned int, 2>{num_rows, b->get_stride()},
+        b->get_const_values());
+    const auto alpha_val = OutputValueType(alpha->at(0, 0));
+    const auto beta_val = beta->at(0, 0);
 
     for (size_type row = 0; row < a->get_size()[0]; row++) {
         for (size_type j = 0; j < c->get_size()[1]; j++) {
             c->at(row, j) *= beta_val;
         }
         for (size_type i = 0; i < num_stored_elements_per_row; i++) {
-            auto val = a->val_at(row, i);
+            auto val = a_vals(row + i * stride);
             auto col = a->col_at(row, i);
             for (size_type j = 0; j < c->get_size()[1]; j++) {
-                c->at(row, j) += OutputValueType(
-                    alpha_val * val * MatrixValueType(b->at(col, j)));
+                c->at(row, j) += alpha_val * val * b_vals(col, j);
             }
         }
     }

From b9ba1787efa4d7b426cf13caf9b631e8c336a878 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <goebel.fritz@googlemail.com>
Date: Fri, 12 Mar 2021 17:55:32 +0100
Subject: [PATCH 03/14] Add tests for mixed precision ELL

---
 cuda/test/matrix/ell_kernels.cpp      | 222 +++++++++++++++++++-
 hip/test/matrix/ell_kernels.hip.cpp   | 221 ++++++++++++++++++++
 omp/test/matrix/ell_kernels.cpp       | 289 +++++++++++++++++++++++++-
 reference/test/matrix/ell_kernels.cpp | 254 +++++++++++++++++++++-
 4 files changed, 974 insertions(+), 12 deletions(-)

diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp
index dff573a284f..e24f7083728 100644
--- a/cuda/test/matrix/ell_kernels.cpp
+++ b/cuda/test/matrix/ell_kernels.cpp
@@ -58,6 +58,7 @@ class Ell : public ::testing::Test {
 protected:
     using Mtx = gko::matrix::Ell<>;
     using Vec = gko::matrix::Dense<>;
+    using Vec2 = gko::matrix::Dense<float>;
     using ComplexVec = gko::matrix::Dense<std::complex<double>>;
 
     Ell() : rand_engine(42) {}
@@ -92,22 +93,35 @@ class Ell : public ::testing::Test {
                           stride);
         mtx->copy_from(gen_mtx(num_rows, num_cols));
         expected = gen_mtx(num_rows, num_vectors);
+        expected2 = Vec2::create(ref);
+        expected2->copy_from(expected.get());
         y = gen_mtx(num_cols, num_vectors);
+        y2 = Vec2::create(ref);
+        y2->copy_from(y.get());
         alpha = gko::initialize<Vec>({2.0}, ref);
+        alpha2 = gko::initialize<Vec2>({2.0}, ref);
         beta = gko::initialize<Vec>({-1.0}, ref);
+        beta2 = gko::initialize<Vec2>({-1.0}, ref);
         dmtx = Mtx::create(cuda);
         dmtx->copy_from(mtx.get());
         dresult = Vec::create(cuda);
         dresult->copy_from(expected.get());
+        dresult2 = Vec2::create(cuda);
+        dresult2->copy_from(expected2.get());
         dy = Vec::create(cuda);
         dy->copy_from(y.get());
+        dy2 = Vec2::create(cuda);
+        dy2->copy_from(y2.get());
         dalpha = Vec::create(cuda);
         dalpha->copy_from(alpha.get());
+        dalpha2 = Vec2::create(cuda);
+        dalpha2->copy_from(alpha2.get());
         dbeta = Vec::create(cuda);
         dbeta->copy_from(beta.get());
+        dbeta2 = Vec2::create(cuda);
+        dbeta2->copy_from(beta2.get());
     }
 
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
 
@@ -115,15 +129,23 @@ class Ell : public ::testing::Test {
 
     std::unique_ptr<Mtx> mtx;
     std::unique_ptr<Vec> expected;
+    std::unique_ptr<Vec2> expected2;
     std::unique_ptr<Vec> y;
+    std::unique_ptr<Vec2> y2;
     std::unique_ptr<Vec> alpha;
+    std::unique_ptr<Vec2> alpha2;
     std::unique_ptr<Vec> beta;
+    std::unique_ptr<Vec2> beta2;
 
     std::unique_ptr<Mtx> dmtx;
     std::unique_ptr<Vec> dresult;
+    std::unique_ptr<Vec2> dresult2;
     std::unique_ptr<Vec> dy;
+    std::unique_ptr<Vec2> dy2;
     std::unique_ptr<Vec> dalpha;
+    std::unique_ptr<Vec2> dalpha2;
     std::unique_ptr<Vec> dbeta;
+    std::unique_ptr<Vec2> dbeta2;
 };
 
 
@@ -138,6 +160,39 @@ TEST_F(Ell, SimpleApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1)
+{
+    set_up_apply_data();
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef2)
+{
+    set_up_apply_data();
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3)
+{
+    set_up_apply_data();
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -149,6 +204,39 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef2)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 {
     set_up_apply_data(532, 231, 1, 300, 600);
@@ -160,6 +248,39 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 {
     set_up_apply_data(532, 231, 1, 300, 600);
@@ -170,6 +291,39 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(532, 231, 3, 300, 600);
@@ -181,6 +335,39 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(532, 231, 3, 300, 600);
@@ -192,6 +379,39 @@ TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, SimpleApplyByAtomicIsEquivalentToRef)
 {
     set_up_apply_data(10, 10000);
diff --git a/hip/test/matrix/ell_kernels.hip.cpp b/hip/test/matrix/ell_kernels.hip.cpp
index e9a875eb544..77253125abe 100644
--- a/hip/test/matrix/ell_kernels.hip.cpp
+++ b/hip/test/matrix/ell_kernels.hip.cpp
@@ -58,6 +58,7 @@ class Ell : public ::testing::Test {
 protected:
     using Mtx = gko::matrix::Ell<>;
     using Vec = gko::matrix::Dense<>;
+    using Vec2 = gko::matrix::Dense<float>;
     using ComplexVec = gko::matrix::Dense<std::complex<double>>;
 
     Ell() : rand_engine(42) {}
@@ -92,19 +93,33 @@ class Ell : public ::testing::Test {
                           stride);
         mtx->copy_from(gen_mtx(num_rows, num_cols));
         expected = gen_mtx(num_rows, num_vectors);
+        expected2 = Vec2::create(ref);
+        expected2->copy_from(expected.get());
         y = gen_mtx(num_cols, num_vectors);
+        y2 = Vec2::create(ref);
+        y2->copy_from(y.get());
         alpha = gko::initialize<Vec>({2.0}, ref);
+        alpha2 = gko::initialize<Vec2>({2.0}, ref);
         beta = gko::initialize<Vec>({-1.0}, ref);
+        beta2 = gko::initialize<Vec2>({-1.0}, ref);
         dmtx = Mtx::create(hip);
         dmtx->copy_from(mtx.get());
         dresult = Vec::create(hip);
         dresult->copy_from(expected.get());
+        dresult2 = Vec2::create(hip);
+        dresult2->copy_from(expected2.get());
         dy = Vec::create(hip);
         dy->copy_from(y.get());
+        dy2 = Vec2::create(hip);
+        dy2->copy_from(y2.get());
         dalpha = Vec::create(hip);
         dalpha->copy_from(alpha.get());
+        dalpha2 = Vec2::create(hip);
+        dalpha2->copy_from(alpha2.get());
         dbeta = Vec::create(hip);
         dbeta->copy_from(beta.get());
+        dbeta2 = Vec2::create(hip);
+        dbeta2->copy_from(beta2.get());
     }
 
 
@@ -115,15 +130,23 @@ class Ell : public ::testing::Test {
 
     std::unique_ptr<Mtx> mtx;
     std::unique_ptr<Vec> expected;
+    std::unique_ptr<Vec2> expected2;
     std::unique_ptr<Vec> y;
+    std::unique_ptr<Vec2> y2;
     std::unique_ptr<Vec> alpha;
+    std::unique_ptr<Vec2> alpha2;
     std::unique_ptr<Vec> beta;
+    std::unique_ptr<Vec2> beta2;
 
     std::unique_ptr<Mtx> dmtx;
     std::unique_ptr<Vec> dresult;
+    std::unique_ptr<Vec2> dresult2;
     std::unique_ptr<Vec> dy;
+    std::unique_ptr<Vec2> dy2;
     std::unique_ptr<Vec> dalpha;
+    std::unique_ptr<Vec2> dalpha2;
     std::unique_ptr<Vec> dbeta;
+    std::unique_ptr<Vec2> dbeta2;
 };
 
 
@@ -138,6 +161,39 @@ TEST_F(Ell, SimpleApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1)
+{
+    set_up_apply_data();
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef2)
+{
+    set_up_apply_data();
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3)
+{
+    set_up_apply_data();
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -149,6 +205,39 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef2)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 {
     set_up_apply_data(532, 231, 1, 300, 600);
@@ -160,6 +249,39 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 {
     set_up_apply_data(532, 231, 1, 300, 600);
@@ -170,6 +292,39 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3)
+{
+    set_up_apply_data(532, 231, 1, 300, 600);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(532, 231, 3, 300, 600);
@@ -181,6 +336,39 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(532, 231, 3, 300, 600);
@@ -192,6 +380,39 @@ TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(532, 231, 3, 300, 600);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, SimpleApplyByAtomicIsEquivalentToRef)
 {
     set_up_apply_data(10, 10000);
diff --git a/omp/test/matrix/ell_kernels.cpp b/omp/test/matrix/ell_kernels.cpp
index 281292e9684..01f1a31121d 100644
--- a/omp/test/matrix/ell_kernels.cpp
+++ b/omp/test/matrix/ell_kernels.cpp
@@ -57,6 +57,7 @@ class Ell : public ::testing::Test {
 protected:
     using Mtx = gko::matrix::Ell<>;
     using Vec = gko::matrix::Dense<>;
+    using Vec2 = gko::matrix::Dense<float>;
     using ComplexVec = gko::matrix::Dense<std::complex<double>>;
 
     Ell() : rand_engine(42) {}
@@ -90,19 +91,33 @@ class Ell : public ::testing::Test {
         mtx = Mtx::create(ref, gko::dim<2>{}, max_nonzeros_per_row, stride);
         mtx->copy_from(gen_mtx(532, 231, 1));
         expected = gen_mtx(532, num_vectors, 1);
+        expected2 = Vec2::create(ref);
+        expected2->copy_from(expected.get());
         y = gen_mtx(231, num_vectors, 1);
+        y2 = Vec2::create(ref);
+        y2->copy_from(y.get());
         alpha = gko::initialize<Vec>({2.0}, ref);
+        alpha2 = gko::initialize<Vec2>({2.0}, ref);
         beta = gko::initialize<Vec>({-1.0}, ref);
+        beta2 = gko::initialize<Vec2>({-1.0}, ref);
         dmtx = Mtx::create(omp);
         dmtx->copy_from(mtx.get());
         dresult = Vec::create(omp);
         dresult->copy_from(expected.get());
+        dresult2 = Vec2::create(omp);
+        dresult2->copy_from(expected2.get());
         dy = Vec::create(omp);
         dy->copy_from(y.get());
+        dy2 = Vec2::create(omp);
+        dy2->copy_from(y2.get());
         dalpha = Vec::create(omp);
         dalpha->copy_from(alpha.get());
+        dalpha2 = Vec2::create(omp);
+        dalpha2->copy_from(alpha2.get());
         dbeta = Vec::create(omp);
         dbeta->copy_from(beta.get());
+        dbeta2 = Vec2::create(omp);
+        dbeta2->copy_from(beta2.get());
     }
 
     std::shared_ptr<gko::ReferenceExecutor> ref;
@@ -112,15 +127,23 @@ class Ell : public ::testing::Test {
 
     std::unique_ptr<Mtx> mtx;
     std::unique_ptr<Vec> expected;
+    std::unique_ptr<Vec2> expected2;
     std::unique_ptr<Vec> y;
+    std::unique_ptr<Vec2> y2;
     std::unique_ptr<Vec> alpha;
+    std::unique_ptr<Vec2> alpha2;
     std::unique_ptr<Vec> beta;
+    std::unique_ptr<Vec2> beta2;
 
     std::unique_ptr<Mtx> dmtx;
     std::unique_ptr<Vec> dresult;
+    std::unique_ptr<Vec2> dresult2;
     std::unique_ptr<Vec> dy;
+    std::unique_ptr<Vec2> dy2;
     std::unique_ptr<Vec> dalpha;
+    std::unique_ptr<Vec2> dalpha2;
     std::unique_ptr<Vec> dbeta;
+    std::unique_ptr<Vec2> dbeta2;
 };
 
 
@@ -135,6 +158,39 @@ TEST_F(Ell, SimpleApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1)
+{
+    set_up_apply_data();
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef2)
+{
+    set_up_apply_data();
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3)
+{
+    set_up_apply_data();
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -146,6 +202,39 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef2)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, SimpleApplyWithPaddingIsEquivalentToRef)
 {
     set_up_apply_data(300, 600);
@@ -157,6 +246,39 @@ TEST_F(Ell, SimpleApplyWithPaddingIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyWithPaddingIsEquivalentToRef1)
+{
+    set_up_apply_data(300, 600);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithPaddingIsEquivalentToRef2)
+{
+    set_up_apply_data(300, 600);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithPaddingIsEquivalentToRef3)
+{
+    set_up_apply_data(300, 600);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyWithPaddingIsEquivalentToRef)
 {
     set_up_apply_data(300, 600);
@@ -167,6 +289,39 @@ TEST_F(Ell, AdvancedApplyWithPaddingIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyWithPaddingIsEquivalentToRef1)
+{
+    set_up_apply_data(300, 600);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithPaddingIsEquivalentToRef2)
+{
+    set_up_apply_data(300, 600);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithPaddingIsEquivalentToRef3)
+{
+    set_up_apply_data(300, 600);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, SimpleApplyToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(0, 0, 3);
@@ -178,6 +333,39 @@ TEST_F(Ell, SimpleApplyToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(0, 0, 3);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(0, 0, 3);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(0, 0, 3);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(0, 0, 3);
@@ -189,7 +377,40 @@ TEST_F(Ell, AdvancedApplyToDenseMatrixIsEquivalentToRef)
 }
 
 
-TEST_F(Ell, SimpleApplyWithPaddinToDenseMatrixIsEquivalentToRef)
+TEST_F(Ell, MixedAdvancedApplyToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(0, 0, 3);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(0, 0, 3);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(0, 0, 3);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, SimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(300, 600, 3);
 
@@ -200,6 +421,39 @@ TEST_F(Ell, SimpleApplyWithPaddinToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(300, 600, 3);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(300, 600, 3);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(300, 600, 3);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(300, 600, 3);
@@ -210,6 +464,39 @@ TEST_F(Ell, AdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(300, 600, 3);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(300, 600, 3);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(300, 600, 3);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, ApplyToComplexIsEquivalentToRef)
 {
     set_up_apply_data(300, 600);
diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp
index 77436ad1287..c3326737e4f 100644
--- a/reference/test/matrix/ell_kernels.cpp
+++ b/reference/test/matrix/ell_kernels.cpp
@@ -120,11 +120,46 @@ TYPED_TEST(Ell, AppliesToDenseVector)
 }
 
 
-TYPED_TEST(Ell, AppliesToMixedDenseVector)
+TYPED_TEST(Ell, MixedAppliesToDenseVector1)
 {
-    using MixedVec = typename TestFixture::MixedVec;
-    auto x = gko::initialize<MixedVec>({2.0, 1.0, 4.0}, this->exec);
-    auto y = MixedVec::create(this->exec, gko::dim<2>{2, 1});
+    // Both vectors have the same value type which differs from the matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec = typename gko::matrix::Dense<next_T>;
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
+
+    this->mtx1->apply(x.get(), y.get());
+
+    GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0);
+}
+
+
+TYPED_TEST(Ell, MixedAppliesToDenseVector2)
+{
+    // Input vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    auto x = gko::initialize<Vec1>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec2::create(this->exec, gko::dim<2>{2, 1});
+
+    this->mtx1->apply(x.get(), y.get());
+
+    GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0);
+}
+
+
+TYPED_TEST(Ell, MixedAppliesToDenseVector3)
+{
+    // Output vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<gko::next_precision<T>>;
+    auto x = gko::initialize<Vec2>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec1::create(this->exec, gko::dim<2>{2, 1});
 
     this->mtx1->apply(x.get(), y.get());
 
@@ -154,6 +189,80 @@ TYPED_TEST(Ell, AppliesToDenseMatrix)
 }
 
 
+TYPED_TEST(Ell, MixedAppliesToDenseMatrix1)
+{
+    // Both vectors have the same value type which differs from the matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec = gko::matrix::Dense<next_T>;
+    // clang-format off
+    auto x = gko::initialize<Vec>(
+        {I<next_T>{2.0, 3.0},
+         I<next_T>{1.0, -1.5},
+         I<next_T>{4.0, 2.5}}, this->exec);
+    // clang-format on
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
+
+    this->mtx1->apply(x.get(), y.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y,
+                        l({{13.0,  3.5},
+                           { 5.0, -7.5}}), 0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Ell, MixedAppliesToDenseMatrix2)
+{
+    // Input vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    // clang-format off
+    auto x = gko::initialize<Vec1>(
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
+    // clang-format on
+    auto y = Vec2::create(this->exec, gko::dim<2>{2});
+
+    this->mtx1->apply(x.get(), y.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y,
+                        l({{13.0,  3.5},
+                           { 5.0, -7.5}}), 0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Ell, MixedAppliesToDenseMatrix3)
+{
+    // Output vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    // clang-format off
+    auto x = gko::initialize<Vec2>(
+        {I<next_T>{2.0, 3.0},
+         I<next_T>{1.0, -1.5},
+         I<next_T>{4.0, 2.5}}, this->exec);
+    // clang-format on
+    auto y = Vec1::create(this->exec, gko::dim<2>{2});
+
+    this->mtx1->apply(x.get(), y.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y,
+                        l({{13.0,  3.5},
+                           { 5.0, -7.5}}), 0.0);
+    // clang-format on
+}
+
+
 TYPED_TEST(Ell, AppliesLinearCombinationToDenseVector)
 {
     using Vec = typename TestFixture::Vec;
@@ -168,13 +277,52 @@ TYPED_TEST(Ell, AppliesLinearCombinationToDenseVector)
 }
 
 
-TYPED_TEST(Ell, AppliesLinearCombinationToMixedDenseVector)
+TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector1)
+{
+    // Both vectors have the same value type which differs from the matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec = gko::matrix::Dense<next_T>;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({1.0, 2.0}, this->exec);
+
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+
+    GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0);
+}
+
+
+TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector2)
 {
-    using MixedVec = typename TestFixture::MixedVec;
-    auto alpha = gko::initialize<MixedVec>({-1.0}, this->exec);
-    auto beta = gko::initialize<MixedVec>({2.0}, this->exec);
-    auto x = gko::initialize<MixedVec>({2.0, 1.0, 4.0}, this->exec);
-    auto y = gko::initialize<MixedVec>({1.0, 2.0}, this->exec);
+    // Input vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    auto alpha = gko::initialize<Vec1>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec2>({2.0}, this->exec);
+    auto x = gko::initialize<Vec1>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec2>({1.0, 2.0}, this->exec);
+
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+
+    GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0);
+}
+
+
+TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector3)
+{
+    // Output vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    auto alpha = gko::initialize<Vec2>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec1>({2.0}, this->exec);
+    auto x = gko::initialize<Vec2>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec1>({1.0, 2.0}, this->exec);
 
     this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
 
@@ -208,6 +356,92 @@ TYPED_TEST(Ell, AppliesLinearCombinationToDenseMatrix)
 }
 
 
+TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix1)
+{
+    // Both vectors have the same value type which differs from the matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec = gko::matrix::Dense<next_T>;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    // clang-format off
+    auto x = gko::initialize<Vec>(
+        {I<next_T>{2.0, 3.0},
+         I<next_T>{1.0, -1.5},
+         I<next_T>{4.0, 2.5}}, this->exec);
+    auto y = gko::initialize<Vec>(
+        {I<next_T>{1.0, 0.5},
+         I<next_T>{2.0, -1.5}}, this->exec);
+    // clang-format on
+
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y,
+                        l({{-11.0, -2.5},
+                           { -1.0,  4.5}}), 0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix2)
+{
+    // Input vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    auto alpha = gko::initialize<Vec1>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec2>({2.0}, this->exec);
+    // clang-format off
+    auto x = gko::initialize<Vec1>(
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
+    auto y = gko::initialize<Vec2>(
+        {I<next_T>{1.0, 0.5},
+         I<next_T>{2.0, -1.5}}, this->exec);
+    // clang-format on
+
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y,
+                        l({{-11.0, -2.5},
+                           { -1.0,  4.5}}), 0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix3)
+{
+    // Output vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    auto alpha = gko::initialize<Vec2>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec1>({2.0}, this->exec);
+    // clang-format off
+    auto x = gko::initialize<Vec2>(
+        {I<next_T>{2.0, 3.0},
+         I<next_T>{1.0, -1.5},
+         I<next_T>{4.0, 2.5}}, this->exec);
+    auto y = gko::initialize<Vec1>(
+        {I<T>{1.0, 0.5},
+         I<T>{2.0, -1.5}}, this->exec);
+    // clang-format on
+
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y,
+                        l({{-11.0, -2.5},
+                           { -1.0,  4.5}}), 0.0);
+    // clang-format on
+}
+
+
 TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension)
 {
     using Vec = typename TestFixture::Vec;

From 420384c9574c04505ff8944796a9377576def567 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <goebel.fritz@googlemail.com>
Date: Mon, 15 Mar 2021 12:27:24 +0100
Subject: [PATCH 04/14] adjust tolerances for lp value types

---
 cuda/test/matrix/ell_kernels.cpp    | 24 ++++++++++++------------
 hip/test/matrix/ell_kernels.hip.cpp | 24 ++++++++++++------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp
index e24f7083728..0df427f139e 100644
--- a/cuda/test/matrix/ell_kernels.cpp
+++ b/cuda/test/matrix/ell_kernels.cpp
@@ -167,7 +167,7 @@ TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1)
     mtx->apply(y2.get(), expected2.get());
     dmtx->apply(dy2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -189,7 +189,7 @@ TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3)
     mtx->apply(y.get(), expected2.get());
     dmtx->apply(dy.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -211,7 +211,7 @@ TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1)
     mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -233,7 +233,7 @@ TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3)
     mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -255,7 +255,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1)
     mtx->apply(y2.get(), expected2.get());
     dmtx->apply(dy2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -277,7 +277,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3)
     mtx->apply(y.get(), expected2.get());
     dmtx->apply(dy.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -298,7 +298,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1)
     mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -320,7 +320,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3)
     mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -342,7 +342,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1)
     mtx->apply(y2.get(), expected2.get());
     dmtx->apply(dy2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -364,7 +364,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3)
     mtx->apply(y.get(), expected2.get());
     dmtx->apply(dy.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -386,7 +386,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1)
     mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -408,7 +408,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3)
     mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
diff --git a/hip/test/matrix/ell_kernels.hip.cpp b/hip/test/matrix/ell_kernels.hip.cpp
index 77253125abe..9d1e1440544 100644
--- a/hip/test/matrix/ell_kernels.hip.cpp
+++ b/hip/test/matrix/ell_kernels.hip.cpp
@@ -168,7 +168,7 @@ TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1)
     mtx->apply(y2.get(), expected2.get());
     dmtx->apply(dy2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -190,7 +190,7 @@ TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3)
     mtx->apply(y.get(), expected2.get());
     dmtx->apply(dy.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -212,7 +212,7 @@ TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1)
     mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -234,7 +234,7 @@ TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3)
     mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -256,7 +256,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1)
     mtx->apply(y2.get(), expected2.get());
     dmtx->apply(dy2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -278,7 +278,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3)
     mtx->apply(y.get(), expected2.get());
     dmtx->apply(dy.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -299,7 +299,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1)
     mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -321,7 +321,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3)
     mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -343,7 +343,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1)
     mtx->apply(y2.get(), expected2.get());
     dmtx->apply(dy2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -365,7 +365,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3)
     mtx->apply(y.get(), expected2.get());
     dmtx->apply(dy.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -387,7 +387,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1)
     mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 
@@ -409,7 +409,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3)
     mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
 
-    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
 }
 
 

From b2e10a77f606f71eb2804a0b51aa305521905bc5 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <goebel.fritz@googlemail.com>
Date: Mon, 15 Mar 2021 13:43:42 +0100
Subject: [PATCH 05/14] use size_type for accessor dimensions

---
 cuda/matrix/ell_kernels.cu       | 7 +++----
 hip/matrix/ell_kernels.hip.cpp   | 7 +++----
 omp/matrix/ell_kernels.cpp       | 8 ++++----
 reference/matrix/ell_kernels.cpp | 8 ++++----
 4 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 33db65c4c1c..2dc669ba5a7 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -148,11 +148,10 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
                          b->get_size()[1], 1);
 
     const auto a_vals = gko::acc::range<a_accessor>(
-        std::array<long unsigned int, 1>{
-            {num_stored_elements_per_row * stride}},
+        std::array<size_type, 1>{{num_stored_elements_per_row * stride}},
         a->get_const_values());
     const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<long unsigned int, 2>{{nrows, b->get_stride()}},
+        std::array<size_type, 2>{{nrows, b->get_stride()}},
         b->get_const_values());
 
     if (alpha == nullptr && beta == nullptr) {
@@ -164,7 +163,7 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
                 as_cuda_type(c->get_values()), c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
         const auto alpha_val = gko::acc::range<a_accessor>(
-            std::array<long unsigned int, 1>{1}, alpha->get_const_values());
+            std::array<size_type, 1>{1}, alpha->get_const_values());
         kernel::spmv<num_thread_per_worker, atomic>
             <<<grid_size, block_size, 0, 0>>>(
                 nrows, num_worker_per_row, as_cuda_accessor(alpha_val),
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index 8bcadd02370..24966a64113 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -152,11 +152,10 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
                          b->get_size()[1], 1);
 
     const auto a_vals = gko::acc::range<a_accessor>(
-        std::array<long unsigned int, 1>{
-            {num_stored_elements_per_row * stride}},
+        std::array<size_type, 1>{{num_stored_elements_per_row * stride}},
         a->get_const_values());
     const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<long unsigned int, 2>{{nrows, b->get_stride()}},
+        std::array<size_type, 2>{{nrows, b->get_stride()}},
         b->get_const_values());
 
     if (alpha == nullptr && beta == nullptr) {
@@ -168,7 +167,7 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
             b->get_stride(), as_hip_type(c->get_values()), c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
         const auto alpha_val = gko::acc::range<a_accessor>(
-            std::array<long unsigned int, 1>{1}, alpha->get_const_values());
+            std::array<size_type, 1>{1}, alpha->get_const_values());
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(kernel::spmv<num_thread_per_worker, atomic>),
             dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row,
diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp
index 76e7eea80ed..040176053d6 100644
--- a/omp/matrix/ell_kernels.cpp
+++ b/omp/matrix/ell_kernels.cpp
@@ -74,10 +74,10 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     const auto stride = a->get_stride();
     const auto num_rows = b->get_size()[0];
     const auto a_vals = gko::acc::range<a_accessor>(
-        std::array<long unsigned int, 1>{num_stored_elements_per_row * stride},
+        std::array<size_type, 1>{num_stored_elements_per_row * stride},
         a->get_const_values());
     const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<long unsigned int, 2>{num_rows, b->get_stride()},
+        std::array<size_type, 2>{num_rows, b->get_stride()},
         b->get_const_values());
 
 #pragma omp parallel for
@@ -118,10 +118,10 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     const auto stride = a->get_stride();
     const auto num_rows = b->get_size()[0];
     const auto a_vals = gko::acc::range<a_accessor>(
-        std::array<long unsigned int, 1>{num_stored_elements_per_row * stride},
+        std::array<size_type, 1>{num_stored_elements_per_row * stride},
         a->get_const_values());
     const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<long unsigned int, 2>{num_rows, b->get_stride()},
+        std::array<size_type, 2>{num_rows, b->get_stride()},
         b->get_const_values());
     const auto alpha_val = OutputValueType(alpha->at(0, 0));
     const auto beta_val = beta->at(0, 0);
diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp
index ba6d687d18a..8b0f541799a 100644
--- a/reference/matrix/ell_kernels.cpp
+++ b/reference/matrix/ell_kernels.cpp
@@ -70,10 +70,10 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
     const auto stride = a->get_stride();
     const auto num_rows = b->get_size()[0];
     const auto a_vals = gko::acc::range<a_accessor>(
-        std::array<long unsigned int, 1>{num_stored_elements_per_row * stride},
+        std::array<size_type, 1>{num_stored_elements_per_row * stride},
         a->get_const_values());
     const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<long unsigned int, 2>{num_rows, b->get_stride()},
+        std::array<size_type, 2>{num_rows, b->get_stride()},
         b->get_const_values());
 
     for (size_type row = 0; row < a->get_size()[0]; row++) {
@@ -113,10 +113,10 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
     const auto stride = a->get_stride();
     const auto num_rows = b->get_size()[0];
     const auto a_vals = gko::acc::range<a_accessor>(
-        std::array<long unsigned int, 1>{num_stored_elements_per_row * stride},
+        std::array<size_type, 1>{num_stored_elements_per_row * stride},
         a->get_const_values());
     const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<long unsigned int, 2>{num_rows, b->get_stride()},
+        std::array<size_type, 2>{num_rows, b->get_stride()},
         b->get_const_values());
     const auto alpha_val = OutputValueType(alpha->at(0, 0));
     const auto beta_val = beta->at(0, 0);

From d0a4e44d5ad45793e303f033d9864b1bd4d131d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= <thomas.gruetzmacher@kit.edu>
Date: Wed, 17 Mar 2021 11:29:28 +0100
Subject: [PATCH 06/14] Fix HIP and CUDA ELL SpMV Debug error

---
 cuda/matrix/ell_kernels.cu     | 4 ++--
 hip/matrix/ell_kernels.hip.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 2dc669ba5a7..57617269ed0 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -151,8 +151,8 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
         std::array<size_type, 1>{{num_stored_elements_per_row * stride}},
         a->get_const_values());
     const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<size_type, 2>{{nrows, b->get_stride()}},
-        b->get_const_values());
+        std::array<size_type, 2>{{b->get_size()[0], b->get_size()[1]}},
+        b->get_const_values(), std::array<size_type, 1>{{b->get_stride()}});
 
     if (alpha == nullptr && beta == nullptr) {
         kernel::spmv<num_thread_per_worker, atomic>
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index 24966a64113..f1972635385 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -155,8 +155,8 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
         std::array<size_type, 1>{{num_stored_elements_per_row * stride}},
         a->get_const_values());
     const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<size_type, 2>{{nrows, b->get_stride()}},
-        b->get_const_values());
+        std::array<size_type, 2>{{b->get_size()[0], b->get_size()[1]}},
+        b->get_const_values(), std::array<size_type, 1>{{b->get_stride()}});
 
     if (alpha == nullptr && beta == nullptr) {
         hipLaunchKernelGGL(

From 319491cebabf8346248ffc9156667a94d9893d0c Mon Sep 17 00:00:00 2001
From: Fritz Goebel <goebel.fritz@googlemail.com>
Date: Thu, 18 Mar 2021 11:07:22 +0100
Subject: [PATCH 07/14] Code Review

---
 common/matrix/ell_kernels.hpp.inc | 18 +++++++++---------
 cuda/matrix/ell_kernels.cu        |  6 +++---
 dpcpp/matrix/ell_kernels.dp.cpp   | 27 +++++++++++++++------------
 hip/matrix/ell_kernels.hip.cpp    |  7 +++----
 omp/matrix/ell_kernels.cpp        |  8 ++++----
 reference/matrix/ell_kernels.cpp  |  8 ++++----
 6 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc
index 79ed4ed979b..2323d512258 100644
--- a/common/matrix/ell_kernels.hpp.inc
+++ b/common/matrix/ell_kernels.hpp.inc
@@ -41,8 +41,8 @@ __device__ void spmv_kernel(
     const size_type num_rows, const int num_worker_per_row,
     acc::range<a_accessor> val, const IndexType *__restrict__ col,
     const size_type stride, const size_type num_stored_elements_per_row,
-    acc::range<b_accessor> b, const size_type b_stride,
-    OutputValueType *__restrict__ c, const size_type c_stride, Closure op)
+    acc::range<b_accessor> b, OutputValueType *__restrict__ c,
+    const size_type c_stride, Closure op)
 {
     const auto tidx = thread::get_thread_id_flat();
     const decltype(tidx) column_id = blockIdx.y;
@@ -109,12 +109,12 @@ __global__ __launch_bounds__(default_block_size) void spmv(
     const size_type num_rows, const int num_worker_per_row,
     acc::range<a_accessor> val, const IndexType *__restrict__ col,
     const size_type stride, const size_type num_stored_elements_per_row,
-    acc::range<b_accessor> b, const size_type b_stride,
-    OutputValueType *__restrict__ c, const size_type c_stride)
+    acc::range<b_accessor> b, OutputValueType *__restrict__ c,
+    const size_type c_stride)
 {
     spmv_kernel<num_thread_per_worker, atomic>(
         num_rows, num_worker_per_row, val, col, stride,
-        num_stored_elements_per_row, b, b_stride, c, c_stride,
+        num_stored_elements_per_row, b, c, c_stride,
         [](const OutputValueType &x, const OutputValueType &y) { return x; });
 }
 
@@ -126,8 +126,8 @@ __global__ __launch_bounds__(default_block_size) void spmv(
     acc::range<a_accessor> alpha, acc::range<a_accessor> val,
     const IndexType *__restrict__ col, const size_type stride,
     const size_type num_stored_elements_per_row, acc::range<b_accessor> b,
-    const size_type b_stride, const OutputValueType *__restrict__ beta,
-    OutputValueType *__restrict__ c, const size_type c_stride)
+    const OutputValueType *__restrict__ beta, OutputValueType *__restrict__ c,
+    const size_type c_stride)
 {
     const OutputValueType alpha_val = alpha(0);
     const OutputValueType beta_val = beta[0];
@@ -138,14 +138,14 @@ __global__ __launch_bounds__(default_block_size) void spmv(
     if (atomic) {
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
-            num_stored_elements_per_row, b, b_stride, c, c_stride,
+            num_stored_elements_per_row, b, c, c_stride,
             [&alpha_val](const OutputValueType &x, const OutputValueType &y) {
                 return alpha_val * x;
             });
     } else {
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
-            num_stored_elements_per_row, b, b_stride, c, c_stride,
+            num_stored_elements_per_row, b, c, c_stride,
             [&alpha_val, &beta_val](const OutputValueType &x,
                                     const OutputValueType &y) {
                 return alpha_val * x + beta_val * y;
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 57617269ed0..8808a502ebb 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -159,8 +159,8 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
             <<<grid_size, block_size, 0, 0>>>(
                 nrows, num_worker_per_row, as_cuda_accessor(a_vals),
                 a->get_const_col_idxs(), stride, num_stored_elements_per_row,
-                as_cuda_accessor(b_vals), b->get_stride(),
-                as_cuda_type(c->get_values()), c->get_stride());
+                as_cuda_accessor(b_vals), as_cuda_type(c->get_values()),
+                c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
         const auto alpha_val = gko::acc::range<a_accessor>(
             std::array<size_type, 1>{1}, alpha->get_const_values());
@@ -169,7 +169,7 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
                 nrows, num_worker_per_row, as_cuda_accessor(alpha_val),
                 as_cuda_accessor(a_vals), a->get_const_col_idxs(), stride,
                 num_stored_elements_per_row, as_cuda_accessor(b_vals),
-                b->get_stride(), as_cuda_type(beta->get_const_values()),
+                as_cuda_type(beta->get_const_values()),
                 as_cuda_type(c->get_values()), c->get_stride());
     } else {
         GKO_KERNEL_NOT_FOUND;
diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp
index 01a40b5d9b5..b4dfeafeb54 100644
--- a/dpcpp/matrix/ell_kernels.dp.cpp
+++ b/dpcpp/matrix/ell_kernels.dp.cpp
@@ -56,24 +56,27 @@ namespace dpcpp {
 namespace ell {
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void spmv(std::shared_ptr<const DpcppExecutor> exec,
-          const matrix::Ell<ValueType, IndexType> *a,
-          const matrix::Dense<ValueType> *b,
-          matrix::Dense<ValueType> *c) GKO_NOT_IMPLEMENTED;
+          const matrix::Ell<MatrixValueType, IndexType> *a,
+          const matrix::Dense<InputValueType> *b,
+          matrix::Dense<OutputValueType> *c) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
-                   const matrix::Dense<ValueType> *alpha,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   const matrix::Dense<ValueType> *beta,
-                   matrix::Dense<ValueType> *c) GKO_NOT_IMPLEMENTED;
+                   const matrix::Dense<MatrixValueType> *alpha,
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   const matrix::Dense<OutputValueType> *beta,
+                   matrix::Dense<OutputValueType> *c) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index f1972635385..fc10fc7d0e1 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -164,7 +164,7 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
             dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row,
             as_hip_accessor(a_vals), a->get_const_col_idxs(), stride,
             num_stored_elements_per_row, as_hip_accessor(b_vals),
-            b->get_stride(), as_hip_type(c->get_values()), c->get_stride());
+            as_hip_type(c->get_values()), c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
         const auto alpha_val = gko::acc::range<a_accessor>(
             std::array<size_type, 1>{1}, alpha->get_const_values());
@@ -173,9 +173,8 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
             dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row,
             as_hip_accessor(alpha_val), as_hip_accessor(a_vals),
             a->get_const_col_idxs(), stride, num_stored_elements_per_row,
-            as_hip_accessor(b_vals), b->get_stride(),
-            as_hip_type(beta->get_const_values()), as_hip_type(c->get_values()),
-            c->get_stride());
+            as_hip_accessor(b_vals), as_hip_type(beta->get_const_values()),
+            as_hip_type(c->get_values()), c->get_stride());
     } else {
         GKO_KERNEL_NOT_FOUND;
     }
diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp
index 040176053d6..db7fe3cbc0d 100644
--- a/omp/matrix/ell_kernels.cpp
+++ b/omp/matrix/ell_kernels.cpp
@@ -77,8 +77,8 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
         std::array<size_type, 1>{num_stored_elements_per_row * stride},
         a->get_const_values());
     const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<size_type, 2>{num_rows, b->get_stride()},
-        b->get_const_values());
+        std::array<size_type, 2>{{b->get_size()[0], b->get_size()[1]}},
+        b->get_const_values(), std::array<size_type, 1>{{b->get_stride()}});
 
 #pragma omp parallel for
     for (size_type row = 0; row < a->get_size()[0]; row++) {
@@ -121,8 +121,8 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
         std::array<size_type, 1>{num_stored_elements_per_row * stride},
         a->get_const_values());
     const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<size_type, 2>{num_rows, b->get_stride()},
-        b->get_const_values());
+        std::array<size_type, 2>{{b->get_size()[0], b->get_size()[1]}},
+        b->get_const_values(), std::array<size_type, 1>{{b->get_stride()}});
     const auto alpha_val = OutputValueType(alpha->at(0, 0));
     const auto beta_val = beta->at(0, 0);
 
diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp
index 8b0f541799a..58df834bfba 100644
--- a/reference/matrix/ell_kernels.cpp
+++ b/reference/matrix/ell_kernels.cpp
@@ -73,8 +73,8 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
         std::array<size_type, 1>{num_stored_elements_per_row * stride},
         a->get_const_values());
     const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<size_type, 2>{num_rows, b->get_stride()},
-        b->get_const_values());
+        std::array<size_type, 2>{{b->get_size()[0], b->get_size()[1]}},
+        b->get_const_values(), std::array<size_type, 1>{{b->get_stride()}});
 
     for (size_type row = 0; row < a->get_size()[0]; row++) {
         for (size_type j = 0; j < c->get_size()[1]; j++) {
@@ -116,8 +116,8 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
         std::array<size_type, 1>{num_stored_elements_per_row * stride},
         a->get_const_values());
     const auto b_vals = gko::acc::range<b_accessor>(
-        std::array<size_type, 2>{num_rows, b->get_stride()},
-        b->get_const_values());
+        std::array<size_type, 2>{{b->get_size()[0], b->get_size()[1]}},
+        b->get_const_values(), std::array<size_type, 1>{{b->get_stride()}});
     const auto alpha_val = OutputValueType(alpha->at(0, 0));
     const auto beta_val = beta->at(0, 0);
 

From 07bf3be16d26de25a66051bd82070bd89c4743f9 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 15 Apr 2021 13:44:43 +0200
Subject: [PATCH 08/14] add GINKGO_MIXED_PRECISION flag and dispatch

---
 .gitlab-ci.yml                                |  41 +++++++
 CMakeLists.txt                                |   1 +
 INSTALL.md                                    |   4 +
 cmake/get_info.cmake                          |   2 +-
 core/base/mixed_precision_types.hpp           | 108 ++++++++++++++++++
 core/device_hooks/common_kernels.inc.cpp      |   1 +
 core/matrix/ell.cpp                           |  13 ++-
 cuda/matrix/ell_kernels.cu                    |   1 +
 dpcpp/matrix/ell_kernels.dp.cpp               |   1 +
 hip/matrix/ell_kernels.hip.cpp                |   1 +
 include/ginkgo/config.hpp.in                  |   4 +
 .../ginkgo/core/base/precision_dispatch.hpp   |  94 +++++++++++++++
 include/ginkgo/core/base/types.hpp            |  51 ---------
 omp/matrix/ell_kernels.cpp                    |   1 +
 reference/matrix/ell_kernels.cpp              |   1 +
 15 files changed, 267 insertions(+), 57 deletions(-)
 create mode 100644 core/base/mixed_precision_types.hpp

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index bbe85b1fbe8..11e66475301 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -32,6 +32,7 @@ include:
   BUILD_HWLOC: "ON"
   FAST_TESTS: "OFF"
   DPCPP_SINGLE_MODE: "OFF"
+  MIXED_PRECISION: "ON"
   RUN_EXAMPLES: "OFF"
   CONFIG_LOG: "ON"
   CXX_FLAGS: ""
@@ -77,6 +78,7 @@ include:
         -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
         -DGINKGO_FAST_TESTS=${FAST_TESTS}
+        -DGINKGO_MIXED_PRECISION=${MIXED_PRECISION}
         -DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES}
         -DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG}
         -DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE}
@@ -111,6 +113,7 @@ include:
         -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
         -DGINKGO_FAST_TESTS=${FAST_TESTS}
+        -DGINKGO_MIXED_PRECISION=${MIXED_PRECISION}
         -DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG}
         -DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE}
         -DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES}
@@ -681,6 +684,44 @@ build/nocuda/intel/omp/release/static:
     BUILD_TYPE: "Release"
     BUILD_SHARED_LIBS: "OFF"
 
+build/nocuda-nomixed/gcc/omp/release/shared:
+  <<: *default_build_with_test
+  extends:
+    - .quick_test_condition
+    - .use_gko-nocuda-gnu9-llvm8
+  variables:
+    <<: *default_variables
+    BUILD_OMP: "ON"
+    BUILD_TYPE: "Release"
+    MIXED_PRECISION: "OFF"
+
+build/nocuda-nomixed/clang/omp/debug/static:
+  <<: *default_build_with_test
+  extends:
+    - .full_test_condition
+    - .use_gko-nocuda-gnu9-llvm8
+  variables:
+    <<: *default_variables
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
+    BUILD_OMP: "ON"
+    BUILD_TYPE: "Debug"
+    BUILD_SHARED_LIBS: "OFF"
+    MIXED_PRECISION: "OFF"
+
+build/nocuda-nomixed/intel/omp/release/static:
+  <<: *default_build_with_test
+  extends:
+    - .full_test_condition
+    - .use_gko-nocuda-gnu9-llvm8-intel
+  variables:
+    <<: *default_variables
+    C_COMPILER: "icc"
+    CXX_COMPILER: "icpc"
+    BUILD_OMP: "ON"
+    BUILD_TYPE: "Release"
+    BUILD_SHARED_LIBS: "OFF"
+    MIXED_PRECISION: "OFF"
 
 build/dpcpp/cpu/release/static:
   <<: *default_build_with_test
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1318a6900d4..f0d68b51a34 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,6 +31,7 @@ option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA})
 option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP})
 option(GINKGO_BUILD_DOC "Generate documentation" OFF)
 option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
+option(GINKGO_MIXED_PRECISION "Add true mixed-precision kernel (otherwise they will be conversion-based)" OFF)
 option(GINKGO_SKIP_DEPENDENCY_UPDATE
     "Do not update dependencies each time the project is rebuilt" ON)
 option(GINKGO_EXPORT_BUILD_DIR
diff --git a/INSTALL.md b/INSTALL.md
index d85bcc48329..4fcc60d9ae4 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -18,6 +18,10 @@ Ginkgo adds the following additional switches to control what is being built:
 *   `-DGINKGO_DEVEL_TOOLS={ON, OFF}` sets up the build system for development
     (requires clang-format, will also download git-cmake-format),
     default is `OFF`.
+*   `-DGINKGO_MIXED_PRECISION={ON, OFF}` compiles true mixed-precision kernels
+    instead of converting data on the fly, default is `OFF`.
+    Enabling this flag increases the library size, but improves performance of
+    mixed-precision kernels.
 *   `-DGINKGO_BUILD_TESTS={ON, OFF}` builds Ginkgo's tests
     (will download googletest), default is `ON`.
 *   `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests
diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake
index 835bdf0c45b..508bb6e0191 100644
--- a/cmake/get_info.cmake
+++ b/cmake/get_info.cmake
@@ -127,7 +127,7 @@ foreach(log_type ${log_types})
     ginkgo_print_module_footer(${${log_type}} "User configuration:")
     ginkgo_print_module_footer(${${log_type}} "  Enabled modules:")
     ginkgo_print_foreach_variable(${${log_type}}
-        "GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP")
+        "GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP;GINKGO_MIXED_PRECISION")
     ginkgo_print_module_footer(${${log_type}} "  Tests, benchmarks and examples:")
     ginkgo_print_foreach_variable(${${log_type}}
         "GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING")
diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp
new file mode 100644
index 00000000000..079fcb8a075
--- /dev/null
+++ b/core/base/mixed_precision_types.hpp
@@ -0,0 +1,108 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_
+#define GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+#ifdef GINKGO_MIXED_PRECISION
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \
+    template _macro(float, float, float, int32);                    \
+    template _macro(float, float, double, int32);                   \
+    template _macro(float, double, float, int32);                   \
+    template _macro(float, double, double, int32);                  \
+    template _macro(double, float, float, int32);                   \
+    template _macro(double, float, double, int32);                  \
+    template _macro(double, double, float, int32);                  \
+    template _macro(double, double, double, int32);                 \
+    template _macro(std::complex<float>, std::complex<float>,       \
+                    std::complex<float>, int32);                    \
+    template _macro(std::complex<float>, std::complex<float>,       \
+                    std::complex<double>, int32);                   \
+    template _macro(std::complex<float>, std::complex<double>,      \
+                    std::complex<float>, int32);                    \
+    template _macro(std::complex<float>, std::complex<double>,      \
+                    std::complex<double>, int32);                   \
+    template _macro(std::complex<double>, std::complex<float>,      \
+                    std::complex<float>, int32);                    \
+    template _macro(std::complex<double>, std::complex<float>,      \
+                    std::complex<double>, int32);                   \
+    template _macro(std::complex<double>, std::complex<double>,     \
+                    std::complex<float>, int32);                    \
+    template _macro(std::complex<double>, std::complex<double>,     \
+                    std::complex<double>, int32);                   \
+    template _macro(float, float, float, int64);                    \
+    template _macro(float, float, double, int64);                   \
+    template _macro(float, double, float, int64);                   \
+    template _macro(float, double, double, int64);                  \
+    template _macro(double, float, float, int64);                   \
+    template _macro(double, float, double, int64);                  \
+    template _macro(double, double, float, int64);                  \
+    template _macro(double, double, double, int64);                 \
+    template _macro(std::complex<float>, std::complex<float>,       \
+                    std::complex<float>, int64);                    \
+    template _macro(std::complex<float>, std::complex<float>,       \
+                    std::complex<double>, int64);                   \
+    template _macro(std::complex<float>, std::complex<double>,      \
+                    std::complex<float>, int64);                    \
+    template _macro(std::complex<float>, std::complex<double>,      \
+                    std::complex<double>, int64);                   \
+    template _macro(std::complex<double>, std::complex<float>,      \
+                    std::complex<float>, int64);                    \
+    template _macro(std::complex<double>, std::complex<float>,      \
+                    std::complex<double>, int64);                   \
+    template _macro(std::complex<double>, std::complex<double>,     \
+                    std::complex<float>, int64);                    \
+    template _macro(std::complex<double>, std::complex<double>,     \
+                    std::complex<double>, int64)
+#else
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \
+    template _macro(float, float, float, int32);                    \
+    template _macro(double, double, double, int32);                 \
+    template _macro(std::complex<float>, std::complex<float>,       \
+                    std::complex<float>, int32);                    \
+    template _macro(std::complex<double>, std::complex<double>,     \
+                    std::complex<double>, int32);                   \
+    template _macro(float, float, float, int64);                    \
+    template _macro(double, double, double, int64);                 \
+    template _macro(std::complex<float>, std::complex<float>,       \
+                    std::complex<float>, int64);                    \
+    template _macro(std::complex<double>, std::complex<double>,     \
+                    std::complex<double>, int64)
+#endif
+
+
+#endif  // GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 42121074df4..810a3fab4e4 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "core/base/mixed_precision_types.hpp"
 #include "core/components/absolute_array.hpp"
 #include "core/components/fill_array.hpp"
 #include "core/components/precision_conversion.hpp"
diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
index 0dc9a950f75..fcb198106cd 100644
--- a/core/matrix/ell.cpp
+++ b/core/matrix/ell.cpp
@@ -102,7 +102,7 @@ size_type calculate_max_nnz_per_row(
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::apply_impl(const LinOp *b, LinOp *x) const
 {
-    precision_dispatch_real_complex<ValueType>(
+    mixed_precision_dispatch_real_complex<ValueType>(
         [this](auto dense_b, auto dense_x) {
             this->get_executor()->run(ell::make_spmv(this, dense_b, dense_x));
         },
@@ -114,12 +114,15 @@ template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::apply_impl(const LinOp *alpha, const LinOp *b,
                                            const LinOp *beta, LinOp *x) const
 {
-    precision_dispatch_real_complex<ValueType>(
-        [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
+    mixed_precision_dispatch_real_complex<ValueType>(
+        [this, alpha, beta](auto dense_b, auto dense_x) {
+            auto dense_alpha = make_temporary_conversion<ValueType>(alpha);
+            auto dense_beta = make_temporary_conversion<
+                typename std::decay_t<decltype(*dense_x)>::value_type>(beta);
             this->get_executor()->run(ell::make_advanced_spmv(
-                dense_alpha, this, dense_b, dense_beta, dense_x));
+                dense_alpha.get(), this, dense_b, dense_beta.get(), dense_x));
         },
-        alpha, b, beta, x);
+        b, x);
 }
 
 
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 8808a502ebb..953ce4be8d7 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "accessor/reduced_row_major.hpp"
+#include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array.hpp"
 #include "core/components/prefix_sum.hpp"
 #include "core/matrix/dense_kernels.hpp"
diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp
index b4dfeafeb54..3c1d93eb323 100644
--- a/dpcpp/matrix/ell_kernels.dp.cpp
+++ b/dpcpp/matrix/ell_kernels.dp.cpp
@@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/base/mixed_precision_types.hpp"
 #include "dpcpp/components/format_conversion.dp.hpp"
 
 
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index fc10fc7d0e1..b1ad7bf4662 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -47,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "accessor/reduced_row_major.hpp"
+#include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array.hpp"
 #include "core/components/prefix_sum.hpp"
 #include "core/matrix/dense_kernels.hpp"
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index edef91e6d9e..1c6a31ea481 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -63,6 +63,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #cmakedefine GINKGO_BENCHMARK_ENABLE_TUNING
 
 
+/* Should we compile mixed-precision kernels for Ginkgo? */
+#cmakedefine GINKGO_MIXED_PRECISION
+
+
 /* What is HIP compiled for, hcc or nvcc? */
 // clang-format off
 #define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@
diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp
index 5588d566b41..17901e207c1 100644
--- a/include/ginkgo/core/base/precision_dispatch.hpp
+++ b/include/ginkgo/core/base/precision_dispatch.hpp
@@ -226,6 +226,100 @@ void precision_dispatch_real_complex(Function fn, const LinOp *alpha,
 }
 
 
+/**
+ * Calls the given function with each given argument LinOp
+ * converted into matrix::Dense<ValueType> as parameters.
+ *
+ * If GINKGO_MIXED_PRECISION is defined, this means that the function will be
+ * called with its dynamic type with its static type, so the (templated/generic)
+ * function will be instantiated with all pairs of Dense<ValueType> and
+ * Dense<next_precision<ValueType>> parameter types, and the appropriate
+ * overload will be called based on the dynamic type of the parameter.
+ *
+ * If GINKGO_MIXED_PRECISION is not defined, it will behave exactly like
+ * precision_dispatch.
+ *
+ *
+ * @param fn  the given function. It will be passed one (potentially const)
+ *            matrix::Dense<...>* parameter per parameter in the parameter
+ *            pack `linops`.
+ * @param linops  the given arguments to be converted and passed on to fn.
+ *
+ * @tparam ValueType  the value type to use for the parameters of `fn`.
+ * @tparam Function  the function pointer, lambda or other functor type to call
+ *                   with the converted arguments.
+ * @tparam Args  the argument type list.
+ */
+template <typename ValueType, typename Function>
+void mixed_precision_dispatch(Function fn, const LinOp *in, LinOp *out)
+{
+#ifdef GINKGO_MIXED_PRECISION
+    using fst_type = matrix::Dense<ValueType>;
+    using snd_type = matrix::Dense<next_precision<ValueType>>;
+    if (auto dense_in = dynamic_cast<const fst_type *>(in)) {
+        if (auto dense_out = dynamic_cast<fst_type *>(out)) {
+            fn(dense_in, dense_out);
+        } else if (auto dense_out = dynamic_cast<snd_type *>(out)) {
+            fn(dense_in, dense_out);
+        } else {
+            GKO_NOT_SUPPORTED(out);
+        }
+    } else if (auto dense_in = dynamic_cast<const snd_type *>(in)) {
+        if (auto dense_out = dynamic_cast<fst_type *>(out)) {
+            fn(dense_in, dense_out);
+        } else if (auto dense_out = dynamic_cast<snd_type *>(out)) {
+            fn(dense_in, dense_out);
+        } else {
+            GKO_NOT_SUPPORTED(out);
+        }
+    } else {
+        GKO_NOT_SUPPORTED(in);
+    }
+#else
+    precision_dispatch<ValueType>(fn, in, out);
+#endif
+}
+
+
+/**
+ * Calls the given function with the given LinOps cast to their dynamic type
+ * matrix::Dense<ValueType>* as parameters.
+ * If ValueType is real and both `in` and `out` are complex, uses
+ * matrix::Dense::get_real_view() to convert them into real matrices after
+ * precision conversion.
+ *
+ * @see mixed_precision_dispatch()
+ */
+template <typename ValueType, typename Function>
+void mixed_precision_dispatch_real_complex(Function fn, const LinOp *in,
+                                           LinOp *out)
+{
+#ifdef GINKGO_MIXED_PRECISION
+    // do we need to convert complex Dense to real Dense?
+    auto complex_to_real =
+        !(is_complex<ValueType>() ||
+          dynamic_cast<const ConvertibleTo<matrix::Dense<>> *>(in));
+    if (complex_to_real) {
+        mixed_precision_dispatch<to_complex<ValueType>>(
+            [&fn](auto dense_in, auto dense_out) {
+                using Dense = matrix::Dense<ValueType>;
+                // These dynamic_casts are only needed to make the code compile
+                // If ValueType is complex, this branch will never be taken
+                // If ValueType is real, the cast is a no-op
+                fn(dynamic_cast<const Dense *>(
+                       dense_in->create_real_view().get()),
+                   dynamic_cast<Dense *>(dense_out->create_real_view().get()));
+            },
+            in, out);
+    } else {
+        mixed_precision_dispatch<ValueType>(fn, in, out);
+    }
+#else
+    precision_dispatch_real_complex<ValueType>(fn, in, out);
+#endif
+}
+
+
 }  // namespace gko
 
 
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 2f6da1e9e42..2c77aaef826 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -524,57 +524,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
 #endif
 
 
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \
-    template _macro(float, float, float, int32);                    \
-    template _macro(float, float, double, int32);                   \
-    template _macro(float, double, float, int32);                   \
-    template _macro(float, double, double, int32);                  \
-    template _macro(double, float, float, int32);                   \
-    template _macro(double, float, double, int32);                  \
-    template _macro(double, double, float, int32);                  \
-    template _macro(double, double, double, int32);                 \
-    template _macro(std::complex<float>, std::complex<float>,       \
-                    std::complex<float>, int32);                    \
-    template _macro(std::complex<float>, std::complex<float>,       \
-                    std::complex<double>, int32);                   \
-    template _macro(std::complex<float>, std::complex<double>,      \
-                    std::complex<float>, int32);                    \
-    template _macro(std::complex<float>, std::complex<double>,      \
-                    std::complex<double>, int32);                   \
-    template _macro(std::complex<double>, std::complex<float>,      \
-                    std::complex<float>, int32);                    \
-    template _macro(std::complex<double>, std::complex<float>,      \
-                    std::complex<double>, int32);                   \
-    template _macro(std::complex<double>, std::complex<double>,     \
-                    std::complex<float>, int32);                    \
-    template _macro(std::complex<double>, std::complex<double>,     \
-                    std::complex<double>, int32);                   \
-    template _macro(float, float, float, int64);                    \
-    template _macro(float, float, double, int64);                   \
-    template _macro(float, double, float, int64);                   \
-    template _macro(float, double, double, int64);                  \
-    template _macro(double, float, float, int64);                   \
-    template _macro(double, float, double, int64);                  \
-    template _macro(double, double, float, int64);                  \
-    template _macro(double, double, double, int64);                 \
-    template _macro(std::complex<float>, std::complex<float>,       \
-                    std::complex<float>, int64);                    \
-    template _macro(std::complex<float>, std::complex<float>,       \
-                    std::complex<double>, int64);                   \
-    template _macro(std::complex<float>, std::complex<double>,      \
-                    std::complex<float>, int64);                    \
-    template _macro(std::complex<float>, std::complex<double>,      \
-                    std::complex<double>, int64);                   \
-    template _macro(std::complex<double>, std::complex<float>,      \
-                    std::complex<float>, int64);                    \
-    template _macro(std::complex<double>, std::complex<float>,      \
-                    std::complex<double>, int64);                   \
-    template _macro(std::complex<double>, std::complex<double>,     \
-                    std::complex<float>, int64);                    \
-    template _macro(std::complex<double>, std::complex<double>,     \
-                    std::complex<double>, int64)
-
-
 /**
  * Instantiates a template for each value type conversion pair compiled by
  * Ginkgo.
diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp
index db7fe3cbc0d..58ef6c3a72e 100644
--- a/omp/matrix/ell_kernels.cpp
+++ b/omp/matrix/ell_kernels.cpp
@@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "accessor/reduced_row_major.hpp"
+#include "core/base/mixed_precision_types.hpp"
 #include "omp/components/format_conversion.hpp"
 
 
diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp
index 58df834bfba..25de03cf709 100644
--- a/reference/matrix/ell_kernels.cpp
+++ b/reference/matrix/ell_kernels.cpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "accessor/reduced_row_major.hpp"
+#include "core/base/mixed_precision_types.hpp"
 
 
 namespace gko {

From 9618ef30ba9b04551c2960621a2a2aa0e483d66c Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 15 Apr 2021 16:42:53 +0200
Subject: [PATCH 09/14] fix mixed_precision_dispatch no-op casts

---
 include/ginkgo/core/base/precision_dispatch.hpp | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp
index 17901e207c1..4c2624863e5 100644
--- a/include/ginkgo/core/base/precision_dispatch.hpp
+++ b/include/ginkgo/core/base/precision_dispatch.hpp
@@ -302,13 +302,24 @@ void mixed_precision_dispatch_real_complex(Function fn, const LinOp *in,
     if (complex_to_real) {
         mixed_precision_dispatch<to_complex<ValueType>>(
             [&fn](auto dense_in, auto dense_out) {
-                using Dense = matrix::Dense<ValueType>;
+                using DenseIn = typename std::decay_t<decltype(*dense_in)>;
+                using DenseOut = typename std::decay_t<decltype(*dense_out)>;
+                using DenseDummy = matrix::Dense<ValueType>;
+                using RealIn = remove_complex<typename DenseIn::value_type>;
+                using RealOut = remove_complex<typename DenseOut::value_type>;
+                using CastIn =
+                    std::conditional_t<is_complex<ValueType>(), DenseDummy,
+                                       matrix::Dense<RealIn>>;
+                using CastOut =
+                    std::conditional_t<is_complex<ValueType>(), DenseDummy,
+                                       matrix::Dense<RealOut>>;
                 // These dynamic_casts are only needed to make the code compile
                 // If ValueType is complex, this branch will never be taken
                 // If ValueType is real, the cast is a no-op
-                fn(dynamic_cast<const Dense *>(
+                fn(dynamic_cast<const CastIn *>(
                        dense_in->create_real_view().get()),
-                   dynamic_cast<Dense *>(dense_out->create_real_view().get()));
+                   dynamic_cast<CastOut *>(
+                       dense_out->create_real_view().get()));
             },
             in, out);
     } else {

From cb0b181f674bdd3f5186b0ad7597a8dc0471dd04 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Sat, 17 Apr 2021 19:08:10 +0200
Subject: [PATCH 10/14] review updates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* formatting and documentation
* improve CMake configuration output
* remove unused variables
* extract test dimensions from ell_kernels tests
* use SFINAE instead of casting in precision_dispatch.hpp

Co-authored-by: Thomas Grützmacher <thomas.gruetzmacher@kit.edu>
Co-authored-by: Pratik Nayak <pratik.nayak@kit.edu>
---
 CMakeLists.txt                                |  2 +-
 benchmark/utils/formats.hpp                   |  5 +-
 cmake/get_info.cmake                          |  5 +-
 cuda/test/matrix/ell_kernels.cpp              | 47 +++++++-------
 hip/test/matrix/ell_kernels.hip.cpp           | 47 +++++++-------
 .../ginkgo/core/base/precision_dispatch.hpp   | 63 +++++++++----------
 omp/matrix/ell_kernels.cpp                    |  2 -
 reference/matrix/ell_kernels.cpp              |  2 -
 8 files changed, 90 insertions(+), 83 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f0d68b51a34..e999611aa57 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,7 +31,7 @@ option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA})
 option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP})
 option(GINKGO_BUILD_DOC "Generate documentation" OFF)
 option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
-option(GINKGO_MIXED_PRECISION "Add true mixed-precision kernel (otherwise they will be conversion-based)" OFF)
+option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
 option(GINKGO_SKIP_DEPENDENCY_UPDATE
     "Do not update dependencies each time the project is rebuilt" ON)
 option(GINKGO_EXPORT_BUILD_DIR
diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
index ed20f91119e..8c1aabbcafb 100644
--- a/benchmark/utils/formats.hpp
+++ b/benchmark/utils/formats.hpp
@@ -91,9 +91,8 @@ std::string format_description =
     "csrm: Ginkgo's CSR implementation with merge_path strategy.\n"
     "ell: Ellpack format according to Bell and Garland: Efficient Sparse "
     "Matrix-Vector Multiplication on CUDA.\n"
-    "fell: float Ellpack format according to Bell and Garland: Efficient "
-    "Sparse "
-    "Matrix-Vector Multiplication on CUDA.\n"
+    "fell: Mixed Precision Ellpack format according to Bell and Garland: "
+    "Efficient Sparse Matrix-Vector Multiplication on CUDA.\n"
     "sellp: Sliced Ellpack uses a default block size of 32.\n"
     "hybrid: Hybrid uses ell and coo to represent the matrix.\n"
     "hybrid0, hybrid25, hybrid33, hybrid40, hybrid60, hybrid80: Hybrid uses "
diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake
index 508bb6e0191..bae1e864a02 100644
--- a/cmake/get_info.cmake
+++ b/cmake/get_info.cmake
@@ -127,7 +127,10 @@ foreach(log_type ${log_types})
     ginkgo_print_module_footer(${${log_type}} "User configuration:")
     ginkgo_print_module_footer(${${log_type}} "  Enabled modules:")
     ginkgo_print_foreach_variable(${${log_type}}
-        "GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP;GINKGO_MIXED_PRECISION")
+        "GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP")
+    ginkgo_print_module_footer(${${log_type}} "  Enabled features:")
+    ginkgo_print_foreach_variable(${${log_type}}
+        "GINKGO_MIXED_PRECISION")
     ginkgo_print_module_footer(${${log_type}} "  Tests, benchmarks and examples:")
     ginkgo_print_foreach_variable(${${log_type}}
         "GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING")
diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp
index 0df427f139e..2df1c397f4c 100644
--- a/cuda/test/matrix/ell_kernels.cpp
+++ b/cuda/test/matrix/ell_kernels.cpp
@@ -61,7 +61,9 @@ class Ell : public ::testing::Test {
     using Vec2 = gko::matrix::Dense<float>;
     using ComplexVec = gko::matrix::Dense<std::complex<double>>;
 
-    Ell() : rand_engine(42) {}
+    Ell()
+        : rand_engine(42), size{532, 231}, num_els_rowwise{300}, ell_stride{600}
+    {}
 
     void SetUp()
     {
@@ -126,6 +128,9 @@ class Ell : public ::testing::Test {
     std::shared_ptr<const gko::CudaExecutor> cuda;
 
     std::ranlux48 rand_engine;
+    gko::dim<2> size;
+    gko::size_type num_els_rowwise;
+    gko::size_type ell_stride;
 
     std::unique_ptr<Mtx> mtx;
     std::unique_ptr<Vec> expected;
@@ -239,7 +244,7 @@ TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3)
 
 TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -250,7 +255,7 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 
 TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(y2.get(), expected2.get());
     dmtx->apply(dy2.get(), dresult2.get());
@@ -261,7 +266,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1)
 
 TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(y2.get(), expected.get());
     dmtx->apply(dy2.get(), dresult.get());
@@ -272,7 +277,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2)
 
 TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(y.get(), expected2.get());
     dmtx->apply(dy.get(), dresult2.get());
@@ -283,7 +288,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3)
 
 TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
 
@@ -293,7 +298,7 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 
 TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
@@ -304,7 +309,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1)
 
 TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
@@ -315,7 +320,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2)
 
 TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
@@ -326,7 +331,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3)
 
 TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -337,7 +342,7 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 
 TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(y2.get(), expected2.get());
     dmtx->apply(dy2.get(), dresult2.get());
@@ -348,7 +353,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1)
 
 TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(y2.get(), expected.get());
     dmtx->apply(dy2.get(), dresult.get());
@@ -359,7 +364,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2)
 
 TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(y.get(), expected2.get());
     dmtx->apply(dy.get(), dresult2.get());
@@ -370,7 +375,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3)
 
 TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
@@ -381,7 +386,7 @@ TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 
 TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
@@ -392,7 +397,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1)
 
 TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
@@ -403,7 +408,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2)
 
 TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
@@ -503,10 +508,10 @@ TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef)
 TEST_F(Ell, ApplyToComplexIsEquivalentToRef)
 {
     set_up_apply_data();
-    auto complex_b = gen_mtx<ComplexVec>(231, 3);
+    auto complex_b = gen_mtx<ComplexVec>(size[1], 3);
     auto dcomplex_b = ComplexVec::create(cuda);
     dcomplex_b->copy_from(complex_b.get());
-    auto complex_x = gen_mtx<ComplexVec>(532, 3);
+    auto complex_x = gen_mtx<ComplexVec>(size[0], 3);
     auto dcomplex_x = ComplexVec::create(cuda);
     dcomplex_x->copy_from(complex_x.get());
 
@@ -520,10 +525,10 @@ TEST_F(Ell, ApplyToComplexIsEquivalentToRef)
 TEST_F(Ell, AdvancedApplyToComplexIsEquivalentToRef)
 {
     set_up_apply_data();
-    auto complex_b = gen_mtx<ComplexVec>(231, 3);
+    auto complex_b = gen_mtx<ComplexVec>(size[1], 3);
     auto dcomplex_b = ComplexVec::create(cuda);
     dcomplex_b->copy_from(complex_b.get());
-    auto complex_x = gen_mtx<ComplexVec>(532, 3);
+    auto complex_x = gen_mtx<ComplexVec>(size[0], 3);
     auto dcomplex_x = ComplexVec::create(cuda);
     dcomplex_x->copy_from(complex_x.get());
 
diff --git a/hip/test/matrix/ell_kernels.hip.cpp b/hip/test/matrix/ell_kernels.hip.cpp
index 9d1e1440544..13c81fdafcf 100644
--- a/hip/test/matrix/ell_kernels.hip.cpp
+++ b/hip/test/matrix/ell_kernels.hip.cpp
@@ -61,7 +61,9 @@ class Ell : public ::testing::Test {
     using Vec2 = gko::matrix::Dense<float>;
     using ComplexVec = gko::matrix::Dense<std::complex<double>>;
 
-    Ell() : rand_engine(42) {}
+    Ell()
+        : rand_engine(42), size{532, 231}, num_els_rowwise{300}, ell_stride{600}
+    {}
 
     void SetUp()
     {
@@ -127,6 +129,9 @@ class Ell : public ::testing::Test {
     std::shared_ptr<const gko::HipExecutor> hip;
 
     std::ranlux48 rand_engine;
+    gko::dim<2> size;
+    gko::size_type num_els_rowwise;
+    gko::size_type ell_stride;
 
     std::unique_ptr<Mtx> mtx;
     std::unique_ptr<Vec> expected;
@@ -240,7 +245,7 @@ TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3)
 
 TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -251,7 +256,7 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 
 TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(y2.get(), expected2.get());
     dmtx->apply(dy2.get(), dresult2.get());
@@ -262,7 +267,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1)
 
 TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(y2.get(), expected.get());
     dmtx->apply(dy2.get(), dresult.get());
@@ -273,7 +278,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2)
 
 TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(y.get(), expected2.get());
     dmtx->apply(dy.get(), dresult2.get());
@@ -284,7 +289,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3)
 
 TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
 
@@ -294,7 +299,7 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 
 TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
@@ -305,7 +310,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1)
 
 TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
@@ -316,7 +321,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2)
 
 TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
@@ -327,7 +332,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3)
 
 TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -338,7 +343,7 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 
 TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(y2.get(), expected2.get());
     dmtx->apply(dy2.get(), dresult2.get());
@@ -349,7 +354,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1)
 
 TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(y2.get(), expected.get());
     dmtx->apply(dy2.get(), dresult.get());
@@ -360,7 +365,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2)
 
 TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(y.get(), expected2.get());
     dmtx->apply(dy.get(), dresult2.get());
@@ -371,7 +376,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3)
 
 TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
@@ -382,7 +387,7 @@ TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 
 TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
@@ -393,7 +398,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1)
 
 TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
     dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
@@ -404,7 +409,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2)
 
 TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
@@ -504,10 +509,10 @@ TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef)
 TEST_F(Ell, ApplyToComplexIsEquivalentToRef)
 {
     set_up_apply_data();
-    auto complex_b = gen_mtx<ComplexVec>(231, 3);
+    auto complex_b = gen_mtx<ComplexVec>(size[1], 3);
     auto dcomplex_b = ComplexVec::create(hip);
     dcomplex_b->copy_from(complex_b.get());
-    auto complex_x = gen_mtx<ComplexVec>(532, 3);
+    auto complex_x = gen_mtx<ComplexVec>(size[0], 3);
     auto dcomplex_x = ComplexVec::create(hip);
     dcomplex_x->copy_from(complex_x.get());
 
@@ -521,10 +526,10 @@ TEST_F(Ell, ApplyToComplexIsEquivalentToRef)
 TEST_F(Ell, AdvancedApplyToComplexIsEquivalentToRef)
 {
     set_up_apply_data();
-    auto complex_b = gen_mtx<ComplexVec>(231, 3);
+    auto complex_b = gen_mtx<ComplexVec>(size[1], 3);
     auto dcomplex_b = ComplexVec::create(hip);
     dcomplex_b->copy_from(complex_b.get());
-    auto complex_x = gen_mtx<ComplexVec>(532, 3);
+    auto complex_x = gen_mtx<ComplexVec>(size[0], 3);
     auto dcomplex_x = ComplexVec::create(hip);
     dcomplex_x->copy_from(complex_x.get());
 
diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp
index 4c2624863e5..376f126f5c5 100644
--- a/include/ginkgo/core/base/precision_dispatch.hpp
+++ b/include/ginkgo/core/base/precision_dispatch.hpp
@@ -231,7 +231,7 @@ void precision_dispatch_real_complex(Function fn, const LinOp *alpha,
  * converted into matrix::Dense<ValueType> as parameters.
  *
  * If GINKGO_MIXED_PRECISION is defined, this means that the function will be
- * called with its dynamic type with its static type, so the (templated/generic)
+ * called with its dynamic type as a static type, so the (templated/generic)
  * function will be instantiated with all pairs of Dense<ValueType> and
  * Dense<next_precision<ValueType>> parameter types, and the appropriate
  * overload will be called based on the dynamic type of the parameter.
@@ -239,16 +239,21 @@ void precision_dispatch_real_complex(Function fn, const LinOp *alpha,
  * If GINKGO_MIXED_PRECISION is not defined, it will behave exactly like
  * precision_dispatch.
  *
+ * @param fn  the given function. It will be called with one const and one
+ *            non-const matrix::Dense<...> parameter based on the dynamic type
+ *            of the inputs (GINKGO_MIXED_PRECISION) or of type
+ *            matrix::Dense<ValueType> (no GINKGO_MIXED_PRECISION).
+ * @param in  The first parameter to be cast (GINKGO_MIXED_PRECISION) or
+ *            converted (no GINKGO_MIXED_PRECISION) and used to call `fn`.
+ * @param out  The second parameter to be cast (GINKGO_MIXED_PRECISION) or
+ *             converted (no GINKGO_MIXED_PRECISION) and used to call `fn`.
  *
- * @param fn  the given function. It will be passed one (potentially const)
- *            matrix::Dense<...>* parameter per parameter in the parameter
- *            pack `linops`.
- * @param linops  the given arguments to be converted and passed on to fn.
- *
- * @tparam ValueType  the value type to use for the parameters of `fn`.
+ * @tparam ValueType  the value type to use for the parameters of `fn` (no
+ *                    GINKGO_MIXED_PRECISION). With GINKGO_MIXED_PRECISION
+ *                    enabled, it only matters whether this type is complex or
+ *                    real.
  * @tparam Function  the function pointer, lambda or other functor type to call
  *                   with the converted arguments.
- * @tparam Args  the argument type list.
  */
 template <typename ValueType, typename Function>
 void mixed_precision_dispatch(Function fn, const LinOp *in, LinOp *out)
@@ -290,36 +295,30 @@ void mixed_precision_dispatch(Function fn, const LinOp *in, LinOp *out)
  *
  * @see mixed_precision_dispatch()
  */
-template <typename ValueType, typename Function>
+template <typename ValueType, typename Function,
+          std::enable_if_t<is_complex<ValueType>()> * = nullptr>
 void mixed_precision_dispatch_real_complex(Function fn, const LinOp *in,
                                            LinOp *out)
 {
 #ifdef GINKGO_MIXED_PRECISION
-    // do we need to convert complex Dense to real Dense?
-    auto complex_to_real =
-        !(is_complex<ValueType>() ||
-          dynamic_cast<const ConvertibleTo<matrix::Dense<>> *>(in));
-    if (complex_to_real) {
+    mixed_precision_dispatch<ValueType>(fn, in, out);
+#else
+    precision_dispatch<ValueType>(fn, in, out);
+#endif
+}
+
+
+template <typename ValueType, typename Function,
+          std::enable_if_t<!is_complex<ValueType>()> * = nullptr>
+void mixed_precision_dispatch_real_complex(Function fn, const LinOp *in,
+                                           LinOp *out)
+{
+#ifdef GINKGO_MIXED_PRECISION
+    if (!dynamic_cast<const ConvertibleTo<matrix::Dense<>> *>(in)) {
         mixed_precision_dispatch<to_complex<ValueType>>(
             [&fn](auto dense_in, auto dense_out) {
-                using DenseIn = typename std::decay_t<decltype(*dense_in)>;
-                using DenseOut = typename std::decay_t<decltype(*dense_out)>;
-                using DenseDummy = matrix::Dense<ValueType>;
-                using RealIn = remove_complex<typename DenseIn::value_type>;
-                using RealOut = remove_complex<typename DenseOut::value_type>;
-                using CastIn =
-                    std::conditional_t<is_complex<ValueType>(), DenseDummy,
-                                       matrix::Dense<RealIn>>;
-                using CastOut =
-                    std::conditional_t<is_complex<ValueType>(), DenseDummy,
-                                       matrix::Dense<RealOut>>;
-                // These dynamic_casts are only needed to make the code compile
-                // If ValueType is complex, this branch will never be taken
-                // If ValueType is real, the cast is a no-op
-                fn(dynamic_cast<const CastIn *>(
-                       dense_in->create_real_view().get()),
-                   dynamic_cast<CastOut *>(
-                       dense_out->create_real_view().get()));
+                fn(dense_in->create_real_view().get(),
+                   dense_out->create_real_view().get());
             },
             in, out);
     } else {
diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp
index 58ef6c3a72e..96f4d5347d2 100644
--- a/omp/matrix/ell_kernels.cpp
+++ b/omp/matrix/ell_kernels.cpp
@@ -73,7 +73,6 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,
     const auto num_stored_elements_per_row =
         a->get_num_stored_elements_per_row();
     const auto stride = a->get_stride();
-    const auto num_rows = b->get_size()[0];
     const auto a_vals = gko::acc::range<a_accessor>(
         std::array<size_type, 1>{num_stored_elements_per_row * stride},
         a->get_const_values());
@@ -117,7 +116,6 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
     const auto num_stored_elements_per_row =
         a->get_num_stored_elements_per_row();
     const auto stride = a->get_stride();
-    const auto num_rows = b->get_size()[0];
     const auto a_vals = gko::acc::range<a_accessor>(
         std::array<size_type, 1>{num_stored_elements_per_row * stride},
         a->get_const_values());
diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp
index 25de03cf709..24f166e52db 100644
--- a/reference/matrix/ell_kernels.cpp
+++ b/reference/matrix/ell_kernels.cpp
@@ -69,7 +69,6 @@ void spmv(std::shared_ptr<const ReferenceExecutor> exec,
     const auto num_stored_elements_per_row =
         a->get_num_stored_elements_per_row();
     const auto stride = a->get_stride();
-    const auto num_rows = b->get_size()[0];
     const auto a_vals = gko::acc::range<a_accessor>(
         std::array<size_type, 1>{num_stored_elements_per_row * stride},
         a->get_const_values());
@@ -112,7 +111,6 @@ void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
     const auto num_stored_elements_per_row =
         a->get_num_stored_elements_per_row();
     const auto stride = a->get_stride();
-    const auto num_rows = b->get_size()[0];
     const auto a_vals = gko::acc::range<a_accessor>(
         std::array<size_type, 1>{num_stored_elements_per_row * stride},
         a->get_const_values());

From e67a8b69381ba891d6a871fb7e1b31845158e397 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Mon, 19 Apr 2021 15:43:36 +0200
Subject: [PATCH 11/14] rename mixed precision ELL benchmark format to
 ell-mixed

Co-authored-by: Yuhsiang M. Tsai <19565938+yhmtsai@users.noreply.github.com>
---
 benchmark/utils/formats.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
index 8c1aabbcafb..64de4e10f90 100644
--- a/benchmark/utils/formats.hpp
+++ b/benchmark/utils/formats.hpp
@@ -59,7 +59,7 @@ namespace formats {
 
 
 std::string available_format =
-    "coo, csr, ell, fell, sellp, hybrid, hybrid0, hybrid25, hybrid33, "
+    "coo, csr, ell, ell-mixed, sellp, hybrid, hybrid0, hybrid25, hybrid33, "
     "hybrid40, "
     "hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, "
     "hybridminstorage"
@@ -91,7 +91,7 @@ std::string format_description =
     "csrm: Ginkgo's CSR implementation with merge_path strategy.\n"
     "ell: Ellpack format according to Bell and Garland: Efficient Sparse "
     "Matrix-Vector Multiplication on CUDA.\n"
-    "fell: Mixed Precision Ellpack format according to Bell and Garland: "
+    "ell-mixed: Mixed Precision Ellpack format according to Bell and Garland: "
     "Efficient Sparse Matrix-Vector Multiplication on CUDA.\n"
     "sellp: Sliced Ellpack uses a default block size of 32.\n"
     "hybrid: Hybrid uses ell and coo to represent the matrix.\n"
@@ -207,7 +207,7 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"csrc", READ_MATRIX(csr, std::make_shared<csr::classical>())},
         {"coo", read_matrix_from_data<gko::matrix::Coo<etype>>},
         {"ell", read_matrix_from_data<gko::matrix::Ell<etype>>},
-        {"ell_mixed",
+        {"ell-mixed",
          [](std::shared_ptr<const gko::Executor> exec,
             const gko::matrix_data<etype> &data) {
              gko::matrix_data<gko::next_precision<etype>> conv_data;

From 45f9923333aabba7188e387ee251cd44965353cf Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Mon, 19 Apr 2021 15:47:43 +0200
Subject: [PATCH 12/14] add GINKGO_MIXED_PRECISION to Windows CI

---
 .github/workflows/windows-msvc-cuda.yml | 6 +++---
 .github/workflows/windows-msvc-ref.yml  | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/windows-msvc-cuda.yml b/.github/workflows/windows-msvc-cuda.yml
index bc4557e96bf..28468950af8 100644
--- a/.github/workflows/windows-msvc-cuda.yml
+++ b/.github/workflows/windows-msvc-cuda.yml
@@ -15,8 +15,8 @@ jobs:
       fail-fast: false
       matrix:
         config:
-        - {version: "10.2.89.20191206", name: "cuda102/release/shared"}
-        - {version: "latest", name: "cuda-latest/release/shared"}
+        - {version: "10.2.89.20191206", name: "cuda102/release/shared", "mixed": "ON"}
+        - {version: "latest", name: "cuda-latest/release/shared", "mixed": "OFF"}
     name: msvc/${{ matrix.config.name }} (only compile)
     runs-on: [windows-latest]
 
@@ -46,5 +46,5 @@ jobs:
         $env:PATH="$env:PATH;$pwd\build\windows_shared_library"
         mkdir build
         cd build
-        cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_CUDA_ARCHITECTURES=60 ..
+        cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_CUDA_ARCHITECTURES=60 ..
         cmake --build . -j4 --config Release
diff --git a/.github/workflows/windows-msvc-ref.yml b/.github/workflows/windows-msvc-ref.yml
index 13a29f6cc51..833f3692676 100644
--- a/.github/workflows/windows-msvc-ref.yml
+++ b/.github/workflows/windows-msvc-ref.yml
@@ -15,8 +15,8 @@ jobs:
       fail-fast: false
       matrix:
         config:
-        - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"}
-        - {shared: "OFF", build_type: "Release", name: "reference/release/static"}
+        - {shared: "ON", build_type: "Debug", name: "reference/debug/shared", "mixed": "ON"}
+        - {shared: "OFF", build_type: "Release", name: "reference/release/static", "mixed": "OFF"}
         # Debug static needs too much storage
         # - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"}
     name: msvc/${{ matrix.config.name }}
@@ -35,7 +35,7 @@ jobs:
         $env:PATH="$env:PATH;$pwd\build\windows_shared_library"
         mkdir build
         cd build
-        cmake  -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF ..
+        cmake  -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} ..
         cmake --build . -j4 --config ${{ matrix.config.build_type }}
         ctest . -C ${{ matrix.config.build_type }} --output-on-failure
 

From 535b5778bee6f9b6e8e7fb1fed0bc2771664bdc4 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Mon, 19 Apr 2021 15:50:03 +0200
Subject: [PATCH 13/14] add GINKGO_MIXED_PRECISION to OSX CI

---
 .github/workflows/osx.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml
index 6827c5fe476..79caf2169a4 100644
--- a/.github/workflows/osx.yml
+++ b/.github/workflows/osx.yml
@@ -15,8 +15,8 @@ jobs:
       fail-fast: false
       matrix:
         config:
-        - {shared: "ON", build_type: "Debug", name: "omp/debug/shared"}
-        - {shared: "OFF", build_type: "Release", name: "omp/release/static"}
+        - {shared: "ON", build_type: "Debug", name: "omp/debug/shared", "mixed": "OFF"}
+        - {shared: "OFF", build_type: "Release", name: "omp/release/static", "mixed": "ON"}
     name: ${{ matrix.config.name }}
     runs-on: [macos-latest]
 
@@ -40,7 +40,7 @@ jobs:
       run: |
         mkdir build
         cd build
-        cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }}
+        cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }}
         make -j8
         ctest -j10 --output-on-failure
 

From 31fe679f83e0ca9e65010b2f561288f02264b566 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Tue, 4 May 2021 11:51:11 +0200
Subject: [PATCH 14/14] simplify mixed-precision instantiation macros

---
 core/base/mixed_precision_types.hpp | 99 +++++++++++------------------
 1 file changed, 37 insertions(+), 62 deletions(-)

diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp
index 079fcb8a075..d725eff82bf 100644
--- a/core/base/mixed_precision_types.hpp
+++ b/core/base/mixed_precision_types.hpp
@@ -39,70 +39,45 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #ifdef GINKGO_MIXED_PRECISION
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \
-    template _macro(float, float, float, int32);                    \
-    template _macro(float, float, double, int32);                   \
-    template _macro(float, double, float, int32);                   \
-    template _macro(float, double, double, int32);                  \
-    template _macro(double, float, float, int32);                   \
-    template _macro(double, float, double, int32);                  \
-    template _macro(double, double, float, int32);                  \
-    template _macro(double, double, double, int32);                 \
-    template _macro(std::complex<float>, std::complex<float>,       \
-                    std::complex<float>, int32);                    \
-    template _macro(std::complex<float>, std::complex<float>,       \
-                    std::complex<double>, int32);                   \
-    template _macro(std::complex<float>, std::complex<double>,      \
-                    std::complex<float>, int32);                    \
-    template _macro(std::complex<float>, std::complex<double>,      \
-                    std::complex<double>, int32);                   \
-    template _macro(std::complex<double>, std::complex<float>,      \
-                    std::complex<float>, int32);                    \
-    template _macro(std::complex<double>, std::complex<float>,      \
-                    std::complex<double>, int32);                   \
-    template _macro(std::complex<double>, std::complex<double>,     \
-                    std::complex<float>, int32);                    \
-    template _macro(std::complex<double>, std::complex<double>,     \
-                    std::complex<double>, int32);                   \
-    template _macro(float, float, float, int64);                    \
-    template _macro(float, float, double, int64);                   \
-    template _macro(float, double, float, int64);                   \
-    template _macro(float, double, double, int64);                  \
-    template _macro(double, float, float, int64);                   \
-    template _macro(double, float, double, int64);                  \
-    template _macro(double, double, float, int64);                  \
-    template _macro(double, double, double, int64);                 \
-    template _macro(std::complex<float>, std::complex<float>,       \
-                    std::complex<float>, int64);                    \
-    template _macro(std::complex<float>, std::complex<float>,       \
-                    std::complex<double>, int64);                   \
-    template _macro(std::complex<float>, std::complex<double>,      \
-                    std::complex<float>, int64);                    \
-    template _macro(std::complex<float>, std::complex<double>,      \
-                    std::complex<double>, int64);                   \
-    template _macro(std::complex<double>, std::complex<float>,      \
-                    std::complex<float>, int64);                    \
-    template _macro(std::complex<double>, std::complex<float>,      \
-                    std::complex<double>, int64);                   \
-    template _macro(std::complex<double>, std::complex<double>,     \
-                    std::complex<float>, int64);                    \
-    template _macro(std::complex<double>, std::complex<double>,     \
-                    std::complex<double>, int64)
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...)  \
+    template _macro(float, float, float, __VA_ARGS__);          \
+    template _macro(float, float, double, __VA_ARGS__);         \
+    template _macro(float, double, float, __VA_ARGS__);         \
+    template _macro(float, double, double, __VA_ARGS__);        \
+    template _macro(double, float, float, __VA_ARGS__);         \
+    template _macro(double, float, double, __VA_ARGS__);        \
+    template _macro(double, double, float, __VA_ARGS__);        \
+    template _macro(double, double, double, __VA_ARGS__);       \
+    template _macro(std::complex<float>, std::complex<float>,   \
+                    std::complex<float>, __VA_ARGS__);          \
+    template _macro(std::complex<float>, std::complex<float>,   \
+                    std::complex<double>, __VA_ARGS__);         \
+    template _macro(std::complex<float>, std::complex<double>,  \
+                    std::complex<float>, __VA_ARGS__);          \
+    template _macro(std::complex<float>, std::complex<double>,  \
+                    std::complex<double>, __VA_ARGS__);         \
+    template _macro(std::complex<double>, std::complex<float>,  \
+                    std::complex<float>, __VA_ARGS__);          \
+    template _macro(std::complex<double>, std::complex<float>,  \
+                    std::complex<double>, __VA_ARGS__);         \
+    template _macro(std::complex<double>, std::complex<double>, \
+                    std::complex<float>, __VA_ARGS__);          \
+    template _macro(std::complex<double>, std::complex<double>, \
+                    std::complex<double>, __VA_ARGS__)
 #else
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \
-    template _macro(float, float, float, int32);                    \
-    template _macro(double, double, double, int32);                 \
-    template _macro(std::complex<float>, std::complex<float>,       \
-                    std::complex<float>, int32);                    \
-    template _macro(std::complex<double>, std::complex<double>,     \
-                    std::complex<double>, int32);                   \
-    template _macro(float, float, float, int64);                    \
-    template _macro(double, double, double, int64);                 \
-    template _macro(std::complex<float>, std::complex<float>,       \
-                    std::complex<float>, int64);                    \
-    template _macro(std::complex<double>, std::complex<double>,     \
-                    std::complex<double>, int64)
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...)  \
+    template _macro(float, float, float, __VA_ARGS__);          \
+    template _macro(double, double, double, __VA_ARGS__);       \
+    template _macro(std::complex<float>, std::complex<float>,   \
+                    std::complex<float>, __VA_ARGS__);          \
+    template _macro(std::complex<double>, std::complex<double>, \
+                    std::complex<double>, __VA_ARGS__)
 #endif
 
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int32);       \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int64)
+
+
 #endif  // GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_