diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml
index 6827c5fe476..79caf2169a4 100644
--- a/.github/workflows/osx.yml
+++ b/.github/workflows/osx.yml
@@ -15,8 +15,8 @@ jobs:
       fail-fast: false
       matrix:
         config:
-        - {shared: "ON", build_type: "Debug", name: "omp/debug/shared"}
-        - {shared: "OFF", build_type: "Release", name: "omp/release/static"}
+        - {shared: "ON", build_type: "Debug", name: "omp/debug/shared", "mixed": "OFF"}
+        - {shared: "OFF", build_type: "Release", name: "omp/release/static", "mixed": "ON"}
     name: ${{ matrix.config.name }}
     runs-on: [macos-latest]
 
@@ -40,7 +40,7 @@ jobs:
       run: |
         mkdir build
         cd build
-        cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }}
+        cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }}
         make -j8
         ctest -j10 --output-on-failure
 
diff --git a/.github/workflows/windows-msvc-cuda.yml b/.github/workflows/windows-msvc-cuda.yml
index bc4557e96bf..28468950af8 100644
--- a/.github/workflows/windows-msvc-cuda.yml
+++ b/.github/workflows/windows-msvc-cuda.yml
@@ -15,8 +15,8 @@ jobs:
       fail-fast: false
       matrix:
         config:
-        - {version: "10.2.89.20191206", name: "cuda102/release/shared"}
-        - {version: "latest", name: "cuda-latest/release/shared"}
+        - {version: "10.2.89.20191206", name: "cuda102/release/shared", "mixed": "ON"}
+        - {version: "latest", name: "cuda-latest/release/shared", "mixed": "OFF"}
     name: msvc/${{ matrix.config.name }} (only compile)
     runs-on: [windows-latest]
 
@@ -46,5 +46,5 @@ jobs:
         $env:PATH="$env:PATH;$pwd\build\windows_shared_library"
         mkdir build
         cd build
-        cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_CUDA_ARCHITECTURES=60 ..
+        cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_CUDA_ARCHITECTURES=60 ..
         cmake --build . -j4 --config Release
diff --git a/.github/workflows/windows-msvc-ref.yml b/.github/workflows/windows-msvc-ref.yml
index 13a29f6cc51..833f3692676 100644
--- a/.github/workflows/windows-msvc-ref.yml
+++ b/.github/workflows/windows-msvc-ref.yml
@@ -15,8 +15,8 @@ jobs:
       fail-fast: false
       matrix:
         config:
-        - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"}
-        - {shared: "OFF", build_type: "Release", name: "reference/release/static"}
+        - {shared: "ON", build_type: "Debug", name: "reference/debug/shared", "mixed": "ON"}
+        - {shared: "OFF", build_type: "Release", name: "reference/release/static", "mixed": "OFF"}
         # Debug static needs too much storage
         # - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"}
     name: msvc/${{ matrix.config.name }}
@@ -35,7 +35,7 @@ jobs:
         $env:PATH="$env:PATH;$pwd\build\windows_shared_library"
         mkdir build
         cd build
-        cmake  -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF ..
+        cmake  -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} ..
         cmake --build . -j4 --config ${{ matrix.config.build_type }}
         ctest . -C ${{ matrix.config.build_type }} --output-on-failure
 
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index bbe85b1fbe8..11e66475301 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -32,6 +32,7 @@ include:
   BUILD_HWLOC: "ON"
   FAST_TESTS: "OFF"
   DPCPP_SINGLE_MODE: "OFF"
+  MIXED_PRECISION: "ON"
   RUN_EXAMPLES: "OFF"
   CONFIG_LOG: "ON"
   CXX_FLAGS: ""
@@ -77,6 +78,7 @@ include:
         -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
         -DGINKGO_FAST_TESTS=${FAST_TESTS}
+        -DGINKGO_MIXED_PRECISION=${MIXED_PRECISION}
         -DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES}
         -DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG}
         -DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE}
@@ -111,6 +113,7 @@ include:
         -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
         -DGINKGO_FAST_TESTS=${FAST_TESTS}
+        -DGINKGO_MIXED_PRECISION=${MIXED_PRECISION}
         -DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG}
         -DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE}
         -DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES}
@@ -681,6 +684,44 @@ build/nocuda/intel/omp/release/static:
     BUILD_TYPE: "Release"
     BUILD_SHARED_LIBS: "OFF"
 
+build/nocuda-nomixed/gcc/omp/release/shared:
+  <<: *default_build_with_test
+  extends:
+    - .quick_test_condition
+    - .use_gko-nocuda-gnu9-llvm8
+  variables:
+    <<: *default_variables
+    BUILD_OMP: "ON"
+    BUILD_TYPE: "Release"
+    MIXED_PRECISION: "OFF"
+
+build/nocuda-nomixed/clang/omp/debug/static:
+  <<: *default_build_with_test
+  extends:
+    - .full_test_condition
+    - .use_gko-nocuda-gnu9-llvm8
+  variables:
+    <<: *default_variables
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
+    BUILD_OMP: "ON"
+    BUILD_TYPE: "Debug"
+    BUILD_SHARED_LIBS: "OFF"
+    MIXED_PRECISION: "OFF"
+
+build/nocuda-nomixed/intel/omp/release/static:
+  <<: *default_build_with_test
+  extends:
+    - .full_test_condition
+    - .use_gko-nocuda-gnu9-llvm8-intel
+  variables:
+    <<: *default_variables
+    C_COMPILER: "icc"
+    CXX_COMPILER: "icpc"
+    BUILD_OMP: "ON"
+    BUILD_TYPE: "Release"
+    BUILD_SHARED_LIBS: "OFF"
+    MIXED_PRECISION: "OFF"
 
 build/dpcpp/cpu/release/static:
   <<: *default_build_with_test
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1318a6900d4..e999611aa57 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,6 +31,7 @@ option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA})
 option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP})
 option(GINKGO_BUILD_DOC "Generate documentation" OFF)
 option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
+option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
 option(GINKGO_SKIP_DEPENDENCY_UPDATE
     "Do not update dependencies each time the project is rebuilt" ON)
 option(GINKGO_EXPORT_BUILD_DIR
diff --git a/INSTALL.md b/INSTALL.md
index d85bcc48329..4fcc60d9ae4 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -18,6 +18,10 @@ Ginkgo adds the following additional switches to control what is being built:
 *   `-DGINKGO_DEVEL_TOOLS={ON, OFF}` sets up the build system for development
     (requires clang-format, will also download git-cmake-format),
     default is `OFF`.
+*   `-DGINKGO_MIXED_PRECISION={ON, OFF}` compiles true mixed-precision kernels
+    instead of converting data on the fly, default is `OFF`.
+    Enabling this flag increases the library size, but improves performance of
+    mixed-precision kernels.
 *   `-DGINKGO_BUILD_TESTS={ON, OFF}` builds Ginkgo's tests
     (will download googletest), default is `ON`.
 *   `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests
diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
index d25be4657ec..64de4e10f90 100644
--- a/benchmark/utils/formats.hpp
+++ b/benchmark/utils/formats.hpp
@@ -59,7 +59,8 @@ namespace formats {
 
 
 std::string available_format =
-    "coo, csr, ell, sellp, hybrid, hybrid0, hybrid25, hybrid33, hybrid40, "
+    "coo, csr, ell, ell-mixed, sellp, hybrid, hybrid0, hybrid25, hybrid33, "
+    "hybrid40, "
     "hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, "
     "hybridminstorage"
 #ifdef HAS_CUDA
@@ -90,6 +91,8 @@ std::string format_description =
     "csrm: Ginkgo's CSR implementation with merge_path strategy.\n"
     "ell: Ellpack format according to Bell and Garland: Efficient Sparse "
     "Matrix-Vector Multiplication on CUDA.\n"
+    "ell-mixed: Mixed Precision Ellpack format according to Bell and Garland: "
+    "Efficient Sparse Matrix-Vector Multiplication on CUDA.\n"
     "sellp: Sliced Ellpack uses a default block size of 32.\n"
     "hybrid: Hybrid uses ell and coo to represent the matrix.\n"
     "hybrid0, hybrid25, hybrid33, hybrid40, hybrid60, hybrid80: Hybrid uses "
@@ -204,6 +207,23 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"csrc", READ_MATRIX(csr, std::make_shared<csr::classical>())},
         {"coo", read_matrix_from_data<gko::matrix::Coo<etype>>},
         {"ell", read_matrix_from_data<gko::matrix::Ell<etype>>},
+        {"ell-mixed",
+         [](std::shared_ptr<const gko::Executor> exec,
+            const gko::matrix_data<etype> &data) {
+             gko::matrix_data<gko::next_precision<etype>> conv_data;
+             conv_data.size = data.size;
+             conv_data.nonzeros.resize(data.nonzeros.size());
+             auto it = conv_data.nonzeros.begin();
+             for (auto &el : data.nonzeros) {
+                 it->row = el.row;
+                 it->column = el.column;
+                 it->value = el.value;
+                 ++it;
+             }
+             auto mat = gko::matrix::Ell<gko::next_precision<etype>>::create(std::move(exec));
+             mat->read(conv_data);
+             return mat;
+         }},
 #ifdef HAS_CUDA
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         {"cusp_csr", read_matrix_from_data<cusp_csr>},
@@ -212,8 +232,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"cusp_hybrid", read_matrix_from_data<cusp_hybrid>},
         {"cusp_coo", read_matrix_from_data<cusp_coo>},
         {"cusp_ell", read_matrix_from_data<cusp_ell>},
-#else // CUDA_VERSION >= 11000
-        // cusp_csr, cusp_coo use the generic ones from CUDA 11
+#else  // CUDA_VERSION >= 11000
+       // cusp_csr, cusp_coo use the generic ones from CUDA 11
         {"cusp_csr", read_matrix_from_data<cusp_gcsr>},
         {"cusp_coo", read_matrix_from_data<cusp_gcoo>},
 #endif
@@ -260,7 +280,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
         {"hybridminstorage",
          READ_MATRIX(hybrid,
                      std::make_shared<hybrid::minimal_storage_limit>())},
-        {"sellp", read_matrix_from_data<gko::matrix::Sellp<etype>>}};
+        {"sellp", read_matrix_from_data<gko::matrix::Sellp<etype>>}
+};
 // clang-format on
 
 
diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake
index 835bdf0c45b..bae1e864a02 100644
--- a/cmake/get_info.cmake
+++ b/cmake/get_info.cmake
@@ -128,6 +128,9 @@ foreach(log_type ${log_types})
     ginkgo_print_module_footer(${${log_type}} "  Enabled modules:")
     ginkgo_print_foreach_variable(${${log_type}}
         "GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP")
+    ginkgo_print_module_footer(${${log_type}} "  Enabled features:")
+    ginkgo_print_foreach_variable(${${log_type}}
+        "GINKGO_MIXED_PRECISION")
     ginkgo_print_module_footer(${${log_type}} "  Tests, benchmarks and examples:")
     ginkgo_print_foreach_variable(${${log_type}}
         "GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING")
diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc
index 00eb2d6871d..2323d512258 100644
--- a/common/matrix/ell_kernels.hpp.inc
+++ b/common/matrix/ell_kernels.hpp.inc
@@ -34,29 +34,30 @@ namespace kernel {
 namespace {
 
 
-template <int num_thread_per_worker, bool atomic, typename ValueType,
-          typename IndexType, typename Closure>
+template <int num_thread_per_worker, bool atomic, typename b_accessor,
+          typename a_accessor, typename OutputValueType, typename IndexType,
+          typename Closure>
 __device__ void spmv_kernel(
     const size_type num_rows, const int num_worker_per_row,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    acc::range<a_accessor> val, const IndexType *__restrict__ col,
     const size_type stride, const size_type num_stored_elements_per_row,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride, Closure op)
+    acc::range<b_accessor> b, OutputValueType *__restrict__ c,
+    const size_type c_stride, Closure op)
 {
     const auto tidx = thread::get_thread_id_flat();
-    const auto column_id = blockIdx.y;
+    const decltype(tidx) column_id = blockIdx.y;
     if (num_thread_per_worker == 1) {
         // Specialize the num_thread_per_worker = 1. It doesn't need the shared
         // memory, __syncthreads, and atomic_add
         if (tidx < num_rows) {
-            ValueType temp = zero<ValueType>();
+            auto temp = zero<OutputValueType>();
             for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
                 const auto ind = tidx + idx * stride;
                 const auto col_idx = col[ind];
                 if (col_idx < idx) {
                     break;
                 } else {
-                    temp += val[ind] * b[col_idx * b_stride + column_id];
+                    temp += val(ind) * b(col_idx, column_id);
                 }
             }
             const auto c_ind = tidx * c_stride + column_id;
@@ -68,14 +69,14 @@ __device__ void spmv_kernel(
             const auto x = tidx % num_rows;
             const auto worker_id = tidx / num_rows;
             const auto step_size = num_worker_per_row * num_thread_per_worker;
-            __shared__ UninitializedArray<ValueType, default_block_size /
-                                                         num_thread_per_worker>
+            __shared__ UninitializedArray<
+                OutputValueType, default_block_size / num_thread_per_worker>
                 storage;
             if (idx_in_worker == 0) {
                 storage[threadIdx.x] = 0;
             }
             __syncthreads();
-            ValueType temp = zero<ValueType>();
+            auto temp = zero<OutputValueType>();
             for (size_type idx =
                      worker_id * num_thread_per_worker + idx_in_worker;
                  idx < num_stored_elements_per_row; idx += step_size) {
@@ -84,7 +85,7 @@ __device__ void spmv_kernel(
                 if (col_idx < idx) {
                     break;
                 } else {
-                    temp += val[ind] * b[col_idx * b_stride + column_id];
+                    temp += val(ind) * b(col_idx, column_id);
                 }
             }
             atomic_add(&storage[threadIdx.x], temp);
@@ -102,35 +103,34 @@ __device__ void spmv_kernel(
 }
 
 
-template <int num_thread_per_worker, bool atomic = false, typename ValueType,
-          typename IndexType>
+template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
+          typename a_accessor, typename OutputValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
     const size_type num_rows, const int num_worker_per_row,
-    const ValueType *__restrict__ val, const IndexType *__restrict__ col,
+    acc::range<a_accessor> val, const IndexType *__restrict__ col,
     const size_type stride, const size_type num_stored_elements_per_row,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    ValueType *__restrict__ c, const size_type c_stride)
+    acc::range<b_accessor> b, OutputValueType *__restrict__ c,
+    const size_type c_stride)
 {
     spmv_kernel<num_thread_per_worker, atomic>(
         num_rows, num_worker_per_row, val, col, stride,
-        num_stored_elements_per_row, b, b_stride, c, c_stride,
-        [](const ValueType &x, const ValueType &y) { return x; });
+        num_stored_elements_per_row, b, c, c_stride,
+        [](const OutputValueType &x, const OutputValueType &y) { return x; });
 }
 
 
-template <int num_thread_per_worker, bool atomic = false, typename ValueType,
-          typename IndexType>
+template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
+          typename a_accessor, typename OutputValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv(
     const size_type num_rows, const int num_worker_per_row,
-    const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
+    acc::range<a_accessor> alpha, acc::range<a_accessor> val,
     const IndexType *__restrict__ col, const size_type stride,
-    const size_type num_stored_elements_per_row,
-    const ValueType *__restrict__ b, const size_type b_stride,
-    const ValueType *__restrict__ beta, ValueType *__restrict__ c,
+    const size_type num_stored_elements_per_row, acc::range<b_accessor> b,
+    const OutputValueType *__restrict__ beta, OutputValueType *__restrict__ c,
     const size_type c_stride)
 {
-    const ValueType alpha_val = alpha[0];
-    const ValueType beta_val = beta[0];
+    const OutputValueType alpha_val = alpha(0);
+    const OutputValueType beta_val = beta[0];
     // Because the atomic operation changes the values of c during computation,
     // it can not do the right alpha * a * b + beta * c operation.
     // Thus, the cuda kernel only computes alpha * a * b when it uses atomic
@@ -138,15 +138,16 @@ __global__ __launch_bounds__(default_block_size) void spmv(
     if (atomic) {
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
-            num_stored_elements_per_row, b, b_stride, c, c_stride,
-            [&alpha_val](const ValueType &x, const ValueType &y) {
+            num_stored_elements_per_row, b, c, c_stride,
+            [&alpha_val](const OutputValueType &x, const OutputValueType &y) {
                 return alpha_val * x;
             });
     } else {
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
-            num_stored_elements_per_row, b, b_stride, c, c_stride,
-            [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
+            num_stored_elements_per_row, b, c, c_stride,
+            [&alpha_val, &beta_val](const OutputValueType &x,
+                                    const OutputValueType &y) {
                 return alpha_val * x + beta_val * y;
             });
     }
diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp
new file mode 100644
index 00000000000..d725eff82bf
--- /dev/null
+++ b/core/base/mixed_precision_types.hpp
@@ -0,0 +1,83 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_
+#define GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+#ifdef GINKGO_MIXED_PRECISION
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...)  \
+    template _macro(float, float, float, __VA_ARGS__);          \
+    template _macro(float, float, double, __VA_ARGS__);         \
+    template _macro(float, double, float, __VA_ARGS__);         \
+    template _macro(float, double, double, __VA_ARGS__);        \
+    template _macro(double, float, float, __VA_ARGS__);         \
+    template _macro(double, float, double, __VA_ARGS__);        \
+    template _macro(double, double, float, __VA_ARGS__);        \
+    template _macro(double, double, double, __VA_ARGS__);       \
+    template _macro(std::complex<float>, std::complex<float>,   \
+                    std::complex<float>, __VA_ARGS__);          \
+    template _macro(std::complex<float>, std::complex<float>,   \
+                    std::complex<double>, __VA_ARGS__);         \
+    template _macro(std::complex<float>, std::complex<double>,  \
+                    std::complex<float>, __VA_ARGS__);          \
+    template _macro(std::complex<float>, std::complex<double>,  \
+                    std::complex<double>, __VA_ARGS__);         \
+    template _macro(std::complex<double>, std::complex<float>,  \
+                    std::complex<float>, __VA_ARGS__);          \
+    template _macro(std::complex<double>, std::complex<float>,  \
+                    std::complex<double>, __VA_ARGS__);         \
+    template _macro(std::complex<double>, std::complex<double>, \
+                    std::complex<float>, __VA_ARGS__);          \
+    template _macro(std::complex<double>, std::complex<double>, \
+                    std::complex<double>, __VA_ARGS__)
+#else
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...)  \
+    template _macro(float, float, float, __VA_ARGS__);          \
+    template _macro(double, double, double, __VA_ARGS__);       \
+    template _macro(std::complex<float>, std::complex<float>,   \
+                    std::complex<float>, __VA_ARGS__);          \
+    template _macro(std::complex<double>, std::complex<double>, \
+                    std::complex<double>, __VA_ARGS__)
+#endif
+
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int32);       \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int64)
+
+
+#endif  // GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 877051c3e04..810a3fab4e4 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "core/base/mixed_precision_types.hpp"
 #include "core/components/absolute_array.hpp"
 #include "core/components/fill_array.hpp"
 #include "core/components/precision_conversion.hpp"
@@ -931,15 +932,20 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 namespace ell {
 
 
-template <typename ValueType, typename IndexType>
-GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType)
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
+GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, OutputValueType,
+                            IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
-template <typename ValueType, typename IndexType>
-GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType)
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
+GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType,
+                                     OutputValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 template <typename ValueType, typename IndexType>
diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
index 0dc9a950f75..fcb198106cd 100644
--- a/core/matrix/ell.cpp
+++ b/core/matrix/ell.cpp
@@ -102,7 +102,7 @@ size_type calculate_max_nnz_per_row(
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::apply_impl(const LinOp *b, LinOp *x) const
 {
-    precision_dispatch_real_complex<ValueType>(
+    mixed_precision_dispatch_real_complex<ValueType>(
         [this](auto dense_b, auto dense_x) {
             this->get_executor()->run(ell::make_spmv(this, dense_b, dense_x));
         },
@@ -114,12 +114,15 @@ template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::apply_impl(const LinOp *alpha, const LinOp *b,
                                            const LinOp *beta, LinOp *x) const
 {
-    precision_dispatch_real_complex<ValueType>(
-        [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
+    mixed_precision_dispatch_real_complex<ValueType>(
+        [this, alpha, beta](auto dense_b, auto dense_x) {
+            auto dense_alpha = make_temporary_conversion<ValueType>(alpha);
+            auto dense_beta = make_temporary_conversion<
+                typename std::decay_t<decltype(*dense_x)>::value_type>(beta);
             this->get_executor()->run(ell::make_advanced_spmv(
-                dense_alpha, this, dense_b, dense_beta, dense_x));
+                dense_alpha.get(), this, dense_b, dense_beta.get(), dense_x));
         },
-        alpha, b, beta, x);
+        b, x);
 }
 
 
diff --git a/core/matrix/ell_kernels.hpp b/core/matrix/ell_kernels.hpp
index 7a1ec261d15..7cbe54b3720 100644
--- a/core/matrix/ell_kernels.hpp
+++ b/core/matrix/ell_kernels.hpp
@@ -46,18 +46,21 @@ namespace gko {
 namespace kernels {
 
 
-#define GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType)  \
-    void spmv(std::shared_ptr<const DefaultExecutor> exec, \
-              const matrix::Ell<ValueType, IndexType> *a,  \
-              const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
-
-#define GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType)  \
-    void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec, \
-                       const matrix::Dense<ValueType> *alpha,       \
-                       const matrix::Ell<ValueType, IndexType> *a,  \
-                       const matrix::Dense<ValueType> *b,           \
-                       const matrix::Dense<ValueType> *beta,        \
-                       matrix::Dense<ValueType> *c)
+#define GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, \
+                                    OutputValueType, IndexType)      \
+    void spmv(std::shared_ptr<const DefaultExecutor> exec,           \
+              const matrix::Ell<MatrixValueType, IndexType> *a,      \
+              const matrix::Dense<InputValueType> *b,                \
+              matrix::Dense<OutputValueType> *c)
+
+#define GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType, \
+                                             OutputValueType, IndexType)      \
+    void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,           \
+                       const matrix::Dense<MatrixValueType> *alpha,           \
+                       const matrix::Ell<MatrixValueType, IndexType> *a,      \
+                       const matrix::Dense<InputValueType> *b,                \
+                       const matrix::Dense<OutputValueType> *beta,            \
+                       matrix::Dense<OutputValueType> *c)
 
 #define GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
     void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,     \
@@ -87,10 +90,14 @@ namespace kernels {
                           matrix::Diagonal<ValueType> *diag)
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                         \
-    template <typename ValueType, typename IndexType>                        \
-    GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType);                       \
-    template <typename ValueType, typename IndexType>                        \
-    GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType);              \
+    template <typename InputValueType, typename MatrixValueType,             \
+              typename OutputValueType, typename IndexType>                  \
+    GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType,             \
+                                OutputValueType, IndexType);                 \
+    template <typename InputValueType, typename MatrixValueType,             \
+              typename OutputValueType, typename IndexType>                  \
+    GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType,    \
+                                         OutputValueType, IndexType);        \
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType);           \
     template <typename ValueType, typename IndexType>                        \
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 9bc47f71e63..953ce4be8d7 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -43,6 +43,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "accessor/reduced_row_major.hpp"
+#include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array.hpp"
 #include "core/components/prefix_sum.hpp"
 #include "core/matrix/dense_kernels.hpp"
@@ -107,16 +109,37 @@ using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
 
 namespace {
 
+template <int dim, typename Type1, typename Type2>
+GKO_INLINE auto as_cuda_accessor(
+    const acc::range<acc::reduced_row_major<dim, Type1, Type2>> &acc)
+{
+    return acc::range<
+        acc::reduced_row_major<dim, cuda_type<Type1>, cuda_type<Type2>>>(
+        acc.get_accessor().get_size(),
+        as_cuda_type(acc.get_accessor().get_stored_data()),
+        acc.get_accessor().get_stride());
+}
+
 
-template <int info, typename ValueType, typename IndexType>
+template <int info, typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   matrix::Dense<ValueType> *c,
-                   const matrix::Dense<ValueType> *alpha = nullptr,
-                   const matrix::Dense<ValueType> *beta = nullptr)
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   matrix::Dense<OutputValueType> *c,
+                   const matrix::Dense<MatrixValueType> *alpha = nullptr,
+                   const matrix::Dense<OutputValueType> *beta = nullptr)
 {
+    using a_accessor =
+        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+    using b_accessor =
+        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+
     const auto nrows = a->get_size()[0];
+    const auto stride = a->get_stride();
+    const auto num_stored_elements_per_row =
+        a->get_num_stored_elements_per_row();
+
     constexpr int num_thread_per_worker =
         (info == 0) ? max_thread_per_worker : info;
     constexpr bool atomic = (info == 0);
@@ -124,22 +147,29 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
                           num_thread_per_worker, 1);
     const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
                          b->get_size()[1], 1);
+
+    const auto a_vals = gko::acc::range<a_accessor>(
+        std::array<size_type, 1>{{num_stored_elements_per_row * stride}},
+        a->get_const_values());
+    const auto b_vals = gko::acc::range<b_accessor>(
+        std::array<size_type, 2>{{b->get_size()[0], b->get_size()[1]}},
+        b->get_const_values(), std::array<size_type, 1>{{b->get_stride()}});
+
     if (alpha == nullptr && beta == nullptr) {
         kernel::spmv<num_thread_per_worker, atomic>
             <<<grid_size, block_size, 0, 0>>>(
-                nrows, num_worker_per_row, as_cuda_type(a->get_const_values()),
-                a->get_const_col_idxs(), a->get_stride(),
-                a->get_num_stored_elements_per_row(),
-                as_cuda_type(b->get_const_values()), b->get_stride(),
-                as_cuda_type(c->get_values()), c->get_stride());
+                nrows, num_worker_per_row, as_cuda_accessor(a_vals),
+                a->get_const_col_idxs(), stride, num_stored_elements_per_row,
+                as_cuda_accessor(b_vals), as_cuda_type(c->get_values()),
+                c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
+        const auto alpha_val = gko::acc::range<a_accessor>(
+            std::array<size_type, 1>{1}, alpha->get_const_values());
         kernel::spmv<num_thread_per_worker, atomic>
             <<<grid_size, block_size, 0, 0>>>(
-                nrows, num_worker_per_row,
-                as_cuda_type(alpha->get_const_values()),
-                as_cuda_type(a->get_const_values()), a->get_const_col_idxs(),
-                a->get_stride(), a->get_num_stored_elements_per_row(),
-                as_cuda_type(b->get_const_values()), b->get_stride(),
+                nrows, num_worker_per_row, as_cuda_accessor(alpha_val),
+                as_cuda_accessor(a_vals), a->get_const_col_idxs(), stride,
+                num_stored_elements_per_row, as_cuda_accessor(b_vals),
                 as_cuda_type(beta->get_const_values()),
                 as_cuda_type(c->get_values()), c->get_stride());
     } else {
@@ -194,10 +224,12 @@ std::array<int, 3> compute_thread_worker_and_atomicity(
 }  // namespace
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void spmv(std::shared_ptr<const CudaExecutor> exec,
-          const matrix::Ell<ValueType, IndexType> *a,
-          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+          const matrix::Ell<MatrixValueType, IndexType> *a,
+          const matrix::Dense<InputValueType> *b,
+          matrix::Dense<OutputValueType> *c)
 {
     const auto data = compute_thread_worker_and_atomicity(exec, a);
     const int num_thread_per_worker = std::get<0>(data);
@@ -212,7 +244,8 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
     const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
         components::fill_array(exec, c->get_values(),
-                               c->get_num_stored_elements(), zero<ValueType>());
+                               c->get_num_stored_elements(),
+                               zero<OutputValueType>());
     }
     select_abstract_spmv(
         compiled_kernels(),
@@ -221,16 +254,18 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
         c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
-                   const matrix::Dense<ValueType> *alpha,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   const matrix::Dense<ValueType> *beta,
-                   matrix::Dense<ValueType> *c)
+                   const matrix::Dense<MatrixValueType> *alpha,
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   const matrix::Dense<OutputValueType> *beta,
+                   matrix::Dense<OutputValueType> *c)
 {
     const auto data = compute_thread_worker_and_atomicity(exec, a);
     const int num_thread_per_worker = std::get<0>(data);
@@ -253,7 +288,7 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
         alpha, beta);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp
index dff573a284f..2df1c397f4c 100644
--- a/cuda/test/matrix/ell_kernels.cpp
+++ b/cuda/test/matrix/ell_kernels.cpp
@@ -58,9 +58,12 @@ class Ell : public ::testing::Test {
 protected:
     using Mtx = gko::matrix::Ell<>;
     using Vec = gko::matrix::Dense<>;
+    using Vec2 = gko::matrix::Dense<float>;
     using ComplexVec = gko::matrix::Dense<std::complex<double>>;
 
-    Ell() : rand_engine(42) {}
+    Ell()
+        : rand_engine(42), size{532, 231}, num_els_rowwise{300}, ell_stride{600}
+    {}
 
     void SetUp()
     {
@@ -92,38 +95,62 @@ class Ell : public ::testing::Test {
                           stride);
         mtx->copy_from(gen_mtx(num_rows, num_cols));
         expected = gen_mtx(num_rows, num_vectors);
+        expected2 = Vec2::create(ref);
+        expected2->copy_from(expected.get());
         y = gen_mtx(num_cols, num_vectors);
+        y2 = Vec2::create(ref);
+        y2->copy_from(y.get());
         alpha = gko::initialize<Vec>({2.0}, ref);
+        alpha2 = gko::initialize<Vec2>({2.0}, ref);
         beta = gko::initialize<Vec>({-1.0}, ref);
+        beta2 = gko::initialize<Vec2>({-1.0}, ref);
         dmtx = Mtx::create(cuda);
         dmtx->copy_from(mtx.get());
         dresult = Vec::create(cuda);
         dresult->copy_from(expected.get());
+        dresult2 = Vec2::create(cuda);
+        dresult2->copy_from(expected2.get());
         dy = Vec::create(cuda);
         dy->copy_from(y.get());
+        dy2 = Vec2::create(cuda);
+        dy2->copy_from(y2.get());
         dalpha = Vec::create(cuda);
         dalpha->copy_from(alpha.get());
+        dalpha2 = Vec2::create(cuda);
+        dalpha2->copy_from(alpha2.get());
         dbeta = Vec::create(cuda);
         dbeta->copy_from(beta.get());
+        dbeta2 = Vec2::create(cuda);
+        dbeta2->copy_from(beta2.get());
     }
 
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
 
     std::ranlux48 rand_engine;
+    gko::dim<2> size;
+    gko::size_type num_els_rowwise;
+    gko::size_type ell_stride;
 
     std::unique_ptr<Mtx> mtx;
     std::unique_ptr<Vec> expected;
+    std::unique_ptr<Vec2> expected2;
     std::unique_ptr<Vec> y;
+    std::unique_ptr<Vec2> y2;
     std::unique_ptr<Vec> alpha;
+    std::unique_ptr<Vec2> alpha2;
     std::unique_ptr<Vec> beta;
+    std::unique_ptr<Vec2> beta2;
 
     std::unique_ptr<Mtx> dmtx;
     std::unique_ptr<Vec> dresult;
+    std::unique_ptr<Vec2> dresult2;
     std::unique_ptr<Vec> dy;
+    std::unique_ptr<Vec2> dy2;
     std::unique_ptr<Vec> dalpha;
+    std::unique_ptr<Vec2> dalpha2;
     std::unique_ptr<Vec> dbeta;
+    std::unique_ptr<Vec2> dbeta2;
 };
 
 
@@ -138,6 +165,39 @@ TEST_F(Ell, SimpleApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1)
+{
+    set_up_apply_data();
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef2)
+{
+    set_up_apply_data();
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3)
+{
+    set_up_apply_data();
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
 TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -149,9 +209,42 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef2)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
 TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -160,9 +253,42 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1)
+{
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2)
+{
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3)
+{
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
 TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
 
@@ -170,9 +296,42 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1)
+{
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2)
+{
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3)
+{
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
 TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -181,9 +340,42 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
 TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
@@ -192,6 +384,39 @@ TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
 TEST_F(Ell, SimpleApplyByAtomicIsEquivalentToRef)
 {
     set_up_apply_data(10, 10000);
@@ -283,10 +508,10 @@ TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef)
 TEST_F(Ell, ApplyToComplexIsEquivalentToRef)
 {
     set_up_apply_data();
-    auto complex_b = gen_mtx<ComplexVec>(231, 3);
+    auto complex_b = gen_mtx<ComplexVec>(size[1], 3);
     auto dcomplex_b = ComplexVec::create(cuda);
     dcomplex_b->copy_from(complex_b.get());
-    auto complex_x = gen_mtx<ComplexVec>(532, 3);
+    auto complex_x = gen_mtx<ComplexVec>(size[0], 3);
     auto dcomplex_x = ComplexVec::create(cuda);
     dcomplex_x->copy_from(complex_x.get());
 
@@ -300,10 +525,10 @@ TEST_F(Ell, ApplyToComplexIsEquivalentToRef)
 TEST_F(Ell, AdvancedApplyToComplexIsEquivalentToRef)
 {
     set_up_apply_data();
-    auto complex_b = gen_mtx<ComplexVec>(231, 3);
+    auto complex_b = gen_mtx<ComplexVec>(size[1], 3);
     auto dcomplex_b = ComplexVec::create(cuda);
     dcomplex_b->copy_from(complex_b.get());
-    auto complex_x = gen_mtx<ComplexVec>(532, 3);
+    auto complex_x = gen_mtx<ComplexVec>(size[0], 3);
     auto dcomplex_x = ComplexVec::create(cuda);
     dcomplex_x->copy_from(complex_x.get());
 
diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp
index 01a40b5d9b5..3c1d93eb323 100644
--- a/dpcpp/matrix/ell_kernels.dp.cpp
+++ b/dpcpp/matrix/ell_kernels.dp.cpp
@@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/base/mixed_precision_types.hpp"
 #include "dpcpp/components/format_conversion.dp.hpp"
 
 
@@ -56,24 +57,27 @@ namespace dpcpp {
 namespace ell {
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void spmv(std::shared_ptr<const DpcppExecutor> exec,
-          const matrix::Ell<ValueType, IndexType> *a,
-          const matrix::Dense<ValueType> *b,
-          matrix::Dense<ValueType> *c) GKO_NOT_IMPLEMENTED;
+          const matrix::Ell<MatrixValueType, IndexType> *a,
+          const matrix::Dense<InputValueType> *b,
+          matrix::Dense<OutputValueType> *c) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
-                   const matrix::Dense<ValueType> *alpha,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   const matrix::Dense<ValueType> *beta,
-                   matrix::Dense<ValueType> *c) GKO_NOT_IMPLEMENTED;
+                   const matrix::Dense<MatrixValueType> *alpha,
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   const matrix::Dense<OutputValueType> *beta,
+                   matrix::Dense<OutputValueType> *c) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index 2e7a4eeeb50..b1ad7bf4662 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -46,6 +46,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "accessor/reduced_row_major.hpp"
+#include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array.hpp"
 #include "core/components/prefix_sum.hpp"
 #include "core/matrix/dense_kernels.hpp"
@@ -111,15 +113,37 @@ using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
 namespace {
 
 
-template <int info, typename ValueType, typename IndexType>
+template <int dim, typename Type1, typename Type2>
+GKO_INLINE auto as_hip_accessor(
+    const acc::range<acc::reduced_row_major<dim, Type1, Type2>> &acc)
+{
+    return acc::range<
+        acc::reduced_row_major<dim, hip_type<Type1>, hip_type<Type2>>>(
+        acc.get_accessor().get_size(),
+        as_hip_type(acc.get_accessor().get_stored_data()),
+        acc.get_accessor().get_stride());
+}
+
+
+template <int info, typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   matrix::Dense<ValueType> *c,
-                   const matrix::Dense<ValueType> *alpha = nullptr,
-                   const matrix::Dense<ValueType> *beta = nullptr)
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   matrix::Dense<OutputValueType> *c,
+                   const matrix::Dense<MatrixValueType> *alpha = nullptr,
+                   const matrix::Dense<OutputValueType> *beta = nullptr)
 {
+    using a_accessor =
+        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+    using b_accessor =
+        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+
     const auto nrows = a->get_size()[0];
+    const auto stride = a->get_stride();
+    const auto num_stored_elements_per_row =
+        a->get_num_stored_elements_per_row();
+
     constexpr int num_thread_per_worker =
         (info == 0) ? max_thread_per_worker : info;
     constexpr bool atomic = (info == 0);
@@ -127,24 +151,31 @@ void abstract_spmv(syn::value_list<int, info>, int num_worker_per_row,
                           num_thread_per_worker, 1);
     const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
                          b->get_size()[1], 1);
+
+    const auto a_vals = gko::acc::range<a_accessor>(
+        std::array<size_type, 1>{{num_stored_elements_per_row * stride}},
+        a->get_const_values());
+    const auto b_vals = gko::acc::range<b_accessor>(
+        std::array<size_type, 2>{{b->get_size()[0], b->get_size()[1]}},
+        b->get_const_values(), std::array<size_type, 1>{{b->get_stride()}});
+
     if (alpha == nullptr && beta == nullptr) {
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(kernel::spmv<num_thread_per_worker, atomic>),
             dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row,
-            as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
-            a->get_stride(), a->get_num_stored_elements_per_row(),
-            as_hip_type(b->get_const_values()), b->get_stride(),
+            as_hip_accessor(a_vals), a->get_const_col_idxs(), stride,
+            num_stored_elements_per_row, as_hip_accessor(b_vals),
             as_hip_type(c->get_values()), c->get_stride());
     } else if (alpha != nullptr && beta != nullptr) {
+        const auto alpha_val = gko::acc::range<a_accessor>(
+            std::array<size_type, 1>{1}, alpha->get_const_values());
         hipLaunchKernelGGL(
             HIP_KERNEL_NAME(kernel::spmv<num_thread_per_worker, atomic>),
             dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row,
-            as_hip_type(alpha->get_const_values()),
-            as_hip_type(a->get_const_values()), a->get_const_col_idxs(),
-            a->get_stride(), a->get_num_stored_elements_per_row(),
-            as_hip_type(b->get_const_values()), b->get_stride(),
-            as_hip_type(beta->get_const_values()), as_hip_type(c->get_values()),
-            c->get_stride());
+            as_hip_accessor(alpha_val), as_hip_accessor(a_vals),
+            a->get_const_col_idxs(), stride, num_stored_elements_per_row,
+            as_hip_accessor(b_vals), as_hip_type(beta->get_const_values()),
+            as_hip_type(c->get_values()), c->get_stride());
     } else {
         GKO_KERNEL_NOT_FOUND;
     }
@@ -197,10 +228,12 @@ std::array<int, 3> compute_thread_worker_and_atomicity(
 }  // namespace
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void spmv(std::shared_ptr<const HipExecutor> exec,
-          const matrix::Ell<ValueType, IndexType> *a,
-          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+          const matrix::Ell<MatrixValueType, IndexType> *a,
+          const matrix::Dense<InputValueType> *b,
+          matrix::Dense<OutputValueType> *c)
 {
     const auto data = compute_thread_worker_and_atomicity(exec, a);
     const int num_thread_per_worker = std::get<0>(data);
@@ -215,7 +248,8 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
     const int info = (!atomic) * num_thread_per_worker;
     if (atomic) {
         components::fill_array(exec, c->get_values(),
-                               c->get_num_stored_elements(), zero<ValueType>());
+                               c->get_num_stored_elements(),
+                               zero<OutputValueType>());
     }
     select_abstract_spmv(
         compiled_kernels(),
@@ -224,16 +258,18 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
         c);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
-                   const matrix::Dense<ValueType> *alpha,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   const matrix::Dense<ValueType> *beta,
-                   matrix::Dense<ValueType> *c)
+                   const matrix::Dense<MatrixValueType> *alpha,
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   const matrix::Dense<OutputValueType> *beta,
+                   matrix::Dense<OutputValueType> *c)
 {
     const auto data = compute_thread_worker_and_atomicity(exec, a);
     const int num_thread_per_worker = std::get<0>(data);
@@ -256,7 +292,7 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
         alpha, beta);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/hip/test/matrix/ell_kernels.hip.cpp b/hip/test/matrix/ell_kernels.hip.cpp
index e9a875eb544..13c81fdafcf 100644
--- a/hip/test/matrix/ell_kernels.hip.cpp
+++ b/hip/test/matrix/ell_kernels.hip.cpp
@@ -58,9 +58,12 @@ class Ell : public ::testing::Test {
 protected:
     using Mtx = gko::matrix::Ell<>;
     using Vec = gko::matrix::Dense<>;
+    using Vec2 = gko::matrix::Dense<float>;
     using ComplexVec = gko::matrix::Dense<std::complex<double>>;
 
-    Ell() : rand_engine(42) {}
+    Ell()
+        : rand_engine(42), size{532, 231}, num_els_rowwise{300}, ell_stride{600}
+    {}
 
     void SetUp()
     {
@@ -92,19 +95,33 @@ class Ell : public ::testing::Test {
                           stride);
         mtx->copy_from(gen_mtx(num_rows, num_cols));
         expected = gen_mtx(num_rows, num_vectors);
+        expected2 = Vec2::create(ref);
+        expected2->copy_from(expected.get());
         y = gen_mtx(num_cols, num_vectors);
+        y2 = Vec2::create(ref);
+        y2->copy_from(y.get());
         alpha = gko::initialize<Vec>({2.0}, ref);
+        alpha2 = gko::initialize<Vec2>({2.0}, ref);
         beta = gko::initialize<Vec>({-1.0}, ref);
+        beta2 = gko::initialize<Vec2>({-1.0}, ref);
         dmtx = Mtx::create(hip);
         dmtx->copy_from(mtx.get());
         dresult = Vec::create(hip);
         dresult->copy_from(expected.get());
+        dresult2 = Vec2::create(hip);
+        dresult2->copy_from(expected2.get());
         dy = Vec::create(hip);
         dy->copy_from(y.get());
+        dy2 = Vec2::create(hip);
+        dy2->copy_from(y2.get());
         dalpha = Vec::create(hip);
         dalpha->copy_from(alpha.get());
+        dalpha2 = Vec2::create(hip);
+        dalpha2->copy_from(alpha2.get());
         dbeta = Vec::create(hip);
         dbeta->copy_from(beta.get());
+        dbeta2 = Vec2::create(hip);
+        dbeta2->copy_from(beta2.get());
     }
 
 
@@ -112,18 +129,29 @@ class Ell : public ::testing::Test {
     std::shared_ptr<const gko::HipExecutor> hip;
 
     std::ranlux48 rand_engine;
+    gko::dim<2> size;
+    gko::size_type num_els_rowwise;
+    gko::size_type ell_stride;
 
     std::unique_ptr<Mtx> mtx;
     std::unique_ptr<Vec> expected;
+    std::unique_ptr<Vec2> expected2;
     std::unique_ptr<Vec> y;
+    std::unique_ptr<Vec2> y2;
     std::unique_ptr<Vec> alpha;
+    std::unique_ptr<Vec2> alpha2;
     std::unique_ptr<Vec> beta;
+    std::unique_ptr<Vec2> beta2;
 
     std::unique_ptr<Mtx> dmtx;
     std::unique_ptr<Vec> dresult;
+    std::unique_ptr<Vec2> dresult2;
     std::unique_ptr<Vec> dy;
+    std::unique_ptr<Vec2> dy2;
     std::unique_ptr<Vec> dalpha;
+    std::unique_ptr<Vec2> dalpha2;
     std::unique_ptr<Vec> dbeta;
+    std::unique_ptr<Vec2> dbeta2;
 };
 
 
@@ -138,6 +166,39 @@ TEST_F(Ell, SimpleApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1)
+{
+    set_up_apply_data();
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef2)
+{
+    set_up_apply_data();
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3)
+{
+    set_up_apply_data();
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
 TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -149,9 +210,42 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef2)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
 TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -160,9 +254,42 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1)
+{
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2)
+{
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3)
+{
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
 TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 1, 300, 600);
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
 
@@ -170,9 +297,42 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1)
+{
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2)
+{
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3)
+{
+    set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
 TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -181,9 +341,42 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
 TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(532, 231, 3, 300, 600);
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
 
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
@@ -192,6 +385,39 @@ TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6);
+}
+
+
 TEST_F(Ell, SimpleApplyByAtomicIsEquivalentToRef)
 {
     set_up_apply_data(10, 10000);
@@ -283,10 +509,10 @@ TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef)
 TEST_F(Ell, ApplyToComplexIsEquivalentToRef)
 {
     set_up_apply_data();
-    auto complex_b = gen_mtx<ComplexVec>(231, 3);
+    auto complex_b = gen_mtx<ComplexVec>(size[1], 3);
     auto dcomplex_b = ComplexVec::create(hip);
     dcomplex_b->copy_from(complex_b.get());
-    auto complex_x = gen_mtx<ComplexVec>(532, 3);
+    auto complex_x = gen_mtx<ComplexVec>(size[0], 3);
     auto dcomplex_x = ComplexVec::create(hip);
     dcomplex_x->copy_from(complex_x.get());
 
@@ -300,10 +526,10 @@ TEST_F(Ell, ApplyToComplexIsEquivalentToRef)
 TEST_F(Ell, AdvancedApplyToComplexIsEquivalentToRef)
 {
     set_up_apply_data();
-    auto complex_b = gen_mtx<ComplexVec>(231, 3);
+    auto complex_b = gen_mtx<ComplexVec>(size[1], 3);
     auto dcomplex_b = ComplexVec::create(hip);
     dcomplex_b->copy_from(complex_b.get());
-    auto complex_x = gen_mtx<ComplexVec>(532, 3);
+    auto complex_x = gen_mtx<ComplexVec>(size[0], 3);
     auto dcomplex_x = ComplexVec::create(hip);
     dcomplex_x->copy_from(complex_x.get());
 
diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in
index edef91e6d9e..1c6a31ea481 100644
--- a/include/ginkgo/config.hpp.in
+++ b/include/ginkgo/config.hpp.in
@@ -63,6 +63,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #cmakedefine GINKGO_BENCHMARK_ENABLE_TUNING
 
 
+/* Should we compile mixed-precision kernels for Ginkgo? */
+#cmakedefine GINKGO_MIXED_PRECISION
+
+
 /* What is HIP compiled for, hcc or nvcc? */
 // clang-format off
 #define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@
diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp
index 5588d566b41..376f126f5c5 100644
--- a/include/ginkgo/core/base/precision_dispatch.hpp
+++ b/include/ginkgo/core/base/precision_dispatch.hpp
@@ -226,6 +226,110 @@ void precision_dispatch_real_complex(Function fn, const LinOp *alpha,
 }
 
 
+/**
+ * Calls the given function with each given argument LinOp
+ * converted into matrix::Dense<ValueType> as parameters.
+ *
+ * If GINKGO_MIXED_PRECISION is defined, this means that the function will be
+ * called with its dynamic type as a static type, so the (templated/generic)
+ * function will be instantiated with all pairs of Dense<ValueType> and
+ * Dense<next_precision<ValueType>> parameter types, and the appropriate
+ * overload will be called based on the dynamic type of the parameter.
+ *
+ * If GINKGO_MIXED_PRECISION is not defined, it will behave exactly like
+ * precision_dispatch.
+ *
+ * @param fn  the given function. It will be called with one const and one
+ *            non-const matrix::Dense<...> parameter based on the dynamic type
+ *            of the inputs (GINKGO_MIXED_PRECISION) or of type
+ *            matrix::Dense<ValueType> (no GINKGO_MIXED_PRECISION).
+ * @param in  The first parameter to be cast (GINKGO_MIXED_PRECISION) or
+ *            converted (no GINKGO_MIXED_PRECISION) and used to call `fn`.
+ * @param out  The second parameter to be cast (GINKGO_MIXED_PRECISION) or
+ *             converted (no GINKGO_MIXED_PRECISION) and used to call `fn`.
+ *
+ * @tparam ValueType  the value type to use for the parameters of `fn` (no
+ *                    GINKGO_MIXED_PRECISION). With GINKGO_MIXED_PRECISION
+ *                    enabled, it only matters whether this type is complex or
+ *                    real.
+ * @tparam Function  the function pointer, lambda or other functor type to call
+ *                   with the converted arguments.
+ */
+template <typename ValueType, typename Function>
+void mixed_precision_dispatch(Function fn, const LinOp *in, LinOp *out)
+{
+#ifdef GINKGO_MIXED_PRECISION
+    using fst_type = matrix::Dense<ValueType>;
+    using snd_type = matrix::Dense<next_precision<ValueType>>;
+    if (auto dense_in = dynamic_cast<const fst_type *>(in)) {
+        if (auto dense_out = dynamic_cast<fst_type *>(out)) {
+            fn(dense_in, dense_out);
+        } else if (auto dense_out = dynamic_cast<snd_type *>(out)) {
+            fn(dense_in, dense_out);
+        } else {
+            GKO_NOT_SUPPORTED(out);
+        }
+    } else if (auto dense_in = dynamic_cast<const snd_type *>(in)) {
+        if (auto dense_out = dynamic_cast<fst_type *>(out)) {
+            fn(dense_in, dense_out);
+        } else if (auto dense_out = dynamic_cast<snd_type *>(out)) {
+            fn(dense_in, dense_out);
+        } else {
+            GKO_NOT_SUPPORTED(out);
+        }
+    } else {
+        GKO_NOT_SUPPORTED(in);
+    }
+#else
+    precision_dispatch<ValueType>(fn, in, out);
+#endif
+}
+
+
+/**
+ * Calls the given function with the given LinOps cast to their dynamic type
+ * matrix::Dense<ValueType>* as parameters.
+ * If ValueType is real and both `in` and `out` are complex, uses
+ * matrix::Dense::get_real_view() to convert them into real matrices after
+ * precision conversion.
+ *
+ * @see mixed_precision_dispatch()
+ */
+template <typename ValueType, typename Function,
+          std::enable_if_t<is_complex<ValueType>()> * = nullptr>
+void mixed_precision_dispatch_real_complex(Function fn, const LinOp *in,
+                                           LinOp *out)
+{
+#ifdef GINKGO_MIXED_PRECISION
+    mixed_precision_dispatch<ValueType>(fn, in, out);
+#else
+    precision_dispatch<ValueType>(fn, in, out);
+#endif
+}
+
+
+template <typename ValueType, typename Function,
+          std::enable_if_t<!is_complex<ValueType>()> * = nullptr>
+void mixed_precision_dispatch_real_complex(Function fn, const LinOp *in,
+                                           LinOp *out)
+{
+#ifdef GINKGO_MIXED_PRECISION
+    if (!dynamic_cast<const ConvertibleTo<matrix::Dense<>> *>(in)) {
+        mixed_precision_dispatch<to_complex<ValueType>>(
+            [&fn](auto dense_in, auto dense_out) {
+                fn(dense_in->create_real_view().get(),
+                   dense_out->create_real_view().get());
+            },
+            in, out);
+    } else {
+        mixed_precision_dispatch<ValueType>(fn, in, out);
+    }
+#else
+    precision_dispatch_real_complex<ValueType>(fn, in, out);
+#endif
+}
+
+
 }  // namespace gko
 
 
diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp
index a590357a1c2..96f4d5347d2 100644
--- a/omp/matrix/ell_kernels.cpp
+++ b/omp/matrix/ell_kernels.cpp
@@ -42,6 +42,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "accessor/reduced_row_major.hpp"
+#include "core/base/mixed_precision_types.hpp"
 #include "omp/components/format_conversion.hpp"
 
 
@@ -56,42 +58,72 @@ namespace omp {
 namespace ell {
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void spmv(std::shared_ptr<const OmpExecutor> exec,
-          const matrix::Ell<ValueType, IndexType> *a,
-          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+          const matrix::Ell<MatrixValueType, IndexType> *a,
+          const matrix::Dense<InputValueType> *b,
+          matrix::Dense<OutputValueType> *c)
 {
-    auto num_stored_elements_per_row = a->get_num_stored_elements_per_row();
+    using a_accessor =
+        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+    using b_accessor =
+        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+
+    const auto num_stored_elements_per_row =
+        a->get_num_stored_elements_per_row();
+    const auto stride = a->get_stride();
+    const auto a_vals = gko::acc::range<a_accessor>(
+        std::array<size_type, 1>{num_stored_elements_per_row * stride},
+        a->get_const_values());
+    const auto b_vals = gko::acc::range<b_accessor>(
+        std::array<size_type, 2>{{b->get_size()[0], b->get_size()[1]}},
+        b->get_const_values(), std::array<size_type, 1>{{b->get_stride()}});
 
 #pragma omp parallel for
     for (size_type row = 0; row < a->get_size()[0]; row++) {
         for (size_type j = 0; j < c->get_size()[1]; j++) {
-            c->at(row, j) = zero<ValueType>();
+            c->at(row, j) = zero<OutputValueType>();
         }
         for (size_type i = 0; i < num_stored_elements_per_row; i++) {
-            auto val = a->val_at(row, i);
+            auto val = a_vals(row + i * stride);
             auto col = a->col_at(row, i);
             for (size_type j = 0; j < c->get_size()[1]; j++) {
-                c->at(row, j) += val * b->at(col, j);
+                c->at(row, j) += val * b_vals(col, j);
             }
         }
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
-                   const matrix::Dense<ValueType> *alpha,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   const matrix::Dense<ValueType> *beta,
-                   matrix::Dense<ValueType> *c)
+                   const matrix::Dense<MatrixValueType> *alpha,
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   const matrix::Dense<OutputValueType> *beta,
+                   matrix::Dense<OutputValueType> *c)
 {
-    auto num_stored_elements_per_row = a->get_num_stored_elements_per_row();
-    auto alpha_val = alpha->at(0, 0);
-    auto beta_val = beta->at(0, 0);
+    using a_accessor =
+        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+    using b_accessor =
+        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+
+    const auto num_stored_elements_per_row =
+        a->get_num_stored_elements_per_row();
+    const auto stride = a->get_stride();
+    const auto a_vals = gko::acc::range<a_accessor>(
+        std::array<size_type, 1>{num_stored_elements_per_row * stride},
+        a->get_const_values());
+    const auto b_vals = gko::acc::range<b_accessor>(
+        std::array<size_type, 2>{{b->get_size()[0], b->get_size()[1]}},
+        b->get_const_values(), std::array<size_type, 1>{{b->get_stride()}});
+    const auto alpha_val = OutputValueType(alpha->at(0, 0));
+    const auto beta_val = beta->at(0, 0);
 
 #pragma omp parallel for
     for (size_type row = 0; row < a->get_size()[0]; row++) {
@@ -99,16 +131,16 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
             c->at(row, j) *= beta_val;
         }
         for (size_type i = 0; i < num_stored_elements_per_row; i++) {
-            auto val = a->val_at(row, i);
+            auto val = a_vals(row + i * stride);
             auto col = a->col_at(row, i);
             for (size_type j = 0; j < c->get_size()[1]; j++) {
-                c->at(row, j) += alpha_val * val * b->at(col, j);
+                c->at(row, j) += alpha_val * val * b_vals(col, j);
             }
         }
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/omp/test/matrix/ell_kernels.cpp b/omp/test/matrix/ell_kernels.cpp
index 281292e9684..01f1a31121d 100644
--- a/omp/test/matrix/ell_kernels.cpp
+++ b/omp/test/matrix/ell_kernels.cpp
@@ -57,6 +57,7 @@ class Ell : public ::testing::Test {
 protected:
     using Mtx = gko::matrix::Ell<>;
     using Vec = gko::matrix::Dense<>;
+    using Vec2 = gko::matrix::Dense<float>;
     using ComplexVec = gko::matrix::Dense<std::complex<double>>;
 
     Ell() : rand_engine(42) {}
@@ -90,19 +91,33 @@ class Ell : public ::testing::Test {
         mtx = Mtx::create(ref, gko::dim<2>{}, max_nonzeros_per_row, stride);
         mtx->copy_from(gen_mtx(532, 231, 1));
         expected = gen_mtx(532, num_vectors, 1);
+        expected2 = Vec2::create(ref);
+        expected2->copy_from(expected.get());
         y = gen_mtx(231, num_vectors, 1);
+        y2 = Vec2::create(ref);
+        y2->copy_from(y.get());
         alpha = gko::initialize<Vec>({2.0}, ref);
+        alpha2 = gko::initialize<Vec2>({2.0}, ref);
         beta = gko::initialize<Vec>({-1.0}, ref);
+        beta2 = gko::initialize<Vec2>({-1.0}, ref);
         dmtx = Mtx::create(omp);
         dmtx->copy_from(mtx.get());
         dresult = Vec::create(omp);
         dresult->copy_from(expected.get());
+        dresult2 = Vec2::create(omp);
+        dresult2->copy_from(expected2.get());
         dy = Vec::create(omp);
         dy->copy_from(y.get());
+        dy2 = Vec2::create(omp);
+        dy2->copy_from(y2.get());
         dalpha = Vec::create(omp);
         dalpha->copy_from(alpha.get());
+        dalpha2 = Vec2::create(omp);
+        dalpha2->copy_from(alpha2.get());
         dbeta = Vec::create(omp);
         dbeta->copy_from(beta.get());
+        dbeta2 = Vec2::create(omp);
+        dbeta2->copy_from(beta2.get());
     }
 
     std::shared_ptr<gko::ReferenceExecutor> ref;
@@ -112,15 +127,23 @@ class Ell : public ::testing::Test {
 
     std::unique_ptr<Mtx> mtx;
     std::unique_ptr<Vec> expected;
+    std::unique_ptr<Vec2> expected2;
     std::unique_ptr<Vec> y;
+    std::unique_ptr<Vec2> y2;
     std::unique_ptr<Vec> alpha;
+    std::unique_ptr<Vec2> alpha2;
     std::unique_ptr<Vec> beta;
+    std::unique_ptr<Vec2> beta2;
 
     std::unique_ptr<Mtx> dmtx;
     std::unique_ptr<Vec> dresult;
+    std::unique_ptr<Vec2> dresult2;
     std::unique_ptr<Vec> dy;
+    std::unique_ptr<Vec2> dy2;
     std::unique_ptr<Vec> dalpha;
+    std::unique_ptr<Vec2> dalpha2;
     std::unique_ptr<Vec> dbeta;
+    std::unique_ptr<Vec2> dbeta2;
 };
 
 
@@ -135,6 +158,39 @@ TEST_F(Ell, SimpleApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1)
+{
+    set_up_apply_data();
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef2)
+{
+    set_up_apply_data();
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3)
+{
+    set_up_apply_data();
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -146,6 +202,39 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef2)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3)
+{
+    set_up_apply_data();
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, SimpleApplyWithPaddingIsEquivalentToRef)
 {
     set_up_apply_data(300, 600);
@@ -157,6 +246,39 @@ TEST_F(Ell, SimpleApplyWithPaddingIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyWithPaddingIsEquivalentToRef1)
+{
+    set_up_apply_data(300, 600);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithPaddingIsEquivalentToRef2)
+{
+    set_up_apply_data(300, 600);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithPaddingIsEquivalentToRef3)
+{
+    set_up_apply_data(300, 600);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyWithPaddingIsEquivalentToRef)
 {
     set_up_apply_data(300, 600);
@@ -167,6 +289,39 @@ TEST_F(Ell, AdvancedApplyWithPaddingIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyWithPaddingIsEquivalentToRef1)
+{
+    set_up_apply_data(300, 600);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithPaddingIsEquivalentToRef2)
+{
+    set_up_apply_data(300, 600);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithPaddingIsEquivalentToRef3)
+{
+    set_up_apply_data(300, 600);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, SimpleApplyToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(0, 0, 3);
@@ -178,6 +333,39 @@ TEST_F(Ell, SimpleApplyToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(0, 0, 3);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(0, 0, 3);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(0, 0, 3);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(0, 0, 3);
@@ -189,7 +377,40 @@ TEST_F(Ell, AdvancedApplyToDenseMatrixIsEquivalentToRef)
 }
 
 
-TEST_F(Ell, SimpleApplyWithPaddinToDenseMatrixIsEquivalentToRef)
+TEST_F(Ell, MixedAdvancedApplyToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(0, 0, 3);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(0, 0, 3);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(0, 0, 3);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, SimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(300, 600, 3);
 
@@ -200,6 +421,39 @@ TEST_F(Ell, SimpleApplyWithPaddinToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedSimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(300, 600, 3);
+
+    mtx->apply(y2.get(), expected2.get());
+    dmtx->apply(dy2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(300, 600, 3);
+
+    mtx->apply(y2.get(), expected.get());
+    dmtx->apply(dy2.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedSimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(300, 600, 3);
+
+    mtx->apply(y.get(), expected2.get());
+    dmtx->apply(dy.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, AdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef)
 {
     set_up_apply_data(300, 600, 3);
@@ -210,6 +464,39 @@ TEST_F(Ell, AdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Ell, MixedAdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef1)
+{
+    set_up_apply_data(300, 600, 3);
+
+    mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef2)
+{
+    set_up_apply_data(300, 600, 3);
+
+    mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get());
+    dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Ell, MixedAdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef3)
+{
+    set_up_apply_data(300, 600, 3);
+
+    mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get());
+    dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14);
+}
+
+
 TEST_F(Ell, ApplyToComplexIsEquivalentToRef)
 {
     set_up_apply_data(300, 600);
diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp
index fdbaff5dfc5..24f166e52db 100644
--- a/reference/matrix/ell_kernels.cpp
+++ b/reference/matrix/ell_kernels.cpp
@@ -39,6 +39,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "accessor/reduced_row_major.hpp"
+#include "core/base/mixed_precision_types.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace reference {
@@ -50,57 +54,87 @@ namespace reference {
 namespace ell {
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void spmv(std::shared_ptr<const ReferenceExecutor> exec,
-          const matrix::Ell<ValueType, IndexType> *a,
-          const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
+          const matrix::Ell<MatrixValueType, IndexType> *a,
+          const matrix::Dense<InputValueType> *b,
+          matrix::Dense<OutputValueType> *c)
 {
-    auto num_stored_elements_per_row = a->get_num_stored_elements_per_row();
+    using a_accessor =
+        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+    using b_accessor =
+        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+
+    const auto num_stored_elements_per_row =
+        a->get_num_stored_elements_per_row();
+    const auto stride = a->get_stride();
+    const auto a_vals = gko::acc::range<a_accessor>(
+        std::array<size_type, 1>{num_stored_elements_per_row * stride},
+        a->get_const_values());
+    const auto b_vals = gko::acc::range<b_accessor>(
+        std::array<size_type, 2>{{b->get_size()[0], b->get_size()[1]}},
+        b->get_const_values(), std::array<size_type, 1>{{b->get_stride()}});
 
     for (size_type row = 0; row < a->get_size()[0]; row++) {
         for (size_type j = 0; j < c->get_size()[1]; j++) {
-            c->at(row, j) = zero<ValueType>();
+            c->at(row, j) = zero<OutputValueType>();
         }
         for (size_type i = 0; i < num_stored_elements_per_row; i++) {
-            auto val = a->val_at(row, i);
+            auto val = a_vals(row + i * stride);
             auto col = a->col_at(row, i);
             for (size_type j = 0; j < c->get_size()[1]; j++) {
-                c->at(row, j) += val * b->at(col, j);
+                c->at(row, j) += val * b_vals(col, j);
             }
         }
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ELL_SPMV_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
+template <typename InputValueType, typename MatrixValueType,
+          typename OutputValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const ReferenceExecutor> exec,
-                   const matrix::Dense<ValueType> *alpha,
-                   const matrix::Ell<ValueType, IndexType> *a,
-                   const matrix::Dense<ValueType> *b,
-                   const matrix::Dense<ValueType> *beta,
-                   matrix::Dense<ValueType> *c)
+                   const matrix::Dense<MatrixValueType> *alpha,
+                   const matrix::Ell<MatrixValueType, IndexType> *a,
+                   const matrix::Dense<InputValueType> *b,
+                   const matrix::Dense<OutputValueType> *beta,
+                   matrix::Dense<OutputValueType> *c)
 {
-    auto num_stored_elements_per_row = a->get_num_stored_elements_per_row();
-    auto alpha_val = alpha->at(0, 0);
-    auto beta_val = beta->at(0, 0);
+    using a_accessor =
+        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+    using b_accessor =
+        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+
+    const auto num_stored_elements_per_row =
+        a->get_num_stored_elements_per_row();
+    const auto stride = a->get_stride();
+    const auto a_vals = gko::acc::range<a_accessor>(
+        std::array<size_type, 1>{num_stored_elements_per_row * stride},
+        a->get_const_values());
+    const auto b_vals = gko::acc::range<b_accessor>(
+        std::array<size_type, 2>{{b->get_size()[0], b->get_size()[1]}},
+        b->get_const_values(), std::array<size_type, 1>{{b->get_stride()}});
+    const auto alpha_val = OutputValueType(alpha->at(0, 0));
+    const auto beta_val = beta->at(0, 0);
 
     for (size_type row = 0; row < a->get_size()[0]; row++) {
         for (size_type j = 0; j < c->get_size()[1]; j++) {
             c->at(row, j) *= beta_val;
         }
         for (size_type i = 0; i < num_stored_elements_per_row; i++) {
-            auto val = a->val_at(row, i);
+            auto val = a_vals(row + i * stride);
             auto col = a->col_at(row, i);
             for (size_type j = 0; j < c->get_size()[1]; j++) {
-                c->at(row, j) += alpha_val * val * b->at(col, j);
+                c->at(row, j) += alpha_val * val * b_vals(col, j);
             }
         }
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
 
 
diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp
index 77436ad1287..c3326737e4f 100644
--- a/reference/test/matrix/ell_kernels.cpp
+++ b/reference/test/matrix/ell_kernels.cpp
@@ -120,11 +120,46 @@ TYPED_TEST(Ell, AppliesToDenseVector)
 }
 
 
-TYPED_TEST(Ell, AppliesToMixedDenseVector)
+TYPED_TEST(Ell, MixedAppliesToDenseVector1)
 {
-    using MixedVec = typename TestFixture::MixedVec;
-    auto x = gko::initialize<MixedVec>({2.0, 1.0, 4.0}, this->exec);
-    auto y = MixedVec::create(this->exec, gko::dim<2>{2, 1});
+    // Both vectors have the same value type which differs from the matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec = typename gko::matrix::Dense<next_T>;
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2, 1});
+
+    this->mtx1->apply(x.get(), y.get());
+
+    GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0);
+}
+
+
+TYPED_TEST(Ell, MixedAppliesToDenseVector2)
+{
+    // Input vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    auto x = gko::initialize<Vec1>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec2::create(this->exec, gko::dim<2>{2, 1});
+
+    this->mtx1->apply(x.get(), y.get());
+
+    GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0);
+}
+
+
+TYPED_TEST(Ell, MixedAppliesToDenseVector3)
+{
+    // Output vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<gko::next_precision<T>>;
+    auto x = gko::initialize<Vec2>({2.0, 1.0, 4.0}, this->exec);
+    auto y = Vec1::create(this->exec, gko::dim<2>{2, 1});
 
     this->mtx1->apply(x.get(), y.get());
 
@@ -154,6 +189,80 @@ TYPED_TEST(Ell, AppliesToDenseMatrix)
 }
 
 
+TYPED_TEST(Ell, MixedAppliesToDenseMatrix1)
+{
+    // Both vectors have the same value type which differs from the matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec = gko::matrix::Dense<next_T>;
+    // clang-format off
+    auto x = gko::initialize<Vec>(
+        {I<next_T>{2.0, 3.0},
+         I<next_T>{1.0, -1.5},
+         I<next_T>{4.0, 2.5}}, this->exec);
+    // clang-format on
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
+
+    this->mtx1->apply(x.get(), y.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y,
+                        l({{13.0,  3.5},
+                           { 5.0, -7.5}}), 0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Ell, MixedAppliesToDenseMatrix2)
+{
+    // Input vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    // clang-format off
+    auto x = gko::initialize<Vec1>(
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
+    // clang-format on
+    auto y = Vec2::create(this->exec, gko::dim<2>{2});
+
+    this->mtx1->apply(x.get(), y.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y,
+                        l({{13.0,  3.5},
+                           { 5.0, -7.5}}), 0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Ell, MixedAppliesToDenseMatrix3)
+{
+    // Output vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    // clang-format off
+    auto x = gko::initialize<Vec2>(
+        {I<next_T>{2.0, 3.0},
+         I<next_T>{1.0, -1.5},
+         I<next_T>{4.0, 2.5}}, this->exec);
+    // clang-format on
+    auto y = Vec1::create(this->exec, gko::dim<2>{2});
+
+    this->mtx1->apply(x.get(), y.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y,
+                        l({{13.0,  3.5},
+                           { 5.0, -7.5}}), 0.0);
+    // clang-format on
+}
+
+
 TYPED_TEST(Ell, AppliesLinearCombinationToDenseVector)
 {
     using Vec = typename TestFixture::Vec;
@@ -168,13 +277,52 @@ TYPED_TEST(Ell, AppliesLinearCombinationToDenseVector)
 }
 
 
-TYPED_TEST(Ell, AppliesLinearCombinationToMixedDenseVector)
+TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector1)
+{
+    // Both vectors have the same value type which differs from the matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec = gko::matrix::Dense<next_T>;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    auto x = gko::initialize<Vec>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec>({1.0, 2.0}, this->exec);
+
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+
+    GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0);
+}
+
+
+TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector2)
 {
-    using MixedVec = typename TestFixture::MixedVec;
-    auto alpha = gko::initialize<MixedVec>({-1.0}, this->exec);
-    auto beta = gko::initialize<MixedVec>({2.0}, this->exec);
-    auto x = gko::initialize<MixedVec>({2.0, 1.0, 4.0}, this->exec);
-    auto y = gko::initialize<MixedVec>({1.0, 2.0}, this->exec);
+    // Input vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    auto alpha = gko::initialize<Vec1>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec2>({2.0}, this->exec);
+    auto x = gko::initialize<Vec1>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec2>({1.0, 2.0}, this->exec);
+
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+
+    GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0);
+}
+
+
+TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector3)
+{
+    // Output vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    auto alpha = gko::initialize<Vec2>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec1>({2.0}, this->exec);
+    auto x = gko::initialize<Vec2>({2.0, 1.0, 4.0}, this->exec);
+    auto y = gko::initialize<Vec1>({1.0, 2.0}, this->exec);
 
     this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
 
@@ -208,6 +356,92 @@ TYPED_TEST(Ell, AppliesLinearCombinationToDenseMatrix)
 }
 
 
+TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix1)
+{
+    // Both vectors have the same value type which differs from the matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec = gko::matrix::Dense<next_T>;
+    auto alpha = gko::initialize<Vec>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec>({2.0}, this->exec);
+    // clang-format off
+    auto x = gko::initialize<Vec>(
+        {I<next_T>{2.0, 3.0},
+         I<next_T>{1.0, -1.5},
+         I<next_T>{4.0, 2.5}}, this->exec);
+    auto y = gko::initialize<Vec>(
+        {I<next_T>{1.0, 0.5},
+         I<next_T>{2.0, -1.5}}, this->exec);
+    // clang-format on
+
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y,
+                        l({{-11.0, -2.5},
+                           { -1.0,  4.5}}), 0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix2)
+{
+    // Input vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    auto alpha = gko::initialize<Vec1>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec2>({2.0}, this->exec);
+    // clang-format off
+    auto x = gko::initialize<Vec1>(
+        {I<T>{2.0, 3.0},
+         I<T>{1.0, -1.5},
+         I<T>{4.0, 2.5}}, this->exec);
+    auto y = gko::initialize<Vec2>(
+        {I<next_T>{1.0, 0.5},
+         I<next_T>{2.0, -1.5}}, this->exec);
+    // clang-format on
+
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y,
+                        l({{-11.0, -2.5},
+                           { -1.0,  4.5}}), 0.0);
+    // clang-format on
+}
+
+
+TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix3)
+{
+    // Output vector has same value type as matrix
+    using T = typename TestFixture::value_type;
+    using next_T = gko::next_precision<T>;
+    using Vec1 = typename TestFixture::Vec;
+    using Vec2 = gko::matrix::Dense<next_T>;
+    auto alpha = gko::initialize<Vec2>({-1.0}, this->exec);
+    auto beta = gko::initialize<Vec1>({2.0}, this->exec);
+    // clang-format off
+    auto x = gko::initialize<Vec2>(
+        {I<next_T>{2.0, 3.0},
+         I<next_T>{1.0, -1.5},
+         I<next_T>{4.0, 2.5}}, this->exec);
+    auto y = gko::initialize<Vec1>(
+        {I<T>{1.0, 0.5},
+         I<T>{2.0, -1.5}}, this->exec);
+    // clang-format on
+
+    this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get());
+
+    // clang-format off
+    GKO_ASSERT_MTX_NEAR(y,
+                        l({{-11.0, -2.5},
+                           { -1.0,  4.5}}), 0.0);
+    // clang-format on
+}
+
+
 TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension)
 {
     using Vec = typename TestFixture::Vec;