diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 6827c5fe476..79caf2169a4 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -15,8 +15,8 @@ jobs: fail-fast: false matrix: config: - - {shared: "ON", build_type: "Debug", name: "omp/debug/shared"} - - {shared: "OFF", build_type: "Release", name: "omp/release/static"} + - {shared: "ON", build_type: "Debug", name: "omp/debug/shared", "mixed": "OFF"} + - {shared: "OFF", build_type: "Release", name: "omp/release/static", "mixed": "ON"} name: ${{ matrix.config.name }} runs-on: [macos-latest] @@ -40,7 +40,7 @@ jobs: run: | mkdir build cd build - cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} + cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} make -j8 ctest -j10 --output-on-failure diff --git a/.github/workflows/windows-msvc-cuda.yml b/.github/workflows/windows-msvc-cuda.yml index bc4557e96bf..28468950af8 100644 --- a/.github/workflows/windows-msvc-cuda.yml +++ b/.github/workflows/windows-msvc-cuda.yml @@ -15,8 +15,8 @@ jobs: fail-fast: false matrix: config: - - {version: "10.2.89.20191206", name: "cuda102/release/shared"} - - {version: "latest", name: "cuda-latest/release/shared"} + - {version: "10.2.89.20191206", name: "cuda102/release/shared", "mixed": "ON"} + - {version: "latest", name: "cuda-latest/release/shared", "mixed": "OFF"} name: msvc/${{ matrix.config.name }} (only compile) runs-on: [windows-latest] @@ -46,5 +46,5 @@ jobs: $env:PATH="$env:PATH;$pwd\build\windows_shared_library" mkdir build cd build - cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_CUDA_ARCHITECTURES=60 .. + cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_CUDA_ARCHITECTURES=60 .. cmake --build . -j4 --config Release diff --git a/.github/workflows/windows-msvc-ref.yml b/.github/workflows/windows-msvc-ref.yml index 13a29f6cc51..833f3692676 100644 --- a/.github/workflows/windows-msvc-ref.yml +++ b/.github/workflows/windows-msvc-ref.yml @@ -15,8 +15,8 @@ jobs: fail-fast: false matrix: config: - - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"} - - {shared: "OFF", build_type: "Release", name: "reference/release/static"} + - {shared: "ON", build_type: "Debug", name: "reference/debug/shared", "mixed": "ON"} + - {shared: "OFF", build_type: "Release", name: "reference/release/static", "mixed": "OFF"} # Debug static needs too much storage # - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"} name: msvc/${{ matrix.config.name }} @@ -35,7 +35,7 @@ jobs: $env:PATH="$env:PATH;$pwd\build\windows_shared_library" mkdir build cd build - cmake -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF .. + cmake -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} .. cmake --build . -j4 --config ${{ matrix.config.build_type }} ctest . -C ${{ matrix.config.build_type }} --output-on-failure diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bbe85b1fbe8..11e66475301 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -32,6 +32,7 @@ include: BUILD_HWLOC: "ON" FAST_TESTS: "OFF" DPCPP_SINGLE_MODE: "OFF" + MIXED_PRECISION: "ON" RUN_EXAMPLES: "OFF" CONFIG_LOG: "ON" CXX_FLAGS: "" @@ -77,6 +78,7 @@ include: -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC} -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON -DGINKGO_FAST_TESTS=${FAST_TESTS} + -DGINKGO_MIXED_PRECISION=${MIXED_PRECISION} -DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES} -DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG} -DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE} @@ -111,6 +113,7 @@ include: -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC} -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON -DGINKGO_FAST_TESTS=${FAST_TESTS} + -DGINKGO_MIXED_PRECISION=${MIXED_PRECISION} -DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG} -DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE} -DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES} @@ -681,6 +684,44 @@ build/nocuda/intel/omp/release/static: BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "OFF" +build/nocuda-nomixed/gcc/omp/release/shared: + <<: *default_build_with_test + extends: + - .quick_test_condition + - .use_gko-nocuda-gnu9-llvm8 + variables: + <<: *default_variables + BUILD_OMP: "ON" + BUILD_TYPE: "Release" + MIXED_PRECISION: "OFF" + +build/nocuda-nomixed/clang/omp/debug/static: + <<: *default_build_with_test + extends: + - .full_test_condition + - .use_gko-nocuda-gnu9-llvm8 + variables: + <<: *default_variables + C_COMPILER: "clang" + CXX_COMPILER: "clang++" + BUILD_OMP: "ON" + BUILD_TYPE: "Debug" + BUILD_SHARED_LIBS: "OFF" + MIXED_PRECISION: "OFF" + +build/nocuda-nomixed/intel/omp/release/static: + <<: *default_build_with_test + extends: + - .full_test_condition + - .use_gko-nocuda-gnu9-llvm8-intel + variables: + <<: *default_variables + C_COMPILER: "icc" + CXX_COMPILER: "icpc" + BUILD_OMP: "ON" + BUILD_TYPE: "Release" + BUILD_SHARED_LIBS: "OFF" + MIXED_PRECISION: "OFF" build/dpcpp/cpu/release/static: <<: *default_build_with_test diff --git a/CMakeLists.txt b/CMakeLists.txt index 1318a6900d4..e999611aa57 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA}) option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP}) option(GINKGO_BUILD_DOC "Generate documentation" OFF) option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF) +option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF) option(GINKGO_SKIP_DEPENDENCY_UPDATE "Do not update dependencies each time the project is rebuilt" ON) option(GINKGO_EXPORT_BUILD_DIR diff --git a/INSTALL.md b/INSTALL.md index d85bcc48329..4fcc60d9ae4 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -18,6 +18,10 @@ Ginkgo adds the following additional switches to control what is being built: * `-DGINKGO_DEVEL_TOOLS={ON, OFF}` sets up the build system for development (requires clang-format, will also download git-cmake-format), default is `OFF`. +* `-DGINKGO_MIXED_PRECISION={ON, OFF}` compiles true mixed-precision kernels + instead of converting data on the fly, default is `OFF`. + Enabling this flag increases the library size, but improves performance of + mixed-precision kernels. * `-DGINKGO_BUILD_TESTS={ON, OFF}` builds Ginkgo's tests (will download googletest), default is `ON`. * `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index d25be4657ec..64de4e10f90 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -59,7 +59,8 @@ namespace formats { std::string available_format = - "coo, csr, ell, sellp, hybrid, hybrid0, hybrid25, hybrid33, hybrid40, " + "coo, csr, ell, ell-mixed, sellp, hybrid, hybrid0, hybrid25, hybrid33, " + "hybrid40, " "hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, " "hybridminstorage" #ifdef HAS_CUDA @@ -90,6 +91,8 @@ std::string format_description = "csrm: Ginkgo's CSR implementation with merge_path strategy.\n" "ell: Ellpack format according to Bell and Garland: Efficient Sparse " "Matrix-Vector Multiplication on CUDA.\n" + "ell-mixed: Mixed Precision Ellpack format according to Bell and Garland: " + "Efficient Sparse Matrix-Vector Multiplication on CUDA.\n" "sellp: Sliced Ellpack uses a default block size of 32.\n" "hybrid: Hybrid uses ell and coo to represent the matrix.\n" "hybrid0, hybrid25, hybrid33, hybrid40, hybrid60, hybrid80: Hybrid uses " @@ -204,6 +207,23 @@ const std::map( {"csrc", READ_MATRIX(csr, std::make_shared())}, {"coo", read_matrix_from_data>}, {"ell", read_matrix_from_data>}, + {"ell-mixed", + [](std::shared_ptr exec, + const gko::matrix_data &data) { + gko::matrix_data> conv_data; + conv_data.size = data.size; + conv_data.nonzeros.resize(data.nonzeros.size()); + auto it = conv_data.nonzeros.begin(); + for (auto &el : data.nonzeros) { + it->row = el.row; + it->column = el.column; + it->value = el.value; + ++it; + } + auto mat = gko::matrix::Ell>::create(std::move(exec)); + mat->read(conv_data); + return mat; + }}, #ifdef HAS_CUDA #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) {"cusp_csr", read_matrix_from_data}, @@ -212,8 +232,8 @@ const std::map( {"cusp_hybrid", read_matrix_from_data}, {"cusp_coo", read_matrix_from_data}, {"cusp_ell", read_matrix_from_data}, -#else // CUDA_VERSION >= 11000 - // cusp_csr, cusp_coo use the generic ones from CUDA 11 +#else // CUDA_VERSION >= 11000 + // cusp_csr, cusp_coo use the generic ones from CUDA 11 {"cusp_csr", read_matrix_from_data}, {"cusp_coo", read_matrix_from_data}, #endif @@ -260,7 +280,8 @@ const std::map( {"hybridminstorage", READ_MATRIX(hybrid, std::make_shared())}, - {"sellp", read_matrix_from_data>}}; + {"sellp", read_matrix_from_data>} +}; // clang-format on diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 835bdf0c45b..bae1e864a02 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -128,6 +128,9 @@ foreach(log_type ${log_types}) ginkgo_print_module_footer(${${log_type}} " Enabled modules:") ginkgo_print_foreach_variable(${${log_type}} "GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP") + ginkgo_print_module_footer(${${log_type}} " Enabled features:") + ginkgo_print_foreach_variable(${${log_type}} + "GINKGO_MIXED_PRECISION") ginkgo_print_module_footer(${${log_type}} " Tests, benchmarks and examples:") ginkgo_print_foreach_variable(${${log_type}} "GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING") diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc index 00eb2d6871d..2323d512258 100644 --- a/common/matrix/ell_kernels.hpp.inc +++ b/common/matrix/ell_kernels.hpp.inc @@ -34,29 +34,30 @@ namespace kernel { namespace { -template +template __device__ void spmv_kernel( const size_type num_rows, const int num_worker_per_row, - const ValueType *__restrict__ val, const IndexType *__restrict__ col, + acc::range val, const IndexType *__restrict__ col, const size_type stride, const size_type num_stored_elements_per_row, - const ValueType *__restrict__ b, const size_type b_stride, - ValueType *__restrict__ c, const size_type c_stride, Closure op) + acc::range b, OutputValueType *__restrict__ c, + const size_type c_stride, Closure op) { const auto tidx = thread::get_thread_id_flat(); - const auto column_id = blockIdx.y; + const decltype(tidx) column_id = blockIdx.y; if (num_thread_per_worker == 1) { // Specialize the num_thread_per_worker = 1. It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { - ValueType temp = zero(); + auto temp = zero(); for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; if (col_idx < idx) { break; } else { - temp += val[ind] * b[col_idx * b_stride + column_id]; + temp += val(ind) * b(col_idx, column_id); } } const auto c_ind = tidx * c_stride + column_id; @@ -68,14 +69,14 @@ __device__ void spmv_kernel( const auto x = tidx % num_rows; const auto worker_id = tidx / num_rows; const auto step_size = num_worker_per_row * num_thread_per_worker; - __shared__ UninitializedArray + __shared__ UninitializedArray< + OutputValueType, default_block_size / num_thread_per_worker> storage; if (idx_in_worker == 0) { storage[threadIdx.x] = 0; } __syncthreads(); - ValueType temp = zero(); + auto temp = zero(); for (size_type idx = worker_id * num_thread_per_worker + idx_in_worker; idx < num_stored_elements_per_row; idx += step_size) { @@ -84,7 +85,7 @@ __device__ void spmv_kernel( if (col_idx < idx) { break; } else { - temp += val[ind] * b[col_idx * b_stride + column_id]; + temp += val(ind) * b(col_idx, column_id); } } atomic_add(&storage[threadIdx.x], temp); @@ -102,35 +103,34 @@ __device__ void spmv_kernel( } -template +template __global__ __launch_bounds__(default_block_size) void spmv( const size_type num_rows, const int num_worker_per_row, - const ValueType *__restrict__ val, const IndexType *__restrict__ col, + acc::range val, const IndexType *__restrict__ col, const size_type stride, const size_type num_stored_elements_per_row, - const ValueType *__restrict__ b, const size_type b_stride, - ValueType *__restrict__ c, const size_type c_stride) + acc::range b, OutputValueType *__restrict__ c, + const size_type c_stride) { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, b_stride, c, c_stride, - [](const ValueType &x, const ValueType &y) { return x; }); + num_stored_elements_per_row, b, c, c_stride, + [](const OutputValueType &x, const OutputValueType &y) { return x; }); } -template +template __global__ __launch_bounds__(default_block_size) void spmv( const size_type num_rows, const int num_worker_per_row, - const ValueType *__restrict__ alpha, const ValueType *__restrict__ val, + acc::range alpha, acc::range val, const IndexType *__restrict__ col, const size_type stride, - const size_type num_stored_elements_per_row, - const ValueType *__restrict__ b, const size_type b_stride, - const ValueType *__restrict__ beta, ValueType *__restrict__ c, + const size_type num_stored_elements_per_row, acc::range b, + const OutputValueType *__restrict__ beta, OutputValueType *__restrict__ c, const size_type c_stride) { - const ValueType alpha_val = alpha[0]; - const ValueType beta_val = beta[0]; + const OutputValueType alpha_val = alpha(0); + const OutputValueType beta_val = beta[0]; // Because the atomic operation changes the values of c during computation, // it can not do the right alpha * a * b + beta * c operation. // Thus, the cuda kernel only computes alpha * a * b when it uses atomic @@ -138,15 +138,16 @@ __global__ __launch_bounds__(default_block_size) void spmv( if (atomic) { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, b_stride, c, c_stride, - [&alpha_val](const ValueType &x, const ValueType &y) { + num_stored_elements_per_row, b, c, c_stride, + [&alpha_val](const OutputValueType &x, const OutputValueType &y) { return alpha_val * x; }); } else { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, b_stride, c, c_stride, - [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) { + num_stored_elements_per_row, b, c, c_stride, + [&alpha_val, &beta_val](const OutputValueType &x, + const OutputValueType &y) { return alpha_val * x + beta_val * y; }); } diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp new file mode 100644 index 00000000000..d725eff82bf --- /dev/null +++ b/core/base/mixed_precision_types.hpp @@ -0,0 +1,83 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_ +#define GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_ + + +#include +#include + + +#ifdef GINKGO_MIXED_PRECISION +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ + template _macro(float, float, float, __VA_ARGS__); \ + template _macro(float, float, double, __VA_ARGS__); \ + template _macro(float, double, float, __VA_ARGS__); \ + template _macro(float, double, double, __VA_ARGS__); \ + template _macro(double, float, float, __VA_ARGS__); \ + template _macro(double, float, double, __VA_ARGS__); \ + template _macro(double, double, float, __VA_ARGS__); \ + template _macro(double, double, double, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__) +#else +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ + template _macro(float, float, float, __VA_ARGS__); \ + template _macro(double, double, double, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__) +#endif + + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int32); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int64) + + +#endif // GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_ diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 877051c3e04..810a3fab4e4 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/mixed_precision_types.hpp" #include "core/components/absolute_array.hpp" #include "core/components/fill_array.hpp" #include "core/components/precision_conversion.hpp" @@ -931,15 +932,20 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( namespace ell { -template -GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType) +template +GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, OutputValueType, + IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_SPMV_KERNEL); -template -GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType) +template +GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType, + OutputValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); template diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 0dc9a950f75..fcb198106cd 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -102,7 +102,7 @@ size_type calculate_max_nnz_per_row( template void Ell::apply_impl(const LinOp *b, LinOp *x) const { - precision_dispatch_real_complex( + mixed_precision_dispatch_real_complex( [this](auto dense_b, auto dense_x) { this->get_executor()->run(ell::make_spmv(this, dense_b, dense_x)); }, @@ -114,12 +114,15 @@ template void Ell::apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta, LinOp *x) const { - precision_dispatch_real_complex( - [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { + mixed_precision_dispatch_real_complex( + [this, alpha, beta](auto dense_b, auto dense_x) { + auto dense_alpha = make_temporary_conversion(alpha); + auto dense_beta = make_temporary_conversion< + typename std::decay_t::value_type>(beta); this->get_executor()->run(ell::make_advanced_spmv( - dense_alpha, this, dense_b, dense_beta, dense_x)); + dense_alpha.get(), this, dense_b, dense_beta.get(), dense_x)); }, - alpha, b, beta, x); + b, x); } diff --git a/core/matrix/ell_kernels.hpp b/core/matrix/ell_kernels.hpp index 7a1ec261d15..7cbe54b3720 100644 --- a/core/matrix/ell_kernels.hpp +++ b/core/matrix/ell_kernels.hpp @@ -46,18 +46,21 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType) \ - void spmv(std::shared_ptr exec, \ - const matrix::Ell *a, \ - const matrix::Dense *b, matrix::Dense *c) - -#define GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType) \ - void advanced_spmv(std::shared_ptr exec, \ - const matrix::Dense *alpha, \ - const matrix::Ell *a, \ - const matrix::Dense *b, \ - const matrix::Dense *beta, \ - matrix::Dense *c) +#define GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, \ + OutputValueType, IndexType) \ + void spmv(std::shared_ptr exec, \ + const matrix::Ell *a, \ + const matrix::Dense *b, \ + matrix::Dense *c) + +#define GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType, \ + OutputValueType, IndexType) \ + void advanced_spmv(std::shared_ptr exec, \ + const matrix::Dense *alpha, \ + const matrix::Ell *a, \ + const matrix::Dense *b, \ + const matrix::Dense *beta, \ + matrix::Dense *c) #define GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ void convert_to_dense(std::shared_ptr exec, \ @@ -87,10 +90,14 @@ namespace kernels { matrix::Diagonal *diag) #define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, \ + OutputValueType, IndexType); \ + template \ + GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType, \ + OutputValueType, IndexType); \ template \ GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ template \ diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 9bc47f71e63..953ce4be8d7 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -43,6 +43,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array.hpp" #include "core/components/prefix_sum.hpp" #include "core/matrix/dense_kernels.hpp" @@ -107,16 +109,37 @@ using compiled_kernels = syn::value_list; namespace { +template +GKO_INLINE auto as_cuda_accessor( + const acc::range> &acc) +{ + return acc::range< + acc::reduced_row_major, cuda_type>>( + acc.get_accessor().get_size(), + as_cuda_type(acc.get_accessor().get_stored_data()), + acc.get_accessor().get_stride()); +} + -template +template void abstract_spmv(syn::value_list, int num_worker_per_row, - const matrix::Ell *a, - const matrix::Dense *b, - matrix::Dense *c, - const matrix::Dense *alpha = nullptr, - const matrix::Dense *beta = nullptr) + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c, + const matrix::Dense *alpha = nullptr, + const matrix::Dense *beta = nullptr) { + using a_accessor = + gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + using b_accessor = + gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + const auto nrows = a->get_size()[0]; + const auto stride = a->get_stride(); + const auto num_stored_elements_per_row = + a->get_num_stored_elements_per_row(); + constexpr int num_thread_per_worker = (info == 0) ? max_thread_per_worker : info; constexpr bool atomic = (info == 0); @@ -124,22 +147,29 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, num_thread_per_worker, 1); const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), b->get_size()[1], 1); + + const auto a_vals = gko::acc::range( + std::array{{num_stored_elements_per_row * stride}}, + a->get_const_values()); + const auto b_vals = gko::acc::range( + std::array{{b->get_size()[0], b->get_size()[1]}}, + b->get_const_values(), std::array{{b->get_stride()}}); + if (alpha == nullptr && beta == nullptr) { kernel::spmv <<>>( - nrows, num_worker_per_row, as_cuda_type(a->get_const_values()), - a->get_const_col_idxs(), a->get_stride(), - a->get_num_stored_elements_per_row(), - as_cuda_type(b->get_const_values()), b->get_stride(), - as_cuda_type(c->get_values()), c->get_stride()); + nrows, num_worker_per_row, as_cuda_accessor(a_vals), + a->get_const_col_idxs(), stride, num_stored_elements_per_row, + as_cuda_accessor(b_vals), as_cuda_type(c->get_values()), + c->get_stride()); } else if (alpha != nullptr && beta != nullptr) { + const auto alpha_val = gko::acc::range( + std::array{1}, alpha->get_const_values()); kernel::spmv <<>>( - nrows, num_worker_per_row, - as_cuda_type(alpha->get_const_values()), - as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), - a->get_stride(), a->get_num_stored_elements_per_row(), - as_cuda_type(b->get_const_values()), b->get_stride(), + nrows, num_worker_per_row, as_cuda_accessor(alpha_val), + as_cuda_accessor(a_vals), a->get_const_col_idxs(), stride, + num_stored_elements_per_row, as_cuda_accessor(b_vals), as_cuda_type(beta->get_const_values()), as_cuda_type(c->get_values()), c->get_stride()); } else { @@ -194,10 +224,12 @@ std::array compute_thread_worker_and_atomicity( } // namespace -template +template void spmv(std::shared_ptr exec, - const matrix::Ell *a, - const matrix::Dense *b, matrix::Dense *c) + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c) { const auto data = compute_thread_worker_and_atomicity(exec, a); const int num_thread_per_worker = std::get<0>(data); @@ -212,7 +244,8 @@ void spmv(std::shared_ptr exec, const int info = (!atomic) * num_thread_per_worker; if (atomic) { components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), zero()); + c->get_num_stored_elements(), + zero()); } select_abstract_spmv( compiled_kernels(), @@ -221,16 +254,18 @@ void spmv(std::shared_ptr exec, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense *alpha, - const matrix::Ell *a, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) + const matrix::Dense *alpha, + const matrix::Ell *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) { const auto data = compute_thread_worker_and_atomicity(exec, a); const int num_thread_per_worker = std::get<0>(data); @@ -253,7 +288,7 @@ void advanced_spmv(std::shared_ptr exec, alpha, beta); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index dff573a284f..2df1c397f4c 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -58,9 +58,12 @@ class Ell : public ::testing::Test { protected: using Mtx = gko::matrix::Ell<>; using Vec = gko::matrix::Dense<>; + using Vec2 = gko::matrix::Dense; using ComplexVec = gko::matrix::Dense>; - Ell() : rand_engine(42) {} + Ell() + : rand_engine(42), size{532, 231}, num_els_rowwise{300}, ell_stride{600} + {} void SetUp() { @@ -92,38 +95,62 @@ class Ell : public ::testing::Test { stride); mtx->copy_from(gen_mtx(num_rows, num_cols)); expected = gen_mtx(num_rows, num_vectors); + expected2 = Vec2::create(ref); + expected2->copy_from(expected.get()); y = gen_mtx(num_cols, num_vectors); + y2 = Vec2::create(ref); + y2->copy_from(y.get()); alpha = gko::initialize({2.0}, ref); + alpha2 = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); + beta2 = gko::initialize({-1.0}, ref); dmtx = Mtx::create(cuda); dmtx->copy_from(mtx.get()); dresult = Vec::create(cuda); dresult->copy_from(expected.get()); + dresult2 = Vec2::create(cuda); + dresult2->copy_from(expected2.get()); dy = Vec::create(cuda); dy->copy_from(y.get()); + dy2 = Vec2::create(cuda); + dy2->copy_from(y2.get()); dalpha = Vec::create(cuda); dalpha->copy_from(alpha.get()); + dalpha2 = Vec2::create(cuda); + dalpha2->copy_from(alpha2.get()); dbeta = Vec::create(cuda); dbeta->copy_from(beta.get()); + dbeta2 = Vec2::create(cuda); + dbeta2->copy_from(beta2.get()); } - std::shared_ptr ref; std::shared_ptr cuda; std::ranlux48 rand_engine; + gko::dim<2> size; + gko::size_type num_els_rowwise; + gko::size_type ell_stride; std::unique_ptr mtx; std::unique_ptr expected; + std::unique_ptr expected2; std::unique_ptr y; + std::unique_ptr y2; std::unique_ptr alpha; + std::unique_ptr alpha2; std::unique_ptr beta; + std::unique_ptr beta2; std::unique_ptr dmtx; std::unique_ptr dresult; + std::unique_ptr dresult2; std::unique_ptr dy; + std::unique_ptr dy2; std::unique_ptr dalpha; + std::unique_ptr dalpha2; std::unique_ptr dbeta; + std::unique_ptr dbeta2; }; @@ -138,6 +165,39 @@ TEST_F(Ell, SimpleApplyIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1) +{ + set_up_apply_data(); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef2) +{ + set_up_apply_data(); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3) +{ + set_up_apply_data(); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + TEST_F(Ell, AdvancedApplyIsEquivalentToRef) { set_up_apply_data(); @@ -149,9 +209,42 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1) +{ + set_up_apply_data(); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef2) +{ + set_up_apply_data(); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3) +{ + set_up_apply_data(); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(y.get(), expected.get()); dmtx->apply(dy.get(), dresult.get()); @@ -160,9 +253,42 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1) +{ + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2) +{ + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3) +{ + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); @@ -170,9 +296,42 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1) +{ + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2) +{ + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3) +{ + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(y.get(), expected.get()); dmtx->apply(dy.get(), dresult.get()); @@ -181,9 +340,42 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); @@ -192,6 +384,39 @@ TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + TEST_F(Ell, SimpleApplyByAtomicIsEquivalentToRef) { set_up_apply_data(10, 10000); @@ -283,10 +508,10 @@ TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef) TEST_F(Ell, ApplyToComplexIsEquivalentToRef) { set_up_apply_data(); - auto complex_b = gen_mtx(231, 3); + auto complex_b = gen_mtx(size[1], 3); auto dcomplex_b = ComplexVec::create(cuda); dcomplex_b->copy_from(complex_b.get()); - auto complex_x = gen_mtx(532, 3); + auto complex_x = gen_mtx(size[0], 3); auto dcomplex_x = ComplexVec::create(cuda); dcomplex_x->copy_from(complex_x.get()); @@ -300,10 +525,10 @@ TEST_F(Ell, ApplyToComplexIsEquivalentToRef) TEST_F(Ell, AdvancedApplyToComplexIsEquivalentToRef) { set_up_apply_data(); - auto complex_b = gen_mtx(231, 3); + auto complex_b = gen_mtx(size[1], 3); auto dcomplex_b = ComplexVec::create(cuda); dcomplex_b->copy_from(complex_b.get()); - auto complex_x = gen_mtx(532, 3); + auto complex_x = gen_mtx(size[0], 3); auto dcomplex_x = ComplexVec::create(cuda); dcomplex_x->copy_from(complex_x.get()); diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 01a40b5d9b5..3c1d93eb323 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/mixed_precision_types.hpp" #include "dpcpp/components/format_conversion.dp.hpp" @@ -56,24 +57,27 @@ namespace dpcpp { namespace ell { -template +template void spmv(std::shared_ptr exec, - const matrix::Ell *a, - const matrix::Dense *b, - matrix::Dense *c) GKO_NOT_IMPLEMENTED; + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense *alpha, - const matrix::Ell *a, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) GKO_NOT_IMPLEMENTED; + const matrix::Dense *alpha, + const matrix::Ell *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 2e7a4eeeb50..b1ad7bf4662 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -46,6 +46,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array.hpp" #include "core/components/prefix_sum.hpp" #include "core/matrix/dense_kernels.hpp" @@ -111,15 +113,37 @@ using compiled_kernels = syn::value_list; namespace { -template +template +GKO_INLINE auto as_hip_accessor( + const acc::range> &acc) +{ + return acc::range< + acc::reduced_row_major, hip_type>>( + acc.get_accessor().get_size(), + as_hip_type(acc.get_accessor().get_stored_data()), + acc.get_accessor().get_stride()); +} + + +template void abstract_spmv(syn::value_list, int num_worker_per_row, - const matrix::Ell *a, - const matrix::Dense *b, - matrix::Dense *c, - const matrix::Dense *alpha = nullptr, - const matrix::Dense *beta = nullptr) + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c, + const matrix::Dense *alpha = nullptr, + const matrix::Dense *beta = nullptr) { + using a_accessor = + gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + using b_accessor = + gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + const auto nrows = a->get_size()[0]; + const auto stride = a->get_stride(); + const auto num_stored_elements_per_row = + a->get_num_stored_elements_per_row(); + constexpr int num_thread_per_worker = (info == 0) ? max_thread_per_worker : info; constexpr bool atomic = (info == 0); @@ -127,24 +151,31 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, num_thread_per_worker, 1); const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), b->get_size()[1], 1); + + const auto a_vals = gko::acc::range( + std::array{{num_stored_elements_per_row * stride}}, + a->get_const_values()); + const auto b_vals = gko::acc::range( + std::array{{b->get_size()[0], b->get_size()[1]}}, + b->get_const_values(), std::array{{b->get_stride()}}); + if (alpha == nullptr && beta == nullptr) { hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::spmv), dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row, - as_hip_type(a->get_const_values()), a->get_const_col_idxs(), - a->get_stride(), a->get_num_stored_elements_per_row(), - as_hip_type(b->get_const_values()), b->get_stride(), + as_hip_accessor(a_vals), a->get_const_col_idxs(), stride, + num_stored_elements_per_row, as_hip_accessor(b_vals), as_hip_type(c->get_values()), c->get_stride()); } else if (alpha != nullptr && beta != nullptr) { + const auto alpha_val = gko::acc::range( + std::array{1}, alpha->get_const_values()); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::spmv), dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row, - as_hip_type(alpha->get_const_values()), - as_hip_type(a->get_const_values()), a->get_const_col_idxs(), - a->get_stride(), a->get_num_stored_elements_per_row(), - as_hip_type(b->get_const_values()), b->get_stride(), - as_hip_type(beta->get_const_values()), as_hip_type(c->get_values()), - c->get_stride()); + as_hip_accessor(alpha_val), as_hip_accessor(a_vals), + a->get_const_col_idxs(), stride, num_stored_elements_per_row, + as_hip_accessor(b_vals), as_hip_type(beta->get_const_values()), + as_hip_type(c->get_values()), c->get_stride()); } else { GKO_KERNEL_NOT_FOUND; } @@ -197,10 +228,12 @@ std::array compute_thread_worker_and_atomicity( } // namespace -template +template void spmv(std::shared_ptr exec, - const matrix::Ell *a, - const matrix::Dense *b, matrix::Dense *c) + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c) { const auto data = compute_thread_worker_and_atomicity(exec, a); const int num_thread_per_worker = std::get<0>(data); @@ -215,7 +248,8 @@ void spmv(std::shared_ptr exec, const int info = (!atomic) * num_thread_per_worker; if (atomic) { components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), zero()); + c->get_num_stored_elements(), + zero()); } select_abstract_spmv( compiled_kernels(), @@ -224,16 +258,18 @@ void spmv(std::shared_ptr exec, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense *alpha, - const matrix::Ell *a, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) + const matrix::Dense *alpha, + const matrix::Ell *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) { const auto data = compute_thread_worker_and_atomicity(exec, a); const int num_thread_per_worker = std::get<0>(data); @@ -256,7 +292,7 @@ void advanced_spmv(std::shared_ptr exec, alpha, beta); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); diff --git a/hip/test/matrix/ell_kernels.hip.cpp b/hip/test/matrix/ell_kernels.hip.cpp index e9a875eb544..13c81fdafcf 100644 --- a/hip/test/matrix/ell_kernels.hip.cpp +++ b/hip/test/matrix/ell_kernels.hip.cpp @@ -58,9 +58,12 @@ class Ell : public ::testing::Test { protected: using Mtx = gko::matrix::Ell<>; using Vec = gko::matrix::Dense<>; + using Vec2 = gko::matrix::Dense; using ComplexVec = gko::matrix::Dense>; - Ell() : rand_engine(42) {} + Ell() + : rand_engine(42), size{532, 231}, num_els_rowwise{300}, ell_stride{600} + {} void SetUp() { @@ -92,19 +95,33 @@ class Ell : public ::testing::Test { stride); mtx->copy_from(gen_mtx(num_rows, num_cols)); expected = gen_mtx(num_rows, num_vectors); + expected2 = Vec2::create(ref); + expected2->copy_from(expected.get()); y = gen_mtx(num_cols, num_vectors); + y2 = Vec2::create(ref); + y2->copy_from(y.get()); alpha = gko::initialize({2.0}, ref); + alpha2 = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); + beta2 = gko::initialize({-1.0}, ref); dmtx = Mtx::create(hip); dmtx->copy_from(mtx.get()); dresult = Vec::create(hip); dresult->copy_from(expected.get()); + dresult2 = Vec2::create(hip); + dresult2->copy_from(expected2.get()); dy = Vec::create(hip); dy->copy_from(y.get()); + dy2 = Vec2::create(hip); + dy2->copy_from(y2.get()); dalpha = Vec::create(hip); dalpha->copy_from(alpha.get()); + dalpha2 = Vec2::create(hip); + dalpha2->copy_from(alpha2.get()); dbeta = Vec::create(hip); dbeta->copy_from(beta.get()); + dbeta2 = Vec2::create(hip); + dbeta2->copy_from(beta2.get()); } @@ -112,18 +129,29 @@ class Ell : public ::testing::Test { std::shared_ptr hip; std::ranlux48 rand_engine; + gko::dim<2> size; + gko::size_type num_els_rowwise; + gko::size_type ell_stride; std::unique_ptr mtx; std::unique_ptr expected; + std::unique_ptr expected2; std::unique_ptr y; + std::unique_ptr y2; std::unique_ptr alpha; + std::unique_ptr alpha2; std::unique_ptr beta; + std::unique_ptr beta2; std::unique_ptr dmtx; std::unique_ptr dresult; + std::unique_ptr dresult2; std::unique_ptr dy; + std::unique_ptr dy2; std::unique_ptr dalpha; + std::unique_ptr dalpha2; std::unique_ptr dbeta; + std::unique_ptr dbeta2; }; @@ -138,6 +166,39 @@ TEST_F(Ell, SimpleApplyIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1) +{ + set_up_apply_data(); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef2) +{ + set_up_apply_data(); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3) +{ + set_up_apply_data(); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + TEST_F(Ell, AdvancedApplyIsEquivalentToRef) { set_up_apply_data(); @@ -149,9 +210,42 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1) +{ + set_up_apply_data(); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef2) +{ + set_up_apply_data(); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3) +{ + set_up_apply_data(); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(y.get(), expected.get()); dmtx->apply(dy.get(), dresult.get()); @@ -160,9 +254,42 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1) +{ + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2) +{ + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3) +{ + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); @@ -170,9 +297,42 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1) +{ + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2) +{ + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3) +{ + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(y.get(), expected.get()); dmtx->apply(dy.get(), dresult.get()); @@ -181,9 +341,42 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); @@ -192,6 +385,39 @@ TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); +} + + TEST_F(Ell, SimpleApplyByAtomicIsEquivalentToRef) { set_up_apply_data(10, 10000); @@ -283,10 +509,10 @@ TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef) TEST_F(Ell, ApplyToComplexIsEquivalentToRef) { set_up_apply_data(); - auto complex_b = gen_mtx(231, 3); + auto complex_b = gen_mtx(size[1], 3); auto dcomplex_b = ComplexVec::create(hip); dcomplex_b->copy_from(complex_b.get()); - auto complex_x = gen_mtx(532, 3); + auto complex_x = gen_mtx(size[0], 3); auto dcomplex_x = ComplexVec::create(hip); dcomplex_x->copy_from(complex_x.get()); @@ -300,10 +526,10 @@ TEST_F(Ell, ApplyToComplexIsEquivalentToRef) TEST_F(Ell, AdvancedApplyToComplexIsEquivalentToRef) { set_up_apply_data(); - auto complex_b = gen_mtx(231, 3); + auto complex_b = gen_mtx(size[1], 3); auto dcomplex_b = ComplexVec::create(hip); dcomplex_b->copy_from(complex_b.get()); - auto complex_x = gen_mtx(532, 3); + auto complex_x = gen_mtx(size[0], 3); auto dcomplex_x = ComplexVec::create(hip); dcomplex_x->copy_from(complex_x.get()); diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index edef91e6d9e..1c6a31ea481 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -63,6 +63,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #cmakedefine GINKGO_BENCHMARK_ENABLE_TUNING +/* Should we compile mixed-precision kernels for Ginkgo? */ +#cmakedefine GINKGO_MIXED_PRECISION + + /* What is HIP compiled for, hcc or nvcc? */ // clang-format off #define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@ diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 5588d566b41..376f126f5c5 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -226,6 +226,110 @@ void precision_dispatch_real_complex(Function fn, const LinOp *alpha, } +/** + * Calls the given function with each given argument LinOp + * converted into matrix::Dense as parameters. + * + * If GINKGO_MIXED_PRECISION is defined, this means that the function will be + * called with its dynamic type as a static type, so the (templated/generic) + * function will be instantiated with all pairs of Dense and + * Dense> parameter types, and the appropriate + * overload will be called based on the dynamic type of the parameter. + * + * If GINKGO_MIXED_PRECISION is not defined, it will behave exactly like + * precision_dispatch. + * + * @param fn the given function. It will be called with one const and one + * non-const matrix::Dense<...> parameter based on the dynamic type + * of the inputs (GINKGO_MIXED_PRECISION) or of type + * matrix::Dense (no GINKGO_MIXED_PRECISION). + * @param in The first parameter to be cast (GINKGO_MIXED_PRECISION) or + * converted (no GINKGO_MIXED_PRECISION) and used to call `fn`. + * @param out The second parameter to be cast (GINKGO_MIXED_PRECISION) or + * converted (no GINKGO_MIXED_PRECISION) and used to call `fn`. + * + * @tparam ValueType the value type to use for the parameters of `fn` (no + * GINKGO_MIXED_PRECISION). With GINKGO_MIXED_PRECISION + * enabled, it only matters whether this type is complex or + * real. + * @tparam Function the function pointer, lambda or other functor type to call + * with the converted arguments. + */ +template +void mixed_precision_dispatch(Function fn, const LinOp *in, LinOp *out) +{ +#ifdef GINKGO_MIXED_PRECISION + using fst_type = matrix::Dense; + using snd_type = matrix::Dense>; + if (auto dense_in = dynamic_cast(in)) { + if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else { + GKO_NOT_SUPPORTED(out); + } + } else if (auto dense_in = dynamic_cast(in)) { + if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else { + GKO_NOT_SUPPORTED(out); + } + } else { + GKO_NOT_SUPPORTED(in); + } +#else + precision_dispatch(fn, in, out); +#endif +} + + +/** + * Calls the given function with the given LinOps cast to their dynamic type + * matrix::Dense* as parameters. + * If ValueType is real and both `in` and `out` are complex, uses + * matrix::Dense::get_real_view() to convert them into real matrices after + * precision conversion. + * + * @see mixed_precision_dispatch() + */ +template ()> * = nullptr> +void mixed_precision_dispatch_real_complex(Function fn, const LinOp *in, + LinOp *out) +{ +#ifdef GINKGO_MIXED_PRECISION + mixed_precision_dispatch(fn, in, out); +#else + precision_dispatch(fn, in, out); +#endif +} + + +template ()> * = nullptr> +void mixed_precision_dispatch_real_complex(Function fn, const LinOp *in, + LinOp *out) +{ +#ifdef GINKGO_MIXED_PRECISION + if (!dynamic_cast> *>(in)) { + mixed_precision_dispatch>( + [&fn](auto dense_in, auto dense_out) { + fn(dense_in->create_real_view().get(), + dense_out->create_real_view().get()); + }, + in, out); + } else { + mixed_precision_dispatch(fn, in, out); + } +#else + precision_dispatch_real_complex(fn, in, out); +#endif +} + + } // namespace gko diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index a590357a1c2..96f4d5347d2 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -42,6 +42,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" #include "omp/components/format_conversion.hpp" @@ -56,42 +58,72 @@ namespace omp { namespace ell { -template +template void spmv(std::shared_ptr exec, - const matrix::Ell *a, - const matrix::Dense *b, matrix::Dense *c) + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c) { - auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); + using a_accessor = + gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + using b_accessor = + gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + + const auto num_stored_elements_per_row = + a->get_num_stored_elements_per_row(); + const auto stride = a->get_stride(); + const auto a_vals = gko::acc::range( + std::array{num_stored_elements_per_row * stride}, + a->get_const_values()); + const auto b_vals = gko::acc::range( + std::array{{b->get_size()[0], b->get_size()[1]}}, + b->get_const_values(), std::array{{b->get_stride()}}); #pragma omp parallel for for (size_type row = 0; row < a->get_size()[0]; row++) { for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) = zero(); + c->at(row, j) = zero(); } for (size_type i = 0; i < num_stored_elements_per_row; i++) { - auto val = a->val_at(row, i); + auto val = a_vals(row + i * stride); auto col = a->col_at(row, i); for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) += val * b->at(col, j); + c->at(row, j) += val * b_vals(col, j); } } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense *alpha, - const matrix::Ell *a, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) + const matrix::Dense *alpha, + const matrix::Ell *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) { - auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); - auto alpha_val = alpha->at(0, 0); - auto beta_val = beta->at(0, 0); + using a_accessor = + gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + using b_accessor = + gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + + const auto num_stored_elements_per_row = + a->get_num_stored_elements_per_row(); + const auto stride = a->get_stride(); + const auto a_vals = gko::acc::range( + std::array{num_stored_elements_per_row * stride}, + a->get_const_values()); + const auto b_vals = gko::acc::range( + std::array{{b->get_size()[0], b->get_size()[1]}}, + b->get_const_values(), std::array{{b->get_stride()}}); + const auto alpha_val = OutputValueType(alpha->at(0, 0)); + const auto beta_val = beta->at(0, 0); #pragma omp parallel for for (size_type row = 0; row < a->get_size()[0]; row++) { @@ -99,16 +131,16 @@ void advanced_spmv(std::shared_ptr exec, c->at(row, j) *= beta_val; } for (size_type i = 0; i < num_stored_elements_per_row; i++) { - auto val = a->val_at(row, i); + auto val = a_vals(row + i * stride); auto col = a->col_at(row, i); for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) += alpha_val * val * b->at(col, j); + c->at(row, j) += alpha_val * val * b_vals(col, j); } } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); diff --git a/omp/test/matrix/ell_kernels.cpp b/omp/test/matrix/ell_kernels.cpp index 281292e9684..01f1a31121d 100644 --- a/omp/test/matrix/ell_kernels.cpp +++ b/omp/test/matrix/ell_kernels.cpp @@ -57,6 +57,7 @@ class Ell : public ::testing::Test { protected: using Mtx = gko::matrix::Ell<>; using Vec = gko::matrix::Dense<>; + using Vec2 = gko::matrix::Dense; using ComplexVec = gko::matrix::Dense>; Ell() : rand_engine(42) {} @@ -90,19 +91,33 @@ class Ell : public ::testing::Test { mtx = Mtx::create(ref, gko::dim<2>{}, max_nonzeros_per_row, stride); mtx->copy_from(gen_mtx(532, 231, 1)); expected = gen_mtx(532, num_vectors, 1); + expected2 = Vec2::create(ref); + expected2->copy_from(expected.get()); y = gen_mtx(231, num_vectors, 1); + y2 = Vec2::create(ref); + y2->copy_from(y.get()); alpha = gko::initialize({2.0}, ref); + alpha2 = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); + beta2 = gko::initialize({-1.0}, ref); dmtx = Mtx::create(omp); dmtx->copy_from(mtx.get()); dresult = Vec::create(omp); dresult->copy_from(expected.get()); + dresult2 = Vec2::create(omp); + dresult2->copy_from(expected2.get()); dy = Vec::create(omp); dy->copy_from(y.get()); + dy2 = Vec2::create(omp); + dy2->copy_from(y2.get()); dalpha = Vec::create(omp); dalpha->copy_from(alpha.get()); + dalpha2 = Vec2::create(omp); + dalpha2->copy_from(alpha2.get()); dbeta = Vec::create(omp); dbeta->copy_from(beta.get()); + dbeta2 = Vec2::create(omp); + dbeta2->copy_from(beta2.get()); } std::shared_ptr ref; @@ -112,15 +127,23 @@ class Ell : public ::testing::Test { std::unique_ptr mtx; std::unique_ptr expected; + std::unique_ptr expected2; std::unique_ptr y; + std::unique_ptr y2; std::unique_ptr alpha; + std::unique_ptr alpha2; std::unique_ptr beta; + std::unique_ptr beta2; std::unique_ptr dmtx; std::unique_ptr dresult; + std::unique_ptr dresult2; std::unique_ptr dy; + std::unique_ptr dy2; std::unique_ptr dalpha; + std::unique_ptr dalpha2; std::unique_ptr dbeta; + std::unique_ptr dbeta2; }; @@ -135,6 +158,39 @@ TEST_F(Ell, SimpleApplyIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1) +{ + set_up_apply_data(); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef2) +{ + set_up_apply_data(); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3) +{ + set_up_apply_data(); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyIsEquivalentToRef) { set_up_apply_data(); @@ -146,6 +202,39 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1) +{ + set_up_apply_data(); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef2) +{ + set_up_apply_data(); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3) +{ + set_up_apply_data(); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, SimpleApplyWithPaddingIsEquivalentToRef) { set_up_apply_data(300, 600); @@ -157,6 +246,39 @@ TEST_F(Ell, SimpleApplyWithPaddingIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyWithPaddingIsEquivalentToRef1) +{ + set_up_apply_data(300, 600); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithPaddingIsEquivalentToRef2) +{ + set_up_apply_data(300, 600); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithPaddingIsEquivalentToRef3) +{ + set_up_apply_data(300, 600); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyWithPaddingIsEquivalentToRef) { set_up_apply_data(300, 600); @@ -167,6 +289,39 @@ TEST_F(Ell, AdvancedApplyWithPaddingIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyWithPaddingIsEquivalentToRef1) +{ + set_up_apply_data(300, 600); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithPaddingIsEquivalentToRef2) +{ + set_up_apply_data(300, 600); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithPaddingIsEquivalentToRef3) +{ + set_up_apply_data(300, 600); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, SimpleApplyToDenseMatrixIsEquivalentToRef) { set_up_apply_data(0, 0, 3); @@ -178,6 +333,39 @@ TEST_F(Ell, SimpleApplyToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(0, 0, 3); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(0, 0, 3); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(0, 0, 3); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyToDenseMatrixIsEquivalentToRef) { set_up_apply_data(0, 0, 3); @@ -189,7 +377,40 @@ TEST_F(Ell, AdvancedApplyToDenseMatrixIsEquivalentToRef) } -TEST_F(Ell, SimpleApplyWithPaddinToDenseMatrixIsEquivalentToRef) +TEST_F(Ell, MixedAdvancedApplyToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(0, 0, 3); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(0, 0, 3); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(0, 0, 3); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, SimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef) { set_up_apply_data(300, 600, 3); @@ -200,6 +421,39 @@ TEST_F(Ell, SimpleApplyWithPaddinToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(300, 600, 3); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(300, 600, 3); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(300, 600, 3); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef) { set_up_apply_data(300, 600, 3); @@ -210,6 +464,39 @@ TEST_F(Ell, AdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(300, 600, 3); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(300, 600, 3); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(300, 600, 3); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, ApplyToComplexIsEquivalentToRef) { set_up_apply_data(300, 600); diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index fdbaff5dfc5..24f166e52db 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -39,6 +39,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" + + namespace gko { namespace kernels { namespace reference { @@ -50,57 +54,87 @@ namespace reference { namespace ell { -template +template void spmv(std::shared_ptr exec, - const matrix::Ell *a, - const matrix::Dense *b, matrix::Dense *c) + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c) { - auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); + using a_accessor = + gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + using b_accessor = + gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + + const auto num_stored_elements_per_row = + a->get_num_stored_elements_per_row(); + const auto stride = a->get_stride(); + const auto a_vals = gko::acc::range( + std::array{num_stored_elements_per_row * stride}, + a->get_const_values()); + const auto b_vals = gko::acc::range( + std::array{{b->get_size()[0], b->get_size()[1]}}, + b->get_const_values(), std::array{{b->get_stride()}}); for (size_type row = 0; row < a->get_size()[0]; row++) { for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) = zero(); + c->at(row, j) = zero(); } for (size_type i = 0; i < num_stored_elements_per_row; i++) { - auto val = a->val_at(row, i); + auto val = a_vals(row + i * stride); auto col = a->col_at(row, i); for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) += val * b->at(col, j); + c->at(row, j) += val * b_vals(col, j); } } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense *alpha, - const matrix::Ell *a, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) + const matrix::Dense *alpha, + const matrix::Ell *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) { - auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); - auto alpha_val = alpha->at(0, 0); - auto beta_val = beta->at(0, 0); + using a_accessor = + gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + using b_accessor = + gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + + const auto num_stored_elements_per_row = + a->get_num_stored_elements_per_row(); + const auto stride = a->get_stride(); + const auto a_vals = gko::acc::range( + std::array{num_stored_elements_per_row * stride}, + a->get_const_values()); + const auto b_vals = gko::acc::range( + std::array{{b->get_size()[0], b->get_size()[1]}}, + b->get_const_values(), std::array{{b->get_stride()}}); + const auto alpha_val = OutputValueType(alpha->at(0, 0)); + const auto beta_val = beta->at(0, 0); for (size_type row = 0; row < a->get_size()[0]; row++) { for (size_type j = 0; j < c->get_size()[1]; j++) { c->at(row, j) *= beta_val; } for (size_type i = 0; i < num_stored_elements_per_row; i++) { - auto val = a->val_at(row, i); + auto val = a_vals(row + i * stride); auto col = a->col_at(row, i); for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) += alpha_val * val * b->at(col, j); + c->at(row, j) += alpha_val * val * b_vals(col, j); } } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp index 77436ad1287..c3326737e4f 100644 --- a/reference/test/matrix/ell_kernels.cpp +++ b/reference/test/matrix/ell_kernels.cpp @@ -120,11 +120,46 @@ TYPED_TEST(Ell, AppliesToDenseVector) } -TYPED_TEST(Ell, AppliesToMixedDenseVector) +TYPED_TEST(Ell, MixedAppliesToDenseVector1) { - using MixedVec = typename TestFixture::MixedVec; - auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); - auto y = MixedVec::create(this->exec, gko::dim<2>{2, 1}); + // Both vectors have the same value type which differs from the matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec = typename gko::matrix::Dense; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); + + this->mtx1->apply(x.get(), y.get()); + + GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0); +} + + +TYPED_TEST(Ell, MixedAppliesToDenseVector2) +{ + // Input vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec2::create(this->exec, gko::dim<2>{2, 1}); + + this->mtx1->apply(x.get(), y.get()); + + GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0); +} + + +TYPED_TEST(Ell, MixedAppliesToDenseVector3) +{ + // Output vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense>; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec1::create(this->exec, gko::dim<2>{2, 1}); this->mtx1->apply(x.get(), y.get()); @@ -154,6 +189,80 @@ TYPED_TEST(Ell, AppliesToDenseMatrix) } +TYPED_TEST(Ell, MixedAppliesToDenseMatrix1) +{ + // Both vectors have the same value type which differs from the matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec = gko::matrix::Dense; + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); + // clang-format on + auto y = Vec::create(this->exec, gko::dim<2>{2}); + + this->mtx1->apply(x.get(), y.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(y, + l({{13.0, 3.5}, + { 5.0, -7.5}}), 0.0); + // clang-format on +} + + +TYPED_TEST(Ell, MixedAppliesToDenseMatrix2) +{ + // Input vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); + // clang-format on + auto y = Vec2::create(this->exec, gko::dim<2>{2}); + + this->mtx1->apply(x.get(), y.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(y, + l({{13.0, 3.5}, + { 5.0, -7.5}}), 0.0); + // clang-format on +} + + +TYPED_TEST(Ell, MixedAppliesToDenseMatrix3) +{ + // Output vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); + // clang-format on + auto y = Vec1::create(this->exec, gko::dim<2>{2}); + + this->mtx1->apply(x.get(), y.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(y, + l({{13.0, 3.5}, + { 5.0, -7.5}}), 0.0); + // clang-format on +} + + TYPED_TEST(Ell, AppliesLinearCombinationToDenseVector) { using Vec = typename TestFixture::Vec; @@ -168,13 +277,52 @@ TYPED_TEST(Ell, AppliesLinearCombinationToDenseVector) } -TYPED_TEST(Ell, AppliesLinearCombinationToMixedDenseVector) +TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector1) +{ + // Both vectors have the same value type which differs from the matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec = gko::matrix::Dense; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); + + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + + GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0); +} + + +TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector2) { - using MixedVec = typename TestFixture::MixedVec; - auto alpha = gko::initialize({-1.0}, this->exec); - auto beta = gko::initialize({2.0}, this->exec); - auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); - auto y = gko::initialize({1.0, 2.0}, this->exec); + // Input vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); + + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + + GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0); +} + + +TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector3) +{ + // Output vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); @@ -208,6 +356,92 @@ TYPED_TEST(Ell, AppliesLinearCombinationToDenseMatrix) } +TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix1) +{ + // Both vectors have the same value type which differs from the matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec = gko::matrix::Dense; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); + auto y = gko::initialize( + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); + // clang-format on + + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(y, + l({{-11.0, -2.5}, + { -1.0, 4.5}}), 0.0); + // clang-format on +} + + +TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix2) +{ + // Input vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); + auto y = gko::initialize( + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); + // clang-format on + + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(y, + l({{-11.0, -2.5}, + { -1.0, 4.5}}), 0.0); + // clang-format on +} + + +TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix3) +{ + // Output vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); + auto y = gko::initialize( + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); + // clang-format on + + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(y, + l({{-11.0, -2.5}, + { -1.0, 4.5}}), 0.0); + // clang-format on +} + + TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension) { using Vec = typename TestFixture::Vec;