From 19cbd4e0374e589e021bbb7b0da678b7d1f56896 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 17 Dec 2020 02:58:25 +0100 Subject: [PATCH 01/14] add ELL mixed-precision support --- benchmark/utils/formats.hpp | 30 ++++++++++-- common/matrix/ell_kernels.hpp.inc | 62 ++++++++++++++---------- core/device_hooks/common_kernels.inc.cpp | 17 ++++--- core/matrix/ell_kernels.hpp | 39 +++++++++------ cuda/matrix/ell_kernels.cu | 42 +++++++++------- include/ginkgo/core/base/types.hpp | 51 +++++++++++++++++++ omp/matrix/ell_kernels.cpp | 35 +++++++------ reference/matrix/ell_kernels.cpp | 34 +++++++------ 8 files changed, 212 insertions(+), 98 deletions(-) diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index d25be4657ec..6b344aec06d 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -59,7 +59,8 @@ namespace formats { std::string available_format = - "coo, csr, ell, sellp, hybrid, hybrid0, hybrid25, hybrid33, hybrid40, " + "coo, csr, ell, fell, sellp, hybrid, hybrid0, hybrid25, hybrid33, " + "hybrid40, " "hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, " "hybridminstorage" #ifdef HAS_CUDA @@ -90,6 +91,9 @@ std::string format_description = "csrm: Ginkgo's CSR implementation with merge_path strategy.\n" "ell: Ellpack format according to Bell and Garland: Efficient Sparse " "Matrix-Vector Multiplication on CUDA.\n" + "fell: float Ellpack format according to Bell and Garland: Efficient " + "Sparse " + "Matrix-Vector Multiplication on CUDA.\n" "sellp: Sliced Ellpack uses a default block size of 32.\n" "hybrid: Hybrid uses ell and coo to represent the matrix.\n" "hybrid0, hybrid25, hybrid33, hybrid40, hybrid60, hybrid80: Hybrid uses " @@ -204,6 +208,23 @@ const std::map( {"csrc", READ_MATRIX(csr, std::make_shared())}, {"coo", read_matrix_from_data>}, {"ell", read_matrix_from_data>}, + {"fell", + [](std::shared_ptr exec, + const gko::matrix_data<> &data) { + gko::matrix_data conv_data; + conv_data.size = data.size; + conv_data.nonzeros.resize(data.nonzeros.size()); + auto it = conv_data.nonzeros.begin(); + for (auto &el : data.nonzeros) { + it->row = el.row; + it->column = el.column; + it->value = el.value; + ++it; + } + auto mat = gko::matrix::Ell::create(std::move(exec)); + mat->read(conv_data); + return mat; + }}, #ifdef HAS_CUDA #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) {"cusp_csr", read_matrix_from_data}, @@ -212,8 +233,8 @@ const std::map( {"cusp_hybrid", read_matrix_from_data}, {"cusp_coo", read_matrix_from_data}, {"cusp_ell", read_matrix_from_data}, -#else // CUDA_VERSION >= 11000 - // cusp_csr, cusp_coo use the generic ones from CUDA 11 +#else // CUDA_VERSION >= 11000 + // cusp_csr, cusp_coo use the generic ones from CUDA 11 {"cusp_csr", read_matrix_from_data}, {"cusp_coo", read_matrix_from_data}, #endif @@ -260,7 +281,8 @@ const std::map( {"hybridminstorage", READ_MATRIX(hybrid, std::make_shared())}, - {"sellp", read_matrix_from_data>}}; + {"sellp", read_matrix_from_data>} +}; // clang-format on diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc index 00eb2d6871d..2f155142efd 100644 --- a/common/matrix/ell_kernels.hpp.inc +++ b/common/matrix/ell_kernels.hpp.inc @@ -34,22 +34,25 @@ namespace kernel { namespace { -template __device__ void spmv_kernel( const size_type num_rows, const int num_worker_per_row, - const ValueType *__restrict__ val, const IndexType *__restrict__ col, + const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col, const size_type stride, const size_type num_stored_elements_per_row, - const ValueType *__restrict__ b, const size_type b_stride, - ValueType *__restrict__ c, const size_type c_stride, Closure op) + const InputValueType *__restrict__ b, const size_type b_stride, + OutputValueType *__restrict__ c, const size_type c_stride, Closure op) { const auto tidx = thread::get_thread_id_flat(); const auto column_id = blockIdx.y; + using compute_type = + decltype(InputValueType{} + MatrixValueType{} + OutputValueType{}); if (num_thread_per_worker == 1) { // Specialize the num_thread_per_worker = 1. It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { - ValueType temp = zero(); + auto temp = zero(); for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; @@ -68,14 +71,14 @@ __device__ void spmv_kernel( const auto x = tidx % num_rows; const auto worker_id = tidx / num_rows; const auto step_size = num_worker_per_row * num_thread_per_worker; - __shared__ UninitializedArray + __shared__ UninitializedArray< + compute_type, default_block_size / num_thread_per_worker> storage; if (idx_in_worker == 0) { storage[threadIdx.x] = 0; } __syncthreads(); - ValueType temp = zero(); + auto temp = zero(); for (size_type idx = worker_id * num_thread_per_worker + idx_in_worker; idx < num_stored_elements_per_row; idx += step_size) { @@ -102,35 +105,41 @@ __device__ void spmv_kernel( } -template +template __global__ __launch_bounds__(default_block_size) void spmv( const size_type num_rows, const int num_worker_per_row, - const ValueType *__restrict__ val, const IndexType *__restrict__ col, + const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col, const size_type stride, const size_type num_stored_elements_per_row, - const ValueType *__restrict__ b, const size_type b_stride, - ValueType *__restrict__ c, const size_type c_stride) + const InputValueType *__restrict__ b, const size_type b_stride, + OutputValueType *__restrict__ c, const size_type c_stride) { + using compute_type = + decltype(InputValueType{} + MatrixValueType{} + OutputValueType{}); spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, b_stride, c, c_stride, - [](const ValueType &x, const ValueType &y) { return x; }); + [](const compute_type &x, const OutputValueType &y) { return x; }); } -template +template __global__ __launch_bounds__(default_block_size) void spmv( const size_type num_rows, const int num_worker_per_row, - const ValueType *__restrict__ alpha, const ValueType *__restrict__ val, - const IndexType *__restrict__ col, const size_type stride, - const size_type num_stored_elements_per_row, - const ValueType *__restrict__ b, const size_type b_stride, - const ValueType *__restrict__ beta, ValueType *__restrict__ c, + const MatrixValueType *__restrict__ alpha, + const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col, + const size_type stride, const size_type num_stored_elements_per_row, + const InputValueType *__restrict__ b, const size_type b_stride, + const OutputValueType *__restrict__ beta, OutputValueType *__restrict__ c, const size_type c_stride) { - const ValueType alpha_val = alpha[0]; - const ValueType beta_val = beta[0]; + using compute_type = + decltype(InputValueType{} + MatrixValueType{} + OutputValueType{}); + const compute_type alpha_val = alpha[0]; + const compute_type beta_val = beta[0]; // Because the atomic operation changes the values of c during computation, // it can not do the right alpha * a * b + beta * c operation. // Thus, the cuda kernel only computes alpha * a * b when it uses atomic @@ -139,15 +148,16 @@ __global__ __launch_bounds__(default_block_size) void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, b_stride, c, c_stride, - [&alpha_val](const ValueType &x, const ValueType &y) { + [&alpha_val](const compute_type &x, const OutputValueType &y) { return alpha_val * x; }); } else { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, b_stride, c, c_stride, - [&alpha_val, &beta_val](const ValueType &x, const ValueType &y) { - return alpha_val * x + beta_val * y; + [&alpha_val, &beta_val](const compute_type &x, + const OutputValueType &y) { + return alpha_val * x + beta_val * compute_type{y}; }); } } diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 877051c3e04..42121074df4 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -931,15 +931,20 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( namespace ell { -template -GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType) +template +GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, OutputValueType, + IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_SPMV_KERNEL); -template -GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType) +template +GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType, + OutputValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); template diff --git a/core/matrix/ell_kernels.hpp b/core/matrix/ell_kernels.hpp index 7a1ec261d15..7cbe54b3720 100644 --- a/core/matrix/ell_kernels.hpp +++ b/core/matrix/ell_kernels.hpp @@ -46,18 +46,21 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType) \ - void spmv(std::shared_ptr exec, \ - const matrix::Ell *a, \ - const matrix::Dense *b, matrix::Dense *c) - -#define GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType) \ - void advanced_spmv(std::shared_ptr exec, \ - const matrix::Dense *alpha, \ - const matrix::Ell *a, \ - const matrix::Dense *b, \ - const matrix::Dense *beta, \ - matrix::Dense *c) +#define GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, \ + OutputValueType, IndexType) \ + void spmv(std::shared_ptr exec, \ + const matrix::Ell *a, \ + const matrix::Dense *b, \ + matrix::Dense *c) + +#define GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType, \ + OutputValueType, IndexType) \ + void advanced_spmv(std::shared_ptr exec, \ + const matrix::Dense *alpha, \ + const matrix::Ell *a, \ + const matrix::Dense *b, \ + const matrix::Dense *beta, \ + matrix::Dense *c) #define GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType) \ void convert_to_dense(std::shared_ptr exec, \ @@ -87,10 +90,14 @@ namespace kernels { matrix::Diagonal *diag) #define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_ELL_SPMV_KERNEL(ValueType, IndexType); \ - template \ - GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_ELL_SPMV_KERNEL(InputValueType, MatrixValueType, \ + OutputValueType, IndexType); \ + template \ + GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL(InputValueType, MatrixValueType, \ + OutputValueType, IndexType); \ template \ GKO_DECLARE_ELL_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType); \ template \ diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 9bc47f71e63..df223dfd7cf 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -108,13 +108,14 @@ using compiled_kernels = syn::value_list; namespace { -template +template void abstract_spmv(syn::value_list, int num_worker_per_row, - const matrix::Ell *a, - const matrix::Dense *b, - matrix::Dense *c, - const matrix::Dense *alpha = nullptr, - const matrix::Dense *beta = nullptr) + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c, + const matrix::Dense *alpha = nullptr, + const matrix::Dense *beta = nullptr) { const auto nrows = a->get_size()[0]; constexpr int num_thread_per_worker = @@ -194,10 +195,12 @@ std::array compute_thread_worker_and_atomicity( } // namespace -template +template void spmv(std::shared_ptr exec, - const matrix::Ell *a, - const matrix::Dense *b, matrix::Dense *c) + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c) { const auto data = compute_thread_worker_and_atomicity(exec, a); const int num_thread_per_worker = std::get<0>(data); @@ -212,7 +215,8 @@ void spmv(std::shared_ptr exec, const int info = (!atomic) * num_thread_per_worker; if (atomic) { components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), zero()); + c->get_num_stored_elements(), + zero()); } select_abstract_spmv( compiled_kernels(), @@ -221,16 +225,18 @@ void spmv(std::shared_ptr exec, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense *alpha, - const matrix::Ell *a, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) + const matrix::Dense *alpha, + const matrix::Ell *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) { const auto data = compute_thread_worker_and_atomicity(exec, a); const int num_thread_per_worker = std::get<0>(data); @@ -253,7 +259,7 @@ void advanced_spmv(std::shared_ptr exec, alpha, beta); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 2c77aaef826..2f6da1e9e42 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -524,6 +524,57 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #endif +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ + template _macro(float, float, float, int32); \ + template _macro(float, float, double, int32); \ + template _macro(float, double, float, int32); \ + template _macro(float, double, double, int32); \ + template _macro(double, float, float, int32); \ + template _macro(double, float, double, int32); \ + template _macro(double, double, float, int32); \ + template _macro(double, double, double, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(float, float, float, int64); \ + template _macro(float, float, double, int64); \ + template _macro(float, double, float, int64); \ + template _macro(float, double, double, int64); \ + template _macro(double, float, float, int64); \ + template _macro(double, float, double, int64); \ + template _macro(double, double, float, int64); \ + template _macro(double, double, double, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64) + + /** * Instantiates a template for each value type conversion pair compiled by * Ginkgo. diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index a590357a1c2..621e1cfe940 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -56,38 +56,44 @@ namespace omp { namespace ell { -template +template void spmv(std::shared_ptr exec, - const matrix::Ell *a, - const matrix::Dense *b, matrix::Dense *c) + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c) { auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); #pragma omp parallel for for (size_type row = 0; row < a->get_size()[0]; row++) { for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) = zero(); + c->at(row, j) = zero(); } for (size_type i = 0; i < num_stored_elements_per_row; i++) { auto val = a->val_at(row, i); auto col = a->col_at(row, i); + for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) += val * b->at(col, j); + c->at(row, j) += + OutputValueType(val * MatrixValueType(b->at(col, j))); } } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense *alpha, - const matrix::Ell *a, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) + const matrix::Dense *alpha, + const matrix::Ell *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) { auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); auto alpha_val = alpha->at(0, 0); @@ -102,13 +108,14 @@ void advanced_spmv(std::shared_ptr exec, auto val = a->val_at(row, i); auto col = a->col_at(row, i); for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) += alpha_val * val * b->at(col, j); + c->at(row, j) += OutputValueType( + alpha_val * val * MatrixValueType(b->at(col, j))); } } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index fdbaff5dfc5..fee0dd5f7fc 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -50,37 +50,42 @@ namespace reference { namespace ell { -template +template void spmv(std::shared_ptr exec, - const matrix::Ell *a, - const matrix::Dense *b, matrix::Dense *c) + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c) { auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); for (size_type row = 0; row < a->get_size()[0]; row++) { for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) = zero(); + c->at(row, j) = zero(); } for (size_type i = 0; i < num_stored_elements_per_row; i++) { auto val = a->val_at(row, i); auto col = a->col_at(row, i); for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) += val * b->at(col, j); + c->at(row, j) += + OutputValueType(val * MatrixValueType(b->at(col, j))); } } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense *alpha, - const matrix::Ell *a, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) + const matrix::Dense *alpha, + const matrix::Ell *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) { auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); auto alpha_val = alpha->at(0, 0); @@ -94,13 +99,14 @@ void advanced_spmv(std::shared_ptr exec, auto val = a->val_at(row, i); auto col = a->col_at(row, i); for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) += alpha_val * val * b->at(col, j); + c->at(row, j) += OutputValueType( + alpha_val * val * MatrixValueType(b->at(col, j))); } } } } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); From 89ffe458a572e9ca257a994d7f291b0413d69159 Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Fri, 12 Mar 2021 16:40:50 +0100 Subject: [PATCH 02/14] fix benchmark formats, make ell spmv use accessor --- benchmark/utils/formats.hpp | 8 +-- common/matrix/ell_kernels.hpp.inc | 65 ++++++++++------------ cuda/matrix/ell_kernels.cu | 49 +++++++++++++---- hip/matrix/ell_kernels.hip.cpp | 89 ++++++++++++++++++++++--------- omp/matrix/ell_kernels.cpp | 48 +++++++++++++---- reference/matrix/ell_kernels.cpp | 49 +++++++++++++---- 6 files changed, 210 insertions(+), 98 deletions(-) diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index 6b344aec06d..ed20f91119e 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -208,10 +208,10 @@ const std::map( {"csrc", READ_MATRIX(csr, std::make_shared())}, {"coo", read_matrix_from_data>}, {"ell", read_matrix_from_data>}, - {"fell", + {"ell_mixed", [](std::shared_ptr exec, - const gko::matrix_data<> &data) { - gko::matrix_data conv_data; + const gko::matrix_data &data) { + gko::matrix_data> conv_data; conv_data.size = data.size; conv_data.nonzeros.resize(data.nonzeros.size()); auto it = conv_data.nonzeros.begin(); @@ -221,7 +221,7 @@ const std::map( it->value = el.value; ++it; } - auto mat = gko::matrix::Ell::create(std::move(exec)); + auto mat = gko::matrix::Ell>::create(std::move(exec)); mat->read(conv_data); return mat; }}, diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc index 2f155142efd..79ed4ed979b 100644 --- a/common/matrix/ell_kernels.hpp.inc +++ b/common/matrix/ell_kernels.hpp.inc @@ -34,32 +34,30 @@ namespace kernel { namespace { -template +template __device__ void spmv_kernel( const size_type num_rows, const int num_worker_per_row, - const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col, + acc::range val, const IndexType *__restrict__ col, const size_type stride, const size_type num_stored_elements_per_row, - const InputValueType *__restrict__ b, const size_type b_stride, + acc::range b, const size_type b_stride, OutputValueType *__restrict__ c, const size_type c_stride, Closure op) { const auto tidx = thread::get_thread_id_flat(); - const auto column_id = blockIdx.y; - using compute_type = - decltype(InputValueType{} + MatrixValueType{} + OutputValueType{}); + const decltype(tidx) column_id = blockIdx.y; if (num_thread_per_worker == 1) { // Specialize the num_thread_per_worker = 1. It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { - auto temp = zero(); + auto temp = zero(); for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; if (col_idx < idx) { break; } else { - temp += val[ind] * b[col_idx * b_stride + column_id]; + temp += val(ind) * b(col_idx, column_id); } } const auto c_ind = tidx * c_stride + column_id; @@ -72,13 +70,13 @@ __device__ void spmv_kernel( const auto worker_id = tidx / num_rows; const auto step_size = num_worker_per_row * num_thread_per_worker; __shared__ UninitializedArray< - compute_type, default_block_size / num_thread_per_worker> + OutputValueType, default_block_size / num_thread_per_worker> storage; if (idx_in_worker == 0) { storage[threadIdx.x] = 0; } __syncthreads(); - auto temp = zero(); + auto temp = zero(); for (size_type idx = worker_id * num_thread_per_worker + idx_in_worker; idx < num_stored_elements_per_row; idx += step_size) { @@ -87,7 +85,7 @@ __device__ void spmv_kernel( if (col_idx < idx) { break; } else { - temp += val[ind] * b[col_idx * b_stride + column_id]; + temp += val(ind) * b(col_idx, column_id); } } atomic_add(&storage[threadIdx.x], temp); @@ -105,41 +103,34 @@ __device__ void spmv_kernel( } -template +template __global__ __launch_bounds__(default_block_size) void spmv( const size_type num_rows, const int num_worker_per_row, - const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col, + acc::range val, const IndexType *__restrict__ col, const size_type stride, const size_type num_stored_elements_per_row, - const InputValueType *__restrict__ b, const size_type b_stride, + acc::range b, const size_type b_stride, OutputValueType *__restrict__ c, const size_type c_stride) { - using compute_type = - decltype(InputValueType{} + MatrixValueType{} + OutputValueType{}); spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, b_stride, c, c_stride, - [](const compute_type &x, const OutputValueType &y) { return x; }); + [](const OutputValueType &x, const OutputValueType &y) { return x; }); } -template +template __global__ __launch_bounds__(default_block_size) void spmv( const size_type num_rows, const int num_worker_per_row, - const MatrixValueType *__restrict__ alpha, - const MatrixValueType *__restrict__ val, const IndexType *__restrict__ col, - const size_type stride, const size_type num_stored_elements_per_row, - const InputValueType *__restrict__ b, const size_type b_stride, - const OutputValueType *__restrict__ beta, OutputValueType *__restrict__ c, - const size_type c_stride) + acc::range alpha, acc::range val, + const IndexType *__restrict__ col, const size_type stride, + const size_type num_stored_elements_per_row, acc::range b, + const size_type b_stride, const OutputValueType *__restrict__ beta, + OutputValueType *__restrict__ c, const size_type c_stride) { - using compute_type = - decltype(InputValueType{} + MatrixValueType{} + OutputValueType{}); - const compute_type alpha_val = alpha[0]; - const compute_type beta_val = beta[0]; + const OutputValueType alpha_val = alpha(0); + const OutputValueType beta_val = beta[0]; // Because the atomic operation changes the values of c during computation, // it can not do the right alpha * a * b + beta * c operation. // Thus, the cuda kernel only computes alpha * a * b when it uses atomic @@ -148,16 +139,16 @@ __global__ __launch_bounds__(default_block_size) void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, b_stride, c, c_stride, - [&alpha_val](const compute_type &x, const OutputValueType &y) { + [&alpha_val](const OutputValueType &x, const OutputValueType &y) { return alpha_val * x; }); } else { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, b_stride, c, c_stride, - [&alpha_val, &beta_val](const compute_type &x, + [&alpha_val, &beta_val](const OutputValueType &x, const OutputValueType &y) { - return alpha_val * x + beta_val * compute_type{y}; + return alpha_val * x + beta_val * y; }); } } diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index df223dfd7cf..33db65c4c1c 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "accessor/reduced_row_major.hpp" #include "core/components/fill_array.hpp" #include "core/components/prefix_sum.hpp" #include "core/matrix/dense_kernels.hpp" @@ -107,6 +108,17 @@ using compiled_kernels = syn::value_list; namespace { +template +GKO_INLINE auto as_cuda_accessor( + const acc::range> &acc) +{ + return acc::range< + acc::reduced_row_major, cuda_type>>( + acc.get_accessor().get_size(), + as_cuda_type(acc.get_accessor().get_stored_data()), + acc.get_accessor().get_stride()); +} + template @@ -117,7 +129,16 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, const matrix::Dense *alpha = nullptr, const matrix::Dense *beta = nullptr) { + using a_accessor = + gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + using b_accessor = + gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + const auto nrows = a->get_size()[0]; + const auto stride = a->get_stride(); + const auto num_stored_elements_per_row = + a->get_num_stored_elements_per_row(); + constexpr int num_thread_per_worker = (info == 0) ? max_thread_per_worker : info; constexpr bool atomic = (info == 0); @@ -125,23 +146,31 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, num_thread_per_worker, 1); const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), b->get_size()[1], 1); + + const auto a_vals = gko::acc::range( + std::array{ + {num_stored_elements_per_row * stride}}, + a->get_const_values()); + const auto b_vals = gko::acc::range( + std::array{{nrows, b->get_stride()}}, + b->get_const_values()); + if (alpha == nullptr && beta == nullptr) { kernel::spmv <<>>( - nrows, num_worker_per_row, as_cuda_type(a->get_const_values()), - a->get_const_col_idxs(), a->get_stride(), - a->get_num_stored_elements_per_row(), - as_cuda_type(b->get_const_values()), b->get_stride(), + nrows, num_worker_per_row, as_cuda_accessor(a_vals), + a->get_const_col_idxs(), stride, num_stored_elements_per_row, + as_cuda_accessor(b_vals), b->get_stride(), as_cuda_type(c->get_values()), c->get_stride()); } else if (alpha != nullptr && beta != nullptr) { + const auto alpha_val = gko::acc::range( + std::array{1}, alpha->get_const_values()); kernel::spmv <<>>( - nrows, num_worker_per_row, - as_cuda_type(alpha->get_const_values()), - as_cuda_type(a->get_const_values()), a->get_const_col_idxs(), - a->get_stride(), a->get_num_stored_elements_per_row(), - as_cuda_type(b->get_const_values()), b->get_stride(), - as_cuda_type(beta->get_const_values()), + nrows, num_worker_per_row, as_cuda_accessor(alpha_val), + as_cuda_accessor(a_vals), a->get_const_col_idxs(), stride, + num_stored_elements_per_row, as_cuda_accessor(b_vals), + b->get_stride(), as_cuda_type(beta->get_const_values()), as_cuda_type(c->get_values()), c->get_stride()); } else { GKO_KERNEL_NOT_FOUND; diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 2e7a4eeeb50..8bcadd02370 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "accessor/reduced_row_major.hpp" #include "core/components/fill_array.hpp" #include "core/components/prefix_sum.hpp" #include "core/matrix/dense_kernels.hpp" @@ -111,15 +112,37 @@ using compiled_kernels = syn::value_list; namespace { -template +template +GKO_INLINE auto as_hip_accessor( + const acc::range> &acc) +{ + return acc::range< + acc::reduced_row_major, hip_type>>( + acc.get_accessor().get_size(), + as_hip_type(acc.get_accessor().get_stored_data()), + acc.get_accessor().get_stride()); +} + + +template void abstract_spmv(syn::value_list, int num_worker_per_row, - const matrix::Ell *a, - const matrix::Dense *b, - matrix::Dense *c, - const matrix::Dense *alpha = nullptr, - const matrix::Dense *beta = nullptr) + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c, + const matrix::Dense *alpha = nullptr, + const matrix::Dense *beta = nullptr) { + using a_accessor = + gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + using b_accessor = + gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + const auto nrows = a->get_size()[0]; + const auto stride = a->get_stride(); + const auto num_stored_elements_per_row = + a->get_num_stored_elements_per_row(); + constexpr int num_thread_per_worker = (info == 0) ? max_thread_per_worker : info; constexpr bool atomic = (info == 0); @@ -127,22 +150,31 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, num_thread_per_worker, 1); const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), b->get_size()[1], 1); + + const auto a_vals = gko::acc::range( + std::array{ + {num_stored_elements_per_row * stride}}, + a->get_const_values()); + const auto b_vals = gko::acc::range( + std::array{{nrows, b->get_stride()}}, + b->get_const_values()); + if (alpha == nullptr && beta == nullptr) { hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::spmv), dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row, - as_hip_type(a->get_const_values()), a->get_const_col_idxs(), - a->get_stride(), a->get_num_stored_elements_per_row(), - as_hip_type(b->get_const_values()), b->get_stride(), - as_hip_type(c->get_values()), c->get_stride()); + as_hip_accessor(a_vals), a->get_const_col_idxs(), stride, + num_stored_elements_per_row, as_hip_accessor(b_vals), + b->get_stride(), as_hip_type(c->get_values()), c->get_stride()); } else if (alpha != nullptr && beta != nullptr) { + const auto alpha_val = gko::acc::range( + std::array{1}, alpha->get_const_values()); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::spmv), dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row, - as_hip_type(alpha->get_const_values()), - as_hip_type(a->get_const_values()), a->get_const_col_idxs(), - a->get_stride(), a->get_num_stored_elements_per_row(), - as_hip_type(b->get_const_values()), b->get_stride(), + as_hip_accessor(alpha_val), as_hip_accessor(a_vals), + a->get_const_col_idxs(), stride, num_stored_elements_per_row, + as_hip_accessor(b_vals), b->get_stride(), as_hip_type(beta->get_const_values()), as_hip_type(c->get_values()), c->get_stride()); } else { @@ -197,10 +229,12 @@ std::array compute_thread_worker_and_atomicity( } // namespace -template +template void spmv(std::shared_ptr exec, - const matrix::Ell *a, - const matrix::Dense *b, matrix::Dense *c) + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c) { const auto data = compute_thread_worker_and_atomicity(exec, a); const int num_thread_per_worker = std::get<0>(data); @@ -215,7 +249,8 @@ void spmv(std::shared_ptr exec, const int info = (!atomic) * num_thread_per_worker; if (atomic) { components::fill_array(exec, c->get_values(), - c->get_num_stored_elements(), zero()); + c->get_num_stored_elements(), + zero()); } select_abstract_spmv( compiled_kernels(), @@ -224,16 +259,18 @@ void spmv(std::shared_ptr exec, c); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense *alpha, - const matrix::Ell *a, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) + const matrix::Dense *alpha, + const matrix::Ell *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) { const auto data = compute_thread_worker_and_atomicity(exec, a); const int num_thread_per_worker = std::get<0>(data); @@ -256,7 +293,7 @@ void advanced_spmv(std::shared_ptr exec, alpha, beta); } -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index 621e1cfe940..76e7eea80ed 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "accessor/reduced_row_major.hpp" #include "omp/components/format_conversion.hpp" @@ -63,7 +64,21 @@ void spmv(std::shared_ptr exec, const matrix::Dense *b, matrix::Dense *c) { - auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); + using a_accessor = + gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + using b_accessor = + gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + + const auto num_stored_elements_per_row = + a->get_num_stored_elements_per_row(); + const auto stride = a->get_stride(); + const auto num_rows = b->get_size()[0]; + const auto a_vals = gko::acc::range( + std::array{num_stored_elements_per_row * stride}, + a->get_const_values()); + const auto b_vals = gko::acc::range( + std::array{num_rows, b->get_stride()}, + b->get_const_values()); #pragma omp parallel for for (size_type row = 0; row < a->get_size()[0]; row++) { @@ -71,12 +86,10 @@ void spmv(std::shared_ptr exec, c->at(row, j) = zero(); } for (size_type i = 0; i < num_stored_elements_per_row; i++) { - auto val = a->val_at(row, i); + auto val = a_vals(row + i * stride); auto col = a->col_at(row, i); - for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) += - OutputValueType(val * MatrixValueType(b->at(col, j))); + c->at(row, j) += val * b_vals(col, j); } } } @@ -95,9 +108,23 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense *beta, matrix::Dense *c) { - auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); - auto alpha_val = alpha->at(0, 0); - auto beta_val = beta->at(0, 0); + using a_accessor = + gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + using b_accessor = + gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + + const auto num_stored_elements_per_row = + a->get_num_stored_elements_per_row(); + const auto stride = a->get_stride(); + const auto num_rows = b->get_size()[0]; + const auto a_vals = gko::acc::range( + std::array{num_stored_elements_per_row * stride}, + a->get_const_values()); + const auto b_vals = gko::acc::range( + std::array{num_rows, b->get_stride()}, + b->get_const_values()); + const auto alpha_val = OutputValueType(alpha->at(0, 0)); + const auto beta_val = beta->at(0, 0); #pragma omp parallel for for (size_type row = 0; row < a->get_size()[0]; row++) { @@ -105,11 +132,10 @@ void advanced_spmv(std::shared_ptr exec, c->at(row, j) *= beta_val; } for (size_type i = 0; i < num_stored_elements_per_row; i++) { - auto val = a->val_at(row, i); + auto val = a_vals(row + i * stride); auto col = a->col_at(row, i); for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) += OutputValueType( - alpha_val * val * MatrixValueType(b->at(col, j))); + c->at(row, j) += alpha_val * val * b_vals(col, j); } } } diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index fee0dd5f7fc..ba6d687d18a 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "accessor/reduced_row_major.hpp" + + namespace gko { namespace kernels { namespace reference { @@ -57,18 +60,31 @@ void spmv(std::shared_ptr exec, const matrix::Dense *b, matrix::Dense *c) { - auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); + using a_accessor = + gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + using b_accessor = + gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + + const auto num_stored_elements_per_row = + a->get_num_stored_elements_per_row(); + const auto stride = a->get_stride(); + const auto num_rows = b->get_size()[0]; + const auto a_vals = gko::acc::range( + std::array{num_stored_elements_per_row * stride}, + a->get_const_values()); + const auto b_vals = gko::acc::range( + std::array{num_rows, b->get_stride()}, + b->get_const_values()); for (size_type row = 0; row < a->get_size()[0]; row++) { for (size_type j = 0; j < c->get_size()[1]; j++) { c->at(row, j) = zero(); } for (size_type i = 0; i < num_stored_elements_per_row; i++) { - auto val = a->val_at(row, i); + auto val = a_vals(row + i * stride); auto col = a->col_at(row, i); for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) += - OutputValueType(val * MatrixValueType(b->at(col, j))); + c->at(row, j) += val * b_vals(col, j); } } } @@ -87,20 +103,33 @@ void advanced_spmv(std::shared_ptr exec, const matrix::Dense *beta, matrix::Dense *c) { - auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); - auto alpha_val = alpha->at(0, 0); - auto beta_val = beta->at(0, 0); + using a_accessor = + gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + using b_accessor = + gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + + const auto num_stored_elements_per_row = + a->get_num_stored_elements_per_row(); + const auto stride = a->get_stride(); + const auto num_rows = b->get_size()[0]; + const auto a_vals = gko::acc::range( + std::array{num_stored_elements_per_row * stride}, + a->get_const_values()); + const auto b_vals = gko::acc::range( + std::array{num_rows, b->get_stride()}, + b->get_const_values()); + const auto alpha_val = OutputValueType(alpha->at(0, 0)); + const auto beta_val = beta->at(0, 0); for (size_type row = 0; row < a->get_size()[0]; row++) { for (size_type j = 0; j < c->get_size()[1]; j++) { c->at(row, j) *= beta_val; } for (size_type i = 0; i < num_stored_elements_per_row; i++) { - auto val = a->val_at(row, i); + auto val = a_vals(row + i * stride); auto col = a->col_at(row, i); for (size_type j = 0; j < c->get_size()[1]; j++) { - c->at(row, j) += OutputValueType( - alpha_val * val * MatrixValueType(b->at(col, j))); + c->at(row, j) += alpha_val * val * b_vals(col, j); } } } From b9ba1787efa4d7b426cf13caf9b631e8c336a878 Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Fri, 12 Mar 2021 17:55:32 +0100 Subject: [PATCH 03/14] Add tests for mixed precision ELL --- cuda/test/matrix/ell_kernels.cpp | 222 +++++++++++++++++++- hip/test/matrix/ell_kernels.hip.cpp | 221 ++++++++++++++++++++ omp/test/matrix/ell_kernels.cpp | 289 +++++++++++++++++++++++++- reference/test/matrix/ell_kernels.cpp | 254 +++++++++++++++++++++- 4 files changed, 974 insertions(+), 12 deletions(-) diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index dff573a284f..e24f7083728 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -58,6 +58,7 @@ class Ell : public ::testing::Test { protected: using Mtx = gko::matrix::Ell<>; using Vec = gko::matrix::Dense<>; + using Vec2 = gko::matrix::Dense; using ComplexVec = gko::matrix::Dense>; Ell() : rand_engine(42) {} @@ -92,22 +93,35 @@ class Ell : public ::testing::Test { stride); mtx->copy_from(gen_mtx(num_rows, num_cols)); expected = gen_mtx(num_rows, num_vectors); + expected2 = Vec2::create(ref); + expected2->copy_from(expected.get()); y = gen_mtx(num_cols, num_vectors); + y2 = Vec2::create(ref); + y2->copy_from(y.get()); alpha = gko::initialize({2.0}, ref); + alpha2 = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); + beta2 = gko::initialize({-1.0}, ref); dmtx = Mtx::create(cuda); dmtx->copy_from(mtx.get()); dresult = Vec::create(cuda); dresult->copy_from(expected.get()); + dresult2 = Vec2::create(cuda); + dresult2->copy_from(expected2.get()); dy = Vec::create(cuda); dy->copy_from(y.get()); + dy2 = Vec2::create(cuda); + dy2->copy_from(y2.get()); dalpha = Vec::create(cuda); dalpha->copy_from(alpha.get()); + dalpha2 = Vec2::create(cuda); + dalpha2->copy_from(alpha2.get()); dbeta = Vec::create(cuda); dbeta->copy_from(beta.get()); + dbeta2 = Vec2::create(cuda); + dbeta2->copy_from(beta2.get()); } - std::shared_ptr ref; std::shared_ptr cuda; @@ -115,15 +129,23 @@ class Ell : public ::testing::Test { std::unique_ptr mtx; std::unique_ptr expected; + std::unique_ptr expected2; std::unique_ptr y; + std::unique_ptr y2; std::unique_ptr alpha; + std::unique_ptr alpha2; std::unique_ptr beta; + std::unique_ptr beta2; std::unique_ptr dmtx; std::unique_ptr dresult; + std::unique_ptr dresult2; std::unique_ptr dy; + std::unique_ptr dy2; std::unique_ptr dalpha; + std::unique_ptr dalpha2; std::unique_ptr dbeta; + std::unique_ptr dbeta2; }; @@ -138,6 +160,39 @@ TEST_F(Ell, SimpleApplyIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1) +{ + set_up_apply_data(); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef2) +{ + set_up_apply_data(); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3) +{ + set_up_apply_data(); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyIsEquivalentToRef) { set_up_apply_data(); @@ -149,6 +204,39 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1) +{ + set_up_apply_data(); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef2) +{ + set_up_apply_data(); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3) +{ + set_up_apply_data(); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) { set_up_apply_data(532, 231, 1, 300, 600); @@ -160,6 +248,39 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) { set_up_apply_data(532, 231, 1, 300, 600); @@ -170,6 +291,39 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) { set_up_apply_data(532, 231, 3, 300, 600); @@ -181,6 +335,39 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) { set_up_apply_data(532, 231, 3, 300, 600); @@ -192,6 +379,39 @@ TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, SimpleApplyByAtomicIsEquivalentToRef) { set_up_apply_data(10, 10000); diff --git a/hip/test/matrix/ell_kernels.hip.cpp b/hip/test/matrix/ell_kernels.hip.cpp index e9a875eb544..77253125abe 100644 --- a/hip/test/matrix/ell_kernels.hip.cpp +++ b/hip/test/matrix/ell_kernels.hip.cpp @@ -58,6 +58,7 @@ class Ell : public ::testing::Test { protected: using Mtx = gko::matrix::Ell<>; using Vec = gko::matrix::Dense<>; + using Vec2 = gko::matrix::Dense; using ComplexVec = gko::matrix::Dense>; Ell() : rand_engine(42) {} @@ -92,19 +93,33 @@ class Ell : public ::testing::Test { stride); mtx->copy_from(gen_mtx(num_rows, num_cols)); expected = gen_mtx(num_rows, num_vectors); + expected2 = Vec2::create(ref); + expected2->copy_from(expected.get()); y = gen_mtx(num_cols, num_vectors); + y2 = Vec2::create(ref); + y2->copy_from(y.get()); alpha = gko::initialize({2.0}, ref); + alpha2 = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); + beta2 = gko::initialize({-1.0}, ref); dmtx = Mtx::create(hip); dmtx->copy_from(mtx.get()); dresult = Vec::create(hip); dresult->copy_from(expected.get()); + dresult2 = Vec2::create(hip); + dresult2->copy_from(expected2.get()); dy = Vec::create(hip); dy->copy_from(y.get()); + dy2 = Vec2::create(hip); + dy2->copy_from(y2.get()); dalpha = Vec::create(hip); dalpha->copy_from(alpha.get()); + dalpha2 = Vec2::create(hip); + dalpha2->copy_from(alpha2.get()); dbeta = Vec::create(hip); dbeta->copy_from(beta.get()); + dbeta2 = Vec2::create(hip); + dbeta2->copy_from(beta2.get()); } @@ -115,15 +130,23 @@ class Ell : public ::testing::Test { std::unique_ptr mtx; std::unique_ptr expected; + std::unique_ptr expected2; std::unique_ptr y; + std::unique_ptr y2; std::unique_ptr alpha; + std::unique_ptr alpha2; std::unique_ptr beta; + std::unique_ptr beta2; std::unique_ptr dmtx; std::unique_ptr dresult; + std::unique_ptr dresult2; std::unique_ptr dy; + std::unique_ptr dy2; std::unique_ptr dalpha; + std::unique_ptr dalpha2; std::unique_ptr dbeta; + std::unique_ptr dbeta2; }; @@ -138,6 +161,39 @@ TEST_F(Ell, SimpleApplyIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1) +{ + set_up_apply_data(); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef2) +{ + set_up_apply_data(); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3) +{ + set_up_apply_data(); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyIsEquivalentToRef) { set_up_apply_data(); @@ -149,6 +205,39 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1) +{ + set_up_apply_data(); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef2) +{ + set_up_apply_data(); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3) +{ + set_up_apply_data(); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) { set_up_apply_data(532, 231, 1, 300, 600); @@ -160,6 +249,39 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) { set_up_apply_data(532, 231, 1, 300, 600); @@ -170,6 +292,39 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3) +{ + set_up_apply_data(532, 231, 1, 300, 600); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) { set_up_apply_data(532, 231, 3, 300, 600); @@ -181,6 +336,39 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) { set_up_apply_data(532, 231, 3, 300, 600); @@ -192,6 +380,39 @@ TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(532, 231, 3, 300, 600); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, SimpleApplyByAtomicIsEquivalentToRef) { set_up_apply_data(10, 10000); diff --git a/omp/test/matrix/ell_kernels.cpp b/omp/test/matrix/ell_kernels.cpp index 281292e9684..01f1a31121d 100644 --- a/omp/test/matrix/ell_kernels.cpp +++ b/omp/test/matrix/ell_kernels.cpp @@ -57,6 +57,7 @@ class Ell : public ::testing::Test { protected: using Mtx = gko::matrix::Ell<>; using Vec = gko::matrix::Dense<>; + using Vec2 = gko::matrix::Dense; using ComplexVec = gko::matrix::Dense>; Ell() : rand_engine(42) {} @@ -90,19 +91,33 @@ class Ell : public ::testing::Test { mtx = Mtx::create(ref, gko::dim<2>{}, max_nonzeros_per_row, stride); mtx->copy_from(gen_mtx(532, 231, 1)); expected = gen_mtx(532, num_vectors, 1); + expected2 = Vec2::create(ref); + expected2->copy_from(expected.get()); y = gen_mtx(231, num_vectors, 1); + y2 = Vec2::create(ref); + y2->copy_from(y.get()); alpha = gko::initialize({2.0}, ref); + alpha2 = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); + beta2 = gko::initialize({-1.0}, ref); dmtx = Mtx::create(omp); dmtx->copy_from(mtx.get()); dresult = Vec::create(omp); dresult->copy_from(expected.get()); + dresult2 = Vec2::create(omp); + dresult2->copy_from(expected2.get()); dy = Vec::create(omp); dy->copy_from(y.get()); + dy2 = Vec2::create(omp); + dy2->copy_from(y2.get()); dalpha = Vec::create(omp); dalpha->copy_from(alpha.get()); + dalpha2 = Vec2::create(omp); + dalpha2->copy_from(alpha2.get()); dbeta = Vec::create(omp); dbeta->copy_from(beta.get()); + dbeta2 = Vec2::create(omp); + dbeta2->copy_from(beta2.get()); } std::shared_ptr ref; @@ -112,15 +127,23 @@ class Ell : public ::testing::Test { std::unique_ptr mtx; std::unique_ptr expected; + std::unique_ptr expected2; std::unique_ptr y; + std::unique_ptr y2; std::unique_ptr alpha; + std::unique_ptr alpha2; std::unique_ptr beta; + std::unique_ptr beta2; std::unique_ptr dmtx; std::unique_ptr dresult; + std::unique_ptr dresult2; std::unique_ptr dy; + std::unique_ptr dy2; std::unique_ptr dalpha; + std::unique_ptr dalpha2; std::unique_ptr dbeta; + std::unique_ptr dbeta2; }; @@ -135,6 +158,39 @@ TEST_F(Ell, SimpleApplyIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1) +{ + set_up_apply_data(); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef2) +{ + set_up_apply_data(); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3) +{ + set_up_apply_data(); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyIsEquivalentToRef) { set_up_apply_data(); @@ -146,6 +202,39 @@ TEST_F(Ell, AdvancedApplyIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1) +{ + set_up_apply_data(); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef2) +{ + set_up_apply_data(); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3) +{ + set_up_apply_data(); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, SimpleApplyWithPaddingIsEquivalentToRef) { set_up_apply_data(300, 600); @@ -157,6 +246,39 @@ TEST_F(Ell, SimpleApplyWithPaddingIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyWithPaddingIsEquivalentToRef1) +{ + set_up_apply_data(300, 600); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithPaddingIsEquivalentToRef2) +{ + set_up_apply_data(300, 600); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithPaddingIsEquivalentToRef3) +{ + set_up_apply_data(300, 600); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyWithPaddingIsEquivalentToRef) { set_up_apply_data(300, 600); @@ -167,6 +289,39 @@ TEST_F(Ell, AdvancedApplyWithPaddingIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyWithPaddingIsEquivalentToRef1) +{ + set_up_apply_data(300, 600); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithPaddingIsEquivalentToRef2) +{ + set_up_apply_data(300, 600); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithPaddingIsEquivalentToRef3) +{ + set_up_apply_data(300, 600); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, SimpleApplyToDenseMatrixIsEquivalentToRef) { set_up_apply_data(0, 0, 3); @@ -178,6 +333,39 @@ TEST_F(Ell, SimpleApplyToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(0, 0, 3); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(0, 0, 3); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(0, 0, 3); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyToDenseMatrixIsEquivalentToRef) { set_up_apply_data(0, 0, 3); @@ -189,7 +377,40 @@ TEST_F(Ell, AdvancedApplyToDenseMatrixIsEquivalentToRef) } -TEST_F(Ell, SimpleApplyWithPaddinToDenseMatrixIsEquivalentToRef) +TEST_F(Ell, MixedAdvancedApplyToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(0, 0, 3); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(0, 0, 3); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(0, 0, 3); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, SimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef) { set_up_apply_data(300, 600, 3); @@ -200,6 +421,39 @@ TEST_F(Ell, SimpleApplyWithPaddinToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedSimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(300, 600, 3); + + mtx->apply(y2.get(), expected2.get()); + dmtx->apply(dy2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(300, 600, 3); + + mtx->apply(y2.get(), expected.get()); + dmtx->apply(dy2.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedSimpleApplyWithPaddingToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(300, 600, 3); + + mtx->apply(y.get(), expected2.get()); + dmtx->apply(dy.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, AdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef) { set_up_apply_data(300, 600, 3); @@ -210,6 +464,39 @@ TEST_F(Ell, AdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef) } +TEST_F(Ell, MixedAdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef1) +{ + set_up_apply_data(300, 600, 3); + + mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef2) +{ + set_up_apply_data(300, 600, 3); + + mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); + dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Ell, MixedAdvancedApplyWithPaddingToDenseMatrixIsEquivalentToRef3) +{ + set_up_apply_data(300, 600, 3); + + mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); + dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); + + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); +} + + TEST_F(Ell, ApplyToComplexIsEquivalentToRef) { set_up_apply_data(300, 600); diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp index 77436ad1287..c3326737e4f 100644 --- a/reference/test/matrix/ell_kernels.cpp +++ b/reference/test/matrix/ell_kernels.cpp @@ -120,11 +120,46 @@ TYPED_TEST(Ell, AppliesToDenseVector) } -TYPED_TEST(Ell, AppliesToMixedDenseVector) +TYPED_TEST(Ell, MixedAppliesToDenseVector1) { - using MixedVec = typename TestFixture::MixedVec; - auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); - auto y = MixedVec::create(this->exec, gko::dim<2>{2, 1}); + // Both vectors have the same value type which differs from the matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec = typename gko::matrix::Dense; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); + + this->mtx1->apply(x.get(), y.get()); + + GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0); +} + + +TYPED_TEST(Ell, MixedAppliesToDenseVector2) +{ + // Input vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec2::create(this->exec, gko::dim<2>{2, 1}); + + this->mtx1->apply(x.get(), y.get()); + + GKO_ASSERT_MTX_NEAR(y, l({13.0, 5.0}), 0.0); +} + + +TYPED_TEST(Ell, MixedAppliesToDenseVector3) +{ + // Output vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense>; + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = Vec1::create(this->exec, gko::dim<2>{2, 1}); this->mtx1->apply(x.get(), y.get()); @@ -154,6 +189,80 @@ TYPED_TEST(Ell, AppliesToDenseMatrix) } +TYPED_TEST(Ell, MixedAppliesToDenseMatrix1) +{ + // Both vectors have the same value type which differs from the matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec = gko::matrix::Dense; + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); + // clang-format on + auto y = Vec::create(this->exec, gko::dim<2>{2}); + + this->mtx1->apply(x.get(), y.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(y, + l({{13.0, 3.5}, + { 5.0, -7.5}}), 0.0); + // clang-format on +} + + +TYPED_TEST(Ell, MixedAppliesToDenseMatrix2) +{ + // Input vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); + // clang-format on + auto y = Vec2::create(this->exec, gko::dim<2>{2}); + + this->mtx1->apply(x.get(), y.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(y, + l({{13.0, 3.5}, + { 5.0, -7.5}}), 0.0); + // clang-format on +} + + +TYPED_TEST(Ell, MixedAppliesToDenseMatrix3) +{ + // Output vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); + // clang-format on + auto y = Vec1::create(this->exec, gko::dim<2>{2}); + + this->mtx1->apply(x.get(), y.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(y, + l({{13.0, 3.5}, + { 5.0, -7.5}}), 0.0); + // clang-format on +} + + TYPED_TEST(Ell, AppliesLinearCombinationToDenseVector) { using Vec = typename TestFixture::Vec; @@ -168,13 +277,52 @@ TYPED_TEST(Ell, AppliesLinearCombinationToDenseVector) } -TYPED_TEST(Ell, AppliesLinearCombinationToMixedDenseVector) +TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector1) +{ + // Both vectors have the same value type which differs from the matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec = gko::matrix::Dense; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); + + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + + GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0); +} + + +TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector2) { - using MixedVec = typename TestFixture::MixedVec; - auto alpha = gko::initialize({-1.0}, this->exec); - auto beta = gko::initialize({2.0}, this->exec); - auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); - auto y = gko::initialize({1.0, 2.0}, this->exec); + // Input vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); + + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + + GKO_ASSERT_MTX_NEAR(y, l({-11.0, -1.0}), 0.0); +} + + +TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector3) +{ + // Output vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); + auto y = gko::initialize({1.0, 2.0}, this->exec); this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); @@ -208,6 +356,92 @@ TYPED_TEST(Ell, AppliesLinearCombinationToDenseMatrix) } +TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix1) +{ + // Both vectors have the same value type which differs from the matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec = gko::matrix::Dense; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); + auto y = gko::initialize( + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); + // clang-format on + + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(y, + l({{-11.0, -2.5}, + { -1.0, 4.5}}), 0.0); + // clang-format on +} + + +TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix2) +{ + // Input vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); + auto y = gko::initialize( + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); + // clang-format on + + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(y, + l({{-11.0, -2.5}, + { -1.0, 4.5}}), 0.0); + // clang-format on +} + + +TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix3) +{ + // Output vector has same value type as matrix + using T = typename TestFixture::value_type; + using next_T = gko::next_precision; + using Vec1 = typename TestFixture::Vec; + using Vec2 = gko::matrix::Dense; + auto alpha = gko::initialize({-1.0}, this->exec); + auto beta = gko::initialize({2.0}, this->exec); + // clang-format off + auto x = gko::initialize( + {I{2.0, 3.0}, + I{1.0, -1.5}, + I{4.0, 2.5}}, this->exec); + auto y = gko::initialize( + {I{1.0, 0.5}, + I{2.0, -1.5}}, this->exec); + // clang-format on + + this->mtx1->apply(alpha.get(), x.get(), beta.get(), y.get()); + + // clang-format off + GKO_ASSERT_MTX_NEAR(y, + l({{-11.0, -2.5}, + { -1.0, 4.5}}), 0.0); + // clang-format on +} + + TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension) { using Vec = typename TestFixture::Vec; From 420384c9574c04505ff8944796a9377576def567 Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Mon, 15 Mar 2021 12:27:24 +0100 Subject: [PATCH 04/14] adjust tolerances for lp value types --- cuda/test/matrix/ell_kernels.cpp | 24 ++++++++++++------------ hip/test/matrix/ell_kernels.hip.cpp | 24 ++++++++++++------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index e24f7083728..0df427f139e 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -167,7 +167,7 @@ TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1) mtx->apply(y2.get(), expected2.get()); dmtx->apply(dy2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -189,7 +189,7 @@ TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3) mtx->apply(y.get(), expected2.get()); dmtx->apply(dy.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -211,7 +211,7 @@ TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1) mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -233,7 +233,7 @@ TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3) mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -255,7 +255,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1) mtx->apply(y2.get(), expected2.get()); dmtx->apply(dy2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -277,7 +277,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3) mtx->apply(y.get(), expected2.get()); dmtx->apply(dy.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -298,7 +298,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1) mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -320,7 +320,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3) mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -342,7 +342,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1) mtx->apply(y2.get(), expected2.get()); dmtx->apply(dy2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -364,7 +364,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3) mtx->apply(y.get(), expected2.get()); dmtx->apply(dy.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -386,7 +386,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1) mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -408,7 +408,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3) mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } diff --git a/hip/test/matrix/ell_kernels.hip.cpp b/hip/test/matrix/ell_kernels.hip.cpp index 77253125abe..9d1e1440544 100644 --- a/hip/test/matrix/ell_kernels.hip.cpp +++ b/hip/test/matrix/ell_kernels.hip.cpp @@ -168,7 +168,7 @@ TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef1) mtx->apply(y2.get(), expected2.get()); dmtx->apply(dy2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -190,7 +190,7 @@ TEST_F(Ell, MixedSimpleApplyIsEquivalentToRef3) mtx->apply(y.get(), expected2.get()); dmtx->apply(dy.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -212,7 +212,7 @@ TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef1) mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -234,7 +234,7 @@ TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3) mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -256,7 +256,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1) mtx->apply(y2.get(), expected2.get()); dmtx->apply(dy2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -278,7 +278,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3) mtx->apply(y.get(), expected2.get()); dmtx->apply(dy.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -299,7 +299,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1) mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -321,7 +321,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3) mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -343,7 +343,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1) mtx->apply(y2.get(), expected2.get()); dmtx->apply(dy2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -365,7 +365,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3) mtx->apply(y.get(), expected2.get()); dmtx->apply(dy.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -387,7 +387,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1) mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } @@ -409,7 +409,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3) mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); - GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult2, expected2, 1e-6); } From b2e10a77f606f71eb2804a0b51aa305521905bc5 Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Mon, 15 Mar 2021 13:43:42 +0100 Subject: [PATCH 05/14] use size_type for accessor dimensions --- cuda/matrix/ell_kernels.cu | 7 +++---- hip/matrix/ell_kernels.hip.cpp | 7 +++---- omp/matrix/ell_kernels.cpp | 8 ++++---- reference/matrix/ell_kernels.cpp | 8 ++++---- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 33db65c4c1c..2dc669ba5a7 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -148,11 +148,10 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, b->get_size()[1], 1); const auto a_vals = gko::acc::range( - std::array{ - {num_stored_elements_per_row * stride}}, + std::array{{num_stored_elements_per_row * stride}}, a->get_const_values()); const auto b_vals = gko::acc::range( - std::array{{nrows, b->get_stride()}}, + std::array{{nrows, b->get_stride()}}, b->get_const_values()); if (alpha == nullptr && beta == nullptr) { @@ -164,7 +163,7 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, as_cuda_type(c->get_values()), c->get_stride()); } else if (alpha != nullptr && beta != nullptr) { const auto alpha_val = gko::acc::range( - std::array{1}, alpha->get_const_values()); + std::array{1}, alpha->get_const_values()); kernel::spmv <<>>( nrows, num_worker_per_row, as_cuda_accessor(alpha_val), diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 8bcadd02370..24966a64113 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -152,11 +152,10 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, b->get_size()[1], 1); const auto a_vals = gko::acc::range( - std::array{ - {num_stored_elements_per_row * stride}}, + std::array{{num_stored_elements_per_row * stride}}, a->get_const_values()); const auto b_vals = gko::acc::range( - std::array{{nrows, b->get_stride()}}, + std::array{{nrows, b->get_stride()}}, b->get_const_values()); if (alpha == nullptr && beta == nullptr) { @@ -168,7 +167,7 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, b->get_stride(), as_hip_type(c->get_values()), c->get_stride()); } else if (alpha != nullptr && beta != nullptr) { const auto alpha_val = gko::acc::range( - std::array{1}, alpha->get_const_values()); + std::array{1}, alpha->get_const_values()); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel::spmv), dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row, diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index 76e7eea80ed..040176053d6 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -74,10 +74,10 @@ void spmv(std::shared_ptr exec, const auto stride = a->get_stride(); const auto num_rows = b->get_size()[0]; const auto a_vals = gko::acc::range( - std::array{num_stored_elements_per_row * stride}, + std::array{num_stored_elements_per_row * stride}, a->get_const_values()); const auto b_vals = gko::acc::range( - std::array{num_rows, b->get_stride()}, + std::array{num_rows, b->get_stride()}, b->get_const_values()); #pragma omp parallel for @@ -118,10 +118,10 @@ void advanced_spmv(std::shared_ptr exec, const auto stride = a->get_stride(); const auto num_rows = b->get_size()[0]; const auto a_vals = gko::acc::range( - std::array{num_stored_elements_per_row * stride}, + std::array{num_stored_elements_per_row * stride}, a->get_const_values()); const auto b_vals = gko::acc::range( - std::array{num_rows, b->get_stride()}, + std::array{num_rows, b->get_stride()}, b->get_const_values()); const auto alpha_val = OutputValueType(alpha->at(0, 0)); const auto beta_val = beta->at(0, 0); diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index ba6d687d18a..8b0f541799a 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -70,10 +70,10 @@ void spmv(std::shared_ptr exec, const auto stride = a->get_stride(); const auto num_rows = b->get_size()[0]; const auto a_vals = gko::acc::range( - std::array{num_stored_elements_per_row * stride}, + std::array{num_stored_elements_per_row * stride}, a->get_const_values()); const auto b_vals = gko::acc::range( - std::array{num_rows, b->get_stride()}, + std::array{num_rows, b->get_stride()}, b->get_const_values()); for (size_type row = 0; row < a->get_size()[0]; row++) { @@ -113,10 +113,10 @@ void advanced_spmv(std::shared_ptr exec, const auto stride = a->get_stride(); const auto num_rows = b->get_size()[0]; const auto a_vals = gko::acc::range( - std::array{num_stored_elements_per_row * stride}, + std::array{num_stored_elements_per_row * stride}, a->get_const_values()); const auto b_vals = gko::acc::range( - std::array{num_rows, b->get_stride()}, + std::array{num_rows, b->get_stride()}, b->get_const_values()); const auto alpha_val = OutputValueType(alpha->at(0, 0)); const auto beta_val = beta->at(0, 0); From d0a4e44d5ad45793e303f033d9864b1bd4d131d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Wed, 17 Mar 2021 11:29:28 +0100 Subject: [PATCH 06/14] Fix HIP and CUDA ELL SpMV Debug error --- cuda/matrix/ell_kernels.cu | 4 ++-- hip/matrix/ell_kernels.hip.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 2dc669ba5a7..57617269ed0 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -151,8 +151,8 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, std::array{{num_stored_elements_per_row * stride}}, a->get_const_values()); const auto b_vals = gko::acc::range( - std::array{{nrows, b->get_stride()}}, - b->get_const_values()); + std::array{{b->get_size()[0], b->get_size()[1]}}, + b->get_const_values(), std::array{{b->get_stride()}}); if (alpha == nullptr && beta == nullptr) { kernel::spmv diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 24966a64113..f1972635385 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -155,8 +155,8 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, std::array{{num_stored_elements_per_row * stride}}, a->get_const_values()); const auto b_vals = gko::acc::range( - std::array{{nrows, b->get_stride()}}, - b->get_const_values()); + std::array{{b->get_size()[0], b->get_size()[1]}}, + b->get_const_values(), std::array{{b->get_stride()}}); if (alpha == nullptr && beta == nullptr) { hipLaunchKernelGGL( From 319491cebabf8346248ffc9156667a94d9893d0c Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Thu, 18 Mar 2021 11:07:22 +0100 Subject: [PATCH 07/14] Code Review --- common/matrix/ell_kernels.hpp.inc | 18 +++++++++--------- cuda/matrix/ell_kernels.cu | 6 +++--- dpcpp/matrix/ell_kernels.dp.cpp | 27 +++++++++++++++------------ hip/matrix/ell_kernels.hip.cpp | 7 +++---- omp/matrix/ell_kernels.cpp | 8 ++++---- reference/matrix/ell_kernels.cpp | 8 ++++---- 6 files changed, 38 insertions(+), 36 deletions(-) diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc index 79ed4ed979b..2323d512258 100644 --- a/common/matrix/ell_kernels.hpp.inc +++ b/common/matrix/ell_kernels.hpp.inc @@ -41,8 +41,8 @@ __device__ void spmv_kernel( const size_type num_rows, const int num_worker_per_row, acc::range val, const IndexType *__restrict__ col, const size_type stride, const size_type num_stored_elements_per_row, - acc::range b, const size_type b_stride, - OutputValueType *__restrict__ c, const size_type c_stride, Closure op) + acc::range b, OutputValueType *__restrict__ c, + const size_type c_stride, Closure op) { const auto tidx = thread::get_thread_id_flat(); const decltype(tidx) column_id = blockIdx.y; @@ -109,12 +109,12 @@ __global__ __launch_bounds__(default_block_size) void spmv( const size_type num_rows, const int num_worker_per_row, acc::range val, const IndexType *__restrict__ col, const size_type stride, const size_type num_stored_elements_per_row, - acc::range b, const size_type b_stride, - OutputValueType *__restrict__ c, const size_type c_stride) + acc::range b, OutputValueType *__restrict__ c, + const size_type c_stride) { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, b_stride, c, c_stride, + num_stored_elements_per_row, b, c, c_stride, [](const OutputValueType &x, const OutputValueType &y) { return x; }); } @@ -126,8 +126,8 @@ __global__ __launch_bounds__(default_block_size) void spmv( acc::range alpha, acc::range val, const IndexType *__restrict__ col, const size_type stride, const size_type num_stored_elements_per_row, acc::range b, - const size_type b_stride, const OutputValueType *__restrict__ beta, - OutputValueType *__restrict__ c, const size_type c_stride) + const OutputValueType *__restrict__ beta, OutputValueType *__restrict__ c, + const size_type c_stride) { const OutputValueType alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; @@ -138,14 +138,14 @@ __global__ __launch_bounds__(default_block_size) void spmv( if (atomic) { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, b_stride, c, c_stride, + num_stored_elements_per_row, b, c, c_stride, [&alpha_val](const OutputValueType &x, const OutputValueType &y) { return alpha_val * x; }); } else { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, b_stride, c, c_stride, + num_stored_elements_per_row, b, c, c_stride, [&alpha_val, &beta_val](const OutputValueType &x, const OutputValueType &y) { return alpha_val * x + beta_val * y; diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 57617269ed0..8808a502ebb 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -159,8 +159,8 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, <<>>( nrows, num_worker_per_row, as_cuda_accessor(a_vals), a->get_const_col_idxs(), stride, num_stored_elements_per_row, - as_cuda_accessor(b_vals), b->get_stride(), - as_cuda_type(c->get_values()), c->get_stride()); + as_cuda_accessor(b_vals), as_cuda_type(c->get_values()), + c->get_stride()); } else if (alpha != nullptr && beta != nullptr) { const auto alpha_val = gko::acc::range( std::array{1}, alpha->get_const_values()); @@ -169,7 +169,7 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, nrows, num_worker_per_row, as_cuda_accessor(alpha_val), as_cuda_accessor(a_vals), a->get_const_col_idxs(), stride, num_stored_elements_per_row, as_cuda_accessor(b_vals), - b->get_stride(), as_cuda_type(beta->get_const_values()), + as_cuda_type(beta->get_const_values()), as_cuda_type(c->get_values()), c->get_stride()); } else { GKO_KERNEL_NOT_FOUND; diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 01a40b5d9b5..b4dfeafeb54 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -56,24 +56,27 @@ namespace dpcpp { namespace ell { -template +template void spmv(std::shared_ptr exec, - const matrix::Ell *a, - const matrix::Dense *b, - matrix::Dense *c) GKO_NOT_IMPLEMENTED; + const matrix::Ell *a, + const matrix::Dense *b, + matrix::Dense *c) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELL_SPMV_KERNEL); +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELL_SPMV_KERNEL); -template +template void advanced_spmv(std::shared_ptr exec, - const matrix::Dense *alpha, - const matrix::Ell *a, - const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) GKO_NOT_IMPLEMENTED; + const matrix::Dense *alpha, + const matrix::Ell *a, + const matrix::Dense *b, + const matrix::Dense *beta, + matrix::Dense *c) GKO_NOT_IMPLEMENTED; -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index f1972635385..fc10fc7d0e1 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -164,7 +164,7 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row, as_hip_accessor(a_vals), a->get_const_col_idxs(), stride, num_stored_elements_per_row, as_hip_accessor(b_vals), - b->get_stride(), as_hip_type(c->get_values()), c->get_stride()); + as_hip_type(c->get_values()), c->get_stride()); } else if (alpha != nullptr && beta != nullptr) { const auto alpha_val = gko::acc::range( std::array{1}, alpha->get_const_values()); @@ -173,9 +173,8 @@ void abstract_spmv(syn::value_list, int num_worker_per_row, dim3(grid_size), dim3(block_size), 0, 0, nrows, num_worker_per_row, as_hip_accessor(alpha_val), as_hip_accessor(a_vals), a->get_const_col_idxs(), stride, num_stored_elements_per_row, - as_hip_accessor(b_vals), b->get_stride(), - as_hip_type(beta->get_const_values()), as_hip_type(c->get_values()), - c->get_stride()); + as_hip_accessor(b_vals), as_hip_type(beta->get_const_values()), + as_hip_type(c->get_values()), c->get_stride()); } else { GKO_KERNEL_NOT_FOUND; } diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index 040176053d6..db7fe3cbc0d 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -77,8 +77,8 @@ void spmv(std::shared_ptr exec, std::array{num_stored_elements_per_row * stride}, a->get_const_values()); const auto b_vals = gko::acc::range( - std::array{num_rows, b->get_stride()}, - b->get_const_values()); + std::array{{b->get_size()[0], b->get_size()[1]}}, + b->get_const_values(), std::array{{b->get_stride()}}); #pragma omp parallel for for (size_type row = 0; row < a->get_size()[0]; row++) { @@ -121,8 +121,8 @@ void advanced_spmv(std::shared_ptr exec, std::array{num_stored_elements_per_row * stride}, a->get_const_values()); const auto b_vals = gko::acc::range( - std::array{num_rows, b->get_stride()}, - b->get_const_values()); + std::array{{b->get_size()[0], b->get_size()[1]}}, + b->get_const_values(), std::array{{b->get_stride()}}); const auto alpha_val = OutputValueType(alpha->at(0, 0)); const auto beta_val = beta->at(0, 0); diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index 8b0f541799a..58df834bfba 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -73,8 +73,8 @@ void spmv(std::shared_ptr exec, std::array{num_stored_elements_per_row * stride}, a->get_const_values()); const auto b_vals = gko::acc::range( - std::array{num_rows, b->get_stride()}, - b->get_const_values()); + std::array{{b->get_size()[0], b->get_size()[1]}}, + b->get_const_values(), std::array{{b->get_stride()}}); for (size_type row = 0; row < a->get_size()[0]; row++) { for (size_type j = 0; j < c->get_size()[1]; j++) { @@ -116,8 +116,8 @@ void advanced_spmv(std::shared_ptr exec, std::array{num_stored_elements_per_row * stride}, a->get_const_values()); const auto b_vals = gko::acc::range( - std::array{num_rows, b->get_stride()}, - b->get_const_values()); + std::array{{b->get_size()[0], b->get_size()[1]}}, + b->get_const_values(), std::array{{b->get_stride()}}); const auto alpha_val = OutputValueType(alpha->at(0, 0)); const auto beta_val = beta->at(0, 0); From 07bf3be16d26de25a66051bd82070bd89c4743f9 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 15 Apr 2021 13:44:43 +0200 Subject: [PATCH 08/14] add GINKGO_MIXED_PRECISION flag and dispatch --- .gitlab-ci.yml | 41 +++++++ CMakeLists.txt | 1 + INSTALL.md | 4 + cmake/get_info.cmake | 2 +- core/base/mixed_precision_types.hpp | 108 ++++++++++++++++++ core/device_hooks/common_kernels.inc.cpp | 1 + core/matrix/ell.cpp | 13 ++- cuda/matrix/ell_kernels.cu | 1 + dpcpp/matrix/ell_kernels.dp.cpp | 1 + hip/matrix/ell_kernels.hip.cpp | 1 + include/ginkgo/config.hpp.in | 4 + .../ginkgo/core/base/precision_dispatch.hpp | 94 +++++++++++++++ include/ginkgo/core/base/types.hpp | 51 --------- omp/matrix/ell_kernels.cpp | 1 + reference/matrix/ell_kernels.cpp | 1 + 15 files changed, 267 insertions(+), 57 deletions(-) create mode 100644 core/base/mixed_precision_types.hpp diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bbe85b1fbe8..11e66475301 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -32,6 +32,7 @@ include: BUILD_HWLOC: "ON" FAST_TESTS: "OFF" DPCPP_SINGLE_MODE: "OFF" + MIXED_PRECISION: "ON" RUN_EXAMPLES: "OFF" CONFIG_LOG: "ON" CXX_FLAGS: "" @@ -77,6 +78,7 @@ include: -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC} -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON -DGINKGO_FAST_TESTS=${FAST_TESTS} + -DGINKGO_MIXED_PRECISION=${MIXED_PRECISION} -DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES} -DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG} -DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE} @@ -111,6 +113,7 @@ include: -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC} -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON -DGINKGO_FAST_TESTS=${FAST_TESTS} + -DGINKGO_MIXED_PRECISION=${MIXED_PRECISION} -DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG} -DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE} -DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES} @@ -681,6 +684,44 @@ build/nocuda/intel/omp/release/static: BUILD_TYPE: "Release" BUILD_SHARED_LIBS: "OFF" +build/nocuda-nomixed/gcc/omp/release/shared: + <<: *default_build_with_test + extends: + - .quick_test_condition + - .use_gko-nocuda-gnu9-llvm8 + variables: + <<: *default_variables + BUILD_OMP: "ON" + BUILD_TYPE: "Release" + MIXED_PRECISION: "OFF" + +build/nocuda-nomixed/clang/omp/debug/static: + <<: *default_build_with_test + extends: + - .full_test_condition + - .use_gko-nocuda-gnu9-llvm8 + variables: + <<: *default_variables + C_COMPILER: "clang" + CXX_COMPILER: "clang++" + BUILD_OMP: "ON" + BUILD_TYPE: "Debug" + BUILD_SHARED_LIBS: "OFF" + MIXED_PRECISION: "OFF" + +build/nocuda-nomixed/intel/omp/release/static: + <<: *default_build_with_test + extends: + - .full_test_condition + - .use_gko-nocuda-gnu9-llvm8-intel + variables: + <<: *default_variables + C_COMPILER: "icc" + CXX_COMPILER: "icpc" + BUILD_OMP: "ON" + BUILD_TYPE: "Release" + BUILD_SHARED_LIBS: "OFF" + MIXED_PRECISION: "OFF" build/dpcpp/cpu/release/static: <<: *default_build_with_test diff --git a/CMakeLists.txt b/CMakeLists.txt index 1318a6900d4..f0d68b51a34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA}) option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP}) option(GINKGO_BUILD_DOC "Generate documentation" OFF) option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF) +option(GINKGO_MIXED_PRECISION "Add true mixed-precision kernel (otherwise they will be conversion-based)" OFF) option(GINKGO_SKIP_DEPENDENCY_UPDATE "Do not update dependencies each time the project is rebuilt" ON) option(GINKGO_EXPORT_BUILD_DIR diff --git a/INSTALL.md b/INSTALL.md index d85bcc48329..4fcc60d9ae4 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -18,6 +18,10 @@ Ginkgo adds the following additional switches to control what is being built: * `-DGINKGO_DEVEL_TOOLS={ON, OFF}` sets up the build system for development (requires clang-format, will also download git-cmake-format), default is `OFF`. +* `-DGINKGO_MIXED_PRECISION={ON, OFF}` compiles true mixed-precision kernels + instead of converting data on the fly, default is `OFF`. + Enabling this flag increases the library size, but improves performance of + mixed-precision kernels. * `-DGINKGO_BUILD_TESTS={ON, OFF}` builds Ginkgo's tests (will download googletest), default is `ON`. * `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 835bdf0c45b..508bb6e0191 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -127,7 +127,7 @@ foreach(log_type ${log_types}) ginkgo_print_module_footer(${${log_type}} "User configuration:") ginkgo_print_module_footer(${${log_type}} " Enabled modules:") ginkgo_print_foreach_variable(${${log_type}} - "GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP") + "GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP;GINKGO_MIXED_PRECISION") ginkgo_print_module_footer(${${log_type}} " Tests, benchmarks and examples:") ginkgo_print_foreach_variable(${${log_type}} "GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING") diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp new file mode 100644 index 00000000000..079fcb8a075 --- /dev/null +++ b/core/base/mixed_precision_types.hpp @@ -0,0 +1,108 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_ +#define GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_ + + +#include +#include + + +#ifdef GINKGO_MIXED_PRECISION +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ + template _macro(float, float, float, int32); \ + template _macro(float, float, double, int32); \ + template _macro(float, double, float, int32); \ + template _macro(float, double, double, int32); \ + template _macro(double, float, float, int32); \ + template _macro(double, float, double, int32); \ + template _macro(double, double, float, int32); \ + template _macro(double, double, double, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(float, float, float, int64); \ + template _macro(float, float, double, int64); \ + template _macro(float, double, float, int64); \ + template _macro(float, double, double, int64); \ + template _macro(double, float, float, int64); \ + template _macro(double, float, double, int64); \ + template _macro(double, double, float, int64); \ + template _macro(double, double, double, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64) +#else +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ + template _macro(float, float, float, int32); \ + template _macro(double, double, double, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(std::complex, std::complex, \ + std::complex, int32); \ + template _macro(float, float, float, int64); \ + template _macro(double, double, double, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64); \ + template _macro(std::complex, std::complex, \ + std::complex, int64) +#endif + + +#endif // GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_ diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 42121074df4..810a3fab4e4 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/mixed_precision_types.hpp" #include "core/components/absolute_array.hpp" #include "core/components/fill_array.hpp" #include "core/components/precision_conversion.hpp" diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 0dc9a950f75..fcb198106cd 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -102,7 +102,7 @@ size_type calculate_max_nnz_per_row( template void Ell::apply_impl(const LinOp *b, LinOp *x) const { - precision_dispatch_real_complex( + mixed_precision_dispatch_real_complex( [this](auto dense_b, auto dense_x) { this->get_executor()->run(ell::make_spmv(this, dense_b, dense_x)); }, @@ -114,12 +114,15 @@ template void Ell::apply_impl(const LinOp *alpha, const LinOp *b, const LinOp *beta, LinOp *x) const { - precision_dispatch_real_complex( - [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { + mixed_precision_dispatch_real_complex( + [this, alpha, beta](auto dense_b, auto dense_x) { + auto dense_alpha = make_temporary_conversion(alpha); + auto dense_beta = make_temporary_conversion< + typename std::decay_t::value_type>(beta); this->get_executor()->run(ell::make_advanced_spmv( - dense_alpha, this, dense_b, dense_beta, dense_x)); + dense_alpha.get(), this, dense_b, dense_beta.get(), dense_x)); }, - alpha, b, beta, x); + b, x); } diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 8808a502ebb..953ce4be8d7 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array.hpp" #include "core/components/prefix_sum.hpp" #include "core/matrix/dense_kernels.hpp" diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index b4dfeafeb54..3c1d93eb323 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/mixed_precision_types.hpp" #include "dpcpp/components/format_conversion.dp.hpp" diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index fc10fc7d0e1..b1ad7bf4662 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -47,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array.hpp" #include "core/components/prefix_sum.hpp" #include "core/matrix/dense_kernels.hpp" diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index edef91e6d9e..1c6a31ea481 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -63,6 +63,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #cmakedefine GINKGO_BENCHMARK_ENABLE_TUNING +/* Should we compile mixed-precision kernels for Ginkgo? */ +#cmakedefine GINKGO_MIXED_PRECISION + + /* What is HIP compiled for, hcc or nvcc? */ // clang-format off #define GINKGO_HIP_PLATFORM_HCC @GINKGO_HIP_PLATFORM_HCC@ diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 5588d566b41..17901e207c1 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -226,6 +226,100 @@ void precision_dispatch_real_complex(Function fn, const LinOp *alpha, } +/** + * Calls the given function with each given argument LinOp + * converted into matrix::Dense as parameters. + * + * If GINKGO_MIXED_PRECISION is defined, this means that the function will be + * called with its dynamic type with its static type, so the (templated/generic) + * function will be instantiated with all pairs of Dense and + * Dense> parameter types, and the appropriate + * overload will be called based on the dynamic type of the parameter. + * + * If GINKGO_MIXED_PRECISION is not defined, it will behave exactly like + * precision_dispatch. + * + * + * @param fn the given function. It will be passed one (potentially const) + * matrix::Dense<...>* parameter per parameter in the parameter + * pack `linops`. + * @param linops the given arguments to be converted and passed on to fn. + * + * @tparam ValueType the value type to use for the parameters of `fn`. + * @tparam Function the function pointer, lambda or other functor type to call + * with the converted arguments. + * @tparam Args the argument type list. + */ +template +void mixed_precision_dispatch(Function fn, const LinOp *in, LinOp *out) +{ +#ifdef GINKGO_MIXED_PRECISION + using fst_type = matrix::Dense; + using snd_type = matrix::Dense>; + if (auto dense_in = dynamic_cast(in)) { + if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else { + GKO_NOT_SUPPORTED(out); + } + } else if (auto dense_in = dynamic_cast(in)) { + if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else { + GKO_NOT_SUPPORTED(out); + } + } else { + GKO_NOT_SUPPORTED(in); + } +#else + precision_dispatch(fn, in, out); +#endif +} + + +/** + * Calls the given function with the given LinOps cast to their dynamic type + * matrix::Dense* as parameters. + * If ValueType is real and both `in` and `out` are complex, uses + * matrix::Dense::get_real_view() to convert them into real matrices after + * precision conversion. + * + * @see mixed_precision_dispatch() + */ +template +void mixed_precision_dispatch_real_complex(Function fn, const LinOp *in, + LinOp *out) +{ +#ifdef GINKGO_MIXED_PRECISION + // do we need to convert complex Dense to real Dense? + auto complex_to_real = + !(is_complex() || + dynamic_cast> *>(in)); + if (complex_to_real) { + mixed_precision_dispatch>( + [&fn](auto dense_in, auto dense_out) { + using Dense = matrix::Dense; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dynamic_cast( + dense_in->create_real_view().get()), + dynamic_cast(dense_out->create_real_view().get())); + }, + in, out); + } else { + mixed_precision_dispatch(fn, in, out); + } +#else + precision_dispatch_real_complex(fn, in, out); +#endif +} + + } // namespace gko diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 2f6da1e9e42..2c77aaef826 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -524,57 +524,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #endif -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(float, float, float, int32); \ - template _macro(float, float, double, int32); \ - template _macro(float, double, float, int32); \ - template _macro(float, double, double, int32); \ - template _macro(double, float, float, int32); \ - template _macro(double, float, double, int32); \ - template _macro(double, double, float, int32); \ - template _macro(double, double, double, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(float, float, float, int64); \ - template _macro(float, float, double, int64); \ - template _macro(float, double, float, int64); \ - template _macro(float, double, double, int64); \ - template _macro(double, float, float, int64); \ - template _macro(double, float, double, int64); \ - template _macro(double, double, float, int64); \ - template _macro(double, double, double, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64) - - /** * Instantiates a template for each value type conversion pair compiled by * Ginkgo. diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index db7fe3cbc0d..58ef6c3a72e 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" #include "omp/components/format_conversion.hpp" diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index 58df834bfba..25de03cf709 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "accessor/reduced_row_major.hpp" +#include "core/base/mixed_precision_types.hpp" namespace gko { From 9618ef30ba9b04551c2960621a2a2aa0e483d66c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 15 Apr 2021 16:42:53 +0200 Subject: [PATCH 09/14] fix mixed_precision_dispatch no-op casts --- include/ginkgo/core/base/precision_dispatch.hpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 17901e207c1..4c2624863e5 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -302,13 +302,24 @@ void mixed_precision_dispatch_real_complex(Function fn, const LinOp *in, if (complex_to_real) { mixed_precision_dispatch>( [&fn](auto dense_in, auto dense_out) { - using Dense = matrix::Dense; + using DenseIn = typename std::decay_t; + using DenseOut = typename std::decay_t; + using DenseDummy = matrix::Dense; + using RealIn = remove_complex; + using RealOut = remove_complex; + using CastIn = + std::conditional_t(), DenseDummy, + matrix::Dense>; + using CastOut = + std::conditional_t(), DenseDummy, + matrix::Dense>; // These dynamic_casts are only needed to make the code compile // If ValueType is complex, this branch will never be taken // If ValueType is real, the cast is a no-op - fn(dynamic_cast( + fn(dynamic_cast( dense_in->create_real_view().get()), - dynamic_cast(dense_out->create_real_view().get())); + dynamic_cast( + dense_out->create_real_view().get())); }, in, out); } else { From cb0b181f674bdd3f5186b0ad7597a8dc0471dd04 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 17 Apr 2021 19:08:10 +0200 Subject: [PATCH 10/14] review updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * formatting and documentation * improve CMake configuration output * remove unused variables * extract test dimensions from ell_kernels tests * use SFINAE instead of casting in precision_dispatch.hpp Co-authored-by: Thomas Grützmacher Co-authored-by: Pratik Nayak --- CMakeLists.txt | 2 +- benchmark/utils/formats.hpp | 5 +- cmake/get_info.cmake | 5 +- cuda/test/matrix/ell_kernels.cpp | 47 +++++++------- hip/test/matrix/ell_kernels.hip.cpp | 47 +++++++------- .../ginkgo/core/base/precision_dispatch.hpp | 63 +++++++++---------- omp/matrix/ell_kernels.cpp | 2 - reference/matrix/ell_kernels.cpp | 2 - 8 files changed, 90 insertions(+), 83 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f0d68b51a34..e999611aa57 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA}) option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP}) option(GINKGO_BUILD_DOC "Generate documentation" OFF) option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF) -option(GINKGO_MIXED_PRECISION "Add true mixed-precision kernel (otherwise they will be conversion-based)" OFF) +option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF) option(GINKGO_SKIP_DEPENDENCY_UPDATE "Do not update dependencies each time the project is rebuilt" ON) option(GINKGO_EXPORT_BUILD_DIR diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index ed20f91119e..8c1aabbcafb 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -91,9 +91,8 @@ std::string format_description = "csrm: Ginkgo's CSR implementation with merge_path strategy.\n" "ell: Ellpack format according to Bell and Garland: Efficient Sparse " "Matrix-Vector Multiplication on CUDA.\n" - "fell: float Ellpack format according to Bell and Garland: Efficient " - "Sparse " - "Matrix-Vector Multiplication on CUDA.\n" + "fell: Mixed Precision Ellpack format according to Bell and Garland: " + "Efficient Sparse Matrix-Vector Multiplication on CUDA.\n" "sellp: Sliced Ellpack uses a default block size of 32.\n" "hybrid: Hybrid uses ell and coo to represent the matrix.\n" "hybrid0, hybrid25, hybrid33, hybrid40, hybrid60, hybrid80: Hybrid uses " diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 508bb6e0191..bae1e864a02 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -127,7 +127,10 @@ foreach(log_type ${log_types}) ginkgo_print_module_footer(${${log_type}} "User configuration:") ginkgo_print_module_footer(${${log_type}} " Enabled modules:") ginkgo_print_foreach_variable(${${log_type}} - "GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP;GINKGO_MIXED_PRECISION") + "GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP") + ginkgo_print_module_footer(${${log_type}} " Enabled features:") + ginkgo_print_foreach_variable(${${log_type}} + "GINKGO_MIXED_PRECISION") ginkgo_print_module_footer(${${log_type}} " Tests, benchmarks and examples:") ginkgo_print_foreach_variable(${${log_type}} "GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING") diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index 0df427f139e..2df1c397f4c 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -61,7 +61,9 @@ class Ell : public ::testing::Test { using Vec2 = gko::matrix::Dense; using ComplexVec = gko::matrix::Dense>; - Ell() : rand_engine(42) {} + Ell() + : rand_engine(42), size{532, 231}, num_els_rowwise{300}, ell_stride{600} + {} void SetUp() { @@ -126,6 +128,9 @@ class Ell : public ::testing::Test { std::shared_ptr cuda; std::ranlux48 rand_engine; + gko::dim<2> size; + gko::size_type num_els_rowwise; + gko::size_type ell_stride; std::unique_ptr mtx; std::unique_ptr expected; @@ -239,7 +244,7 @@ TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3) TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(y.get(), expected.get()); dmtx->apply(dy.get(), dresult.get()); @@ -250,7 +255,7 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(y2.get(), expected2.get()); dmtx->apply(dy2.get(), dresult2.get()); @@ -261,7 +266,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1) TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(y2.get(), expected.get()); dmtx->apply(dy2.get(), dresult.get()); @@ -272,7 +277,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2) TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(y.get(), expected2.get()); dmtx->apply(dy.get(), dresult2.get()); @@ -283,7 +288,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3) TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); @@ -293,7 +298,7 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); @@ -304,7 +309,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1) TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); @@ -315,7 +320,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2) TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); @@ -326,7 +331,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3) TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(y.get(), expected.get()); dmtx->apply(dy.get(), dresult.get()); @@ -337,7 +342,7 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(y2.get(), expected2.get()); dmtx->apply(dy2.get(), dresult2.get()); @@ -348,7 +353,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1) TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(y2.get(), expected.get()); dmtx->apply(dy2.get(), dresult.get()); @@ -359,7 +364,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2) TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(y.get(), expected2.get()); dmtx->apply(dy.get(), dresult2.get()); @@ -370,7 +375,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3) TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); @@ -381,7 +386,7 @@ TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); @@ -392,7 +397,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1) TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); @@ -403,7 +408,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2) TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); @@ -503,10 +508,10 @@ TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef) TEST_F(Ell, ApplyToComplexIsEquivalentToRef) { set_up_apply_data(); - auto complex_b = gen_mtx(231, 3); + auto complex_b = gen_mtx(size[1], 3); auto dcomplex_b = ComplexVec::create(cuda); dcomplex_b->copy_from(complex_b.get()); - auto complex_x = gen_mtx(532, 3); + auto complex_x = gen_mtx(size[0], 3); auto dcomplex_x = ComplexVec::create(cuda); dcomplex_x->copy_from(complex_x.get()); @@ -520,10 +525,10 @@ TEST_F(Ell, ApplyToComplexIsEquivalentToRef) TEST_F(Ell, AdvancedApplyToComplexIsEquivalentToRef) { set_up_apply_data(); - auto complex_b = gen_mtx(231, 3); + auto complex_b = gen_mtx(size[1], 3); auto dcomplex_b = ComplexVec::create(cuda); dcomplex_b->copy_from(complex_b.get()); - auto complex_x = gen_mtx(532, 3); + auto complex_x = gen_mtx(size[0], 3); auto dcomplex_x = ComplexVec::create(cuda); dcomplex_x->copy_from(complex_x.get()); diff --git a/hip/test/matrix/ell_kernels.hip.cpp b/hip/test/matrix/ell_kernels.hip.cpp index 9d1e1440544..13c81fdafcf 100644 --- a/hip/test/matrix/ell_kernels.hip.cpp +++ b/hip/test/matrix/ell_kernels.hip.cpp @@ -61,7 +61,9 @@ class Ell : public ::testing::Test { using Vec2 = gko::matrix::Dense; using ComplexVec = gko::matrix::Dense>; - Ell() : rand_engine(42) {} + Ell() + : rand_engine(42), size{532, 231}, num_els_rowwise{300}, ell_stride{600} + {} void SetUp() { @@ -127,6 +129,9 @@ class Ell : public ::testing::Test { std::shared_ptr hip; std::ranlux48 rand_engine; + gko::dim<2> size; + gko::size_type num_els_rowwise; + gko::size_type ell_stride; std::unique_ptr mtx; std::unique_ptr expected; @@ -240,7 +245,7 @@ TEST_F(Ell, MixedAdvancedApplyIsEquivalentToRef3) TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(y.get(), expected.get()); dmtx->apply(dy.get(), dresult.get()); @@ -251,7 +256,7 @@ TEST_F(Ell, SimpleApplyWithStrideIsEquivalentToRef) TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(y2.get(), expected2.get()); dmtx->apply(dy2.get(), dresult2.get()); @@ -262,7 +267,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef1) TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(y2.get(), expected.get()); dmtx->apply(dy2.get(), dresult.get()); @@ -273,7 +278,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef2) TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(y.get(), expected2.get()); dmtx->apply(dy.get(), dresult2.get()); @@ -284,7 +289,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideIsEquivalentToRef3) TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); @@ -294,7 +299,7 @@ TEST_F(Ell, AdvancedApplyWithStrideIsEquivalentToRef) TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); @@ -305,7 +310,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef1) TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); @@ -316,7 +321,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef2) TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3) { - set_up_apply_data(532, 231, 1, 300, 600); + set_up_apply_data(size[0], size[1], 1, num_els_rowwise, ell_stride); mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); @@ -327,7 +332,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideIsEquivalentToRef3) TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(y.get(), expected.get()); dmtx->apply(dy.get(), dresult.get()); @@ -338,7 +343,7 @@ TEST_F(Ell, SimpleApplyWithStrideToDenseMatrixIsEquivalentToRef) TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(y2.get(), expected2.get()); dmtx->apply(dy2.get(), dresult2.get()); @@ -349,7 +354,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef1) TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(y2.get(), expected.get()); dmtx->apply(dy2.get(), dresult.get()); @@ -360,7 +365,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef2) TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(y.get(), expected2.get()); dmtx->apply(dy.get(), dresult2.get()); @@ -371,7 +376,7 @@ TEST_F(Ell, MixedSimpleApplyWithStrideToDenseMatrixIsEquivalentToRef3) TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(alpha.get(), y.get(), beta.get(), expected.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); @@ -382,7 +387,7 @@ TEST_F(Ell, AdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef) TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(alpha2.get(), y2.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta2.get(), dresult2.get()); @@ -393,7 +398,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef1) TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(alpha2.get(), y2.get(), beta.get(), expected.get()); dmtx->apply(dalpha2.get(), dy2.get(), dbeta.get(), dresult.get()); @@ -404,7 +409,7 @@ TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef2) TEST_F(Ell, MixedAdvancedApplyWithStrideToDenseMatrixIsEquivalentToRef3) { - set_up_apply_data(532, 231, 3, 300, 600); + set_up_apply_data(size[0], size[1], 3, num_els_rowwise, ell_stride); mtx->apply(alpha.get(), y.get(), beta2.get(), expected2.get()); dmtx->apply(dalpha.get(), dy.get(), dbeta2.get(), dresult2.get()); @@ -504,10 +509,10 @@ TEST_F(Ell, AdvancedApplyOnSmallMatrixIsEquivalentToRef) TEST_F(Ell, ApplyToComplexIsEquivalentToRef) { set_up_apply_data(); - auto complex_b = gen_mtx(231, 3); + auto complex_b = gen_mtx(size[1], 3); auto dcomplex_b = ComplexVec::create(hip); dcomplex_b->copy_from(complex_b.get()); - auto complex_x = gen_mtx(532, 3); + auto complex_x = gen_mtx(size[0], 3); auto dcomplex_x = ComplexVec::create(hip); dcomplex_x->copy_from(complex_x.get()); @@ -521,10 +526,10 @@ TEST_F(Ell, ApplyToComplexIsEquivalentToRef) TEST_F(Ell, AdvancedApplyToComplexIsEquivalentToRef) { set_up_apply_data(); - auto complex_b = gen_mtx(231, 3); + auto complex_b = gen_mtx(size[1], 3); auto dcomplex_b = ComplexVec::create(hip); dcomplex_b->copy_from(complex_b.get()); - auto complex_x = gen_mtx(532, 3); + auto complex_x = gen_mtx(size[0], 3); auto dcomplex_x = ComplexVec::create(hip); dcomplex_x->copy_from(complex_x.get()); diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 4c2624863e5..376f126f5c5 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -231,7 +231,7 @@ void precision_dispatch_real_complex(Function fn, const LinOp *alpha, * converted into matrix::Dense as parameters. * * If GINKGO_MIXED_PRECISION is defined, this means that the function will be - * called with its dynamic type with its static type, so the (templated/generic) + * called with its dynamic type as a static type, so the (templated/generic) * function will be instantiated with all pairs of Dense and * Dense> parameter types, and the appropriate * overload will be called based on the dynamic type of the parameter. @@ -239,16 +239,21 @@ void precision_dispatch_real_complex(Function fn, const LinOp *alpha, * If GINKGO_MIXED_PRECISION is not defined, it will behave exactly like * precision_dispatch. * + * @param fn the given function. It will be called with one const and one + * non-const matrix::Dense<...> parameter based on the dynamic type + * of the inputs (GINKGO_MIXED_PRECISION) or of type + * matrix::Dense (no GINKGO_MIXED_PRECISION). + * @param in The first parameter to be cast (GINKGO_MIXED_PRECISION) or + * converted (no GINKGO_MIXED_PRECISION) and used to call `fn`. + * @param out The second parameter to be cast (GINKGO_MIXED_PRECISION) or + * converted (no GINKGO_MIXED_PRECISION) and used to call `fn`. * - * @param fn the given function. It will be passed one (potentially const) - * matrix::Dense<...>* parameter per parameter in the parameter - * pack `linops`. - * @param linops the given arguments to be converted and passed on to fn. - * - * @tparam ValueType the value type to use for the parameters of `fn`. + * @tparam ValueType the value type to use for the parameters of `fn` (no + * GINKGO_MIXED_PRECISION). With GINKGO_MIXED_PRECISION + * enabled, it only matters whether this type is complex or + * real. * @tparam Function the function pointer, lambda or other functor type to call * with the converted arguments. - * @tparam Args the argument type list. */ template void mixed_precision_dispatch(Function fn, const LinOp *in, LinOp *out) @@ -290,36 +295,30 @@ void mixed_precision_dispatch(Function fn, const LinOp *in, LinOp *out) * * @see mixed_precision_dispatch() */ -template +template ()> * = nullptr> void mixed_precision_dispatch_real_complex(Function fn, const LinOp *in, LinOp *out) { #ifdef GINKGO_MIXED_PRECISION - // do we need to convert complex Dense to real Dense? - auto complex_to_real = - !(is_complex() || - dynamic_cast> *>(in)); - if (complex_to_real) { + mixed_precision_dispatch(fn, in, out); +#else + precision_dispatch(fn, in, out); +#endif +} + + +template ()> * = nullptr> +void mixed_precision_dispatch_real_complex(Function fn, const LinOp *in, + LinOp *out) +{ +#ifdef GINKGO_MIXED_PRECISION + if (!dynamic_cast> *>(in)) { mixed_precision_dispatch>( [&fn](auto dense_in, auto dense_out) { - using DenseIn = typename std::decay_t; - using DenseOut = typename std::decay_t; - using DenseDummy = matrix::Dense; - using RealIn = remove_complex; - using RealOut = remove_complex; - using CastIn = - std::conditional_t(), DenseDummy, - matrix::Dense>; - using CastOut = - std::conditional_t(), DenseDummy, - matrix::Dense>; - // These dynamic_casts are only needed to make the code compile - // If ValueType is complex, this branch will never be taken - // If ValueType is real, the cast is a no-op - fn(dynamic_cast( - dense_in->create_real_view().get()), - dynamic_cast( - dense_out->create_real_view().get())); + fn(dense_in->create_real_view().get(), + dense_out->create_real_view().get()); }, in, out); } else { diff --git a/omp/matrix/ell_kernels.cpp b/omp/matrix/ell_kernels.cpp index 58ef6c3a72e..96f4d5347d2 100644 --- a/omp/matrix/ell_kernels.cpp +++ b/omp/matrix/ell_kernels.cpp @@ -73,7 +73,6 @@ void spmv(std::shared_ptr exec, const auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); const auto stride = a->get_stride(); - const auto num_rows = b->get_size()[0]; const auto a_vals = gko::acc::range( std::array{num_stored_elements_per_row * stride}, a->get_const_values()); @@ -117,7 +116,6 @@ void advanced_spmv(std::shared_ptr exec, const auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); const auto stride = a->get_stride(); - const auto num_rows = b->get_size()[0]; const auto a_vals = gko::acc::range( std::array{num_stored_elements_per_row * stride}, a->get_const_values()); diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index 25de03cf709..24f166e52db 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -69,7 +69,6 @@ void spmv(std::shared_ptr exec, const auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); const auto stride = a->get_stride(); - const auto num_rows = b->get_size()[0]; const auto a_vals = gko::acc::range( std::array{num_stored_elements_per_row * stride}, a->get_const_values()); @@ -112,7 +111,6 @@ void advanced_spmv(std::shared_ptr exec, const auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); const auto stride = a->get_stride(); - const auto num_rows = b->get_size()[0]; const auto a_vals = gko::acc::range( std::array{num_stored_elements_per_row * stride}, a->get_const_values()); From e67a8b69381ba891d6a871fb7e1b31845158e397 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 19 Apr 2021 15:43:36 +0200 Subject: [PATCH 11/14] rename mixed precision ELL benchmark format to ell-mixed Co-authored-by: Yuhsiang M. Tsai <19565938+yhmtsai@users.noreply.github.com> --- benchmark/utils/formats.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index 8c1aabbcafb..64de4e10f90 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -59,7 +59,7 @@ namespace formats { std::string available_format = - "coo, csr, ell, fell, sellp, hybrid, hybrid0, hybrid25, hybrid33, " + "coo, csr, ell, ell-mixed, sellp, hybrid, hybrid0, hybrid25, hybrid33, " "hybrid40, " "hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, " "hybridminstorage" @@ -91,7 +91,7 @@ std::string format_description = "csrm: Ginkgo's CSR implementation with merge_path strategy.\n" "ell: Ellpack format according to Bell and Garland: Efficient Sparse " "Matrix-Vector Multiplication on CUDA.\n" - "fell: Mixed Precision Ellpack format according to Bell and Garland: " + "ell-mixed: Mixed Precision Ellpack format according to Bell and Garland: " "Efficient Sparse Matrix-Vector Multiplication on CUDA.\n" "sellp: Sliced Ellpack uses a default block size of 32.\n" "hybrid: Hybrid uses ell and coo to represent the matrix.\n" @@ -207,7 +207,7 @@ const std::map( {"csrc", READ_MATRIX(csr, std::make_shared())}, {"coo", read_matrix_from_data>}, {"ell", read_matrix_from_data>}, - {"ell_mixed", + {"ell-mixed", [](std::shared_ptr exec, const gko::matrix_data &data) { gko::matrix_data> conv_data; From 45f9923333aabba7188e387ee251cd44965353cf Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 19 Apr 2021 15:47:43 +0200 Subject: [PATCH 12/14] add GINKGO_MIXED_PRECISION to Windows CI --- .github/workflows/windows-msvc-cuda.yml | 6 +++--- .github/workflows/windows-msvc-ref.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/windows-msvc-cuda.yml b/.github/workflows/windows-msvc-cuda.yml index bc4557e96bf..28468950af8 100644 --- a/.github/workflows/windows-msvc-cuda.yml +++ b/.github/workflows/windows-msvc-cuda.yml @@ -15,8 +15,8 @@ jobs: fail-fast: false matrix: config: - - {version: "10.2.89.20191206", name: "cuda102/release/shared"} - - {version: "latest", name: "cuda-latest/release/shared"} + - {version: "10.2.89.20191206", name: "cuda102/release/shared", "mixed": "ON"} + - {version: "latest", name: "cuda-latest/release/shared", "mixed": "OFF"} name: msvc/${{ matrix.config.name }} (only compile) runs-on: [windows-latest] @@ -46,5 +46,5 @@ jobs: $env:PATH="$env:PATH;$pwd\build\windows_shared_library" mkdir build cd build - cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_CUDA_ARCHITECTURES=60 .. + cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_CUDA_ARCHITECTURES=60 .. cmake --build . -j4 --config Release diff --git a/.github/workflows/windows-msvc-ref.yml b/.github/workflows/windows-msvc-ref.yml index 13a29f6cc51..833f3692676 100644 --- a/.github/workflows/windows-msvc-ref.yml +++ b/.github/workflows/windows-msvc-ref.yml @@ -15,8 +15,8 @@ jobs: fail-fast: false matrix: config: - - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"} - - {shared: "OFF", build_type: "Release", name: "reference/release/static"} + - {shared: "ON", build_type: "Debug", name: "reference/debug/shared", "mixed": "ON"} + - {shared: "OFF", build_type: "Release", name: "reference/release/static", "mixed": "OFF"} # Debug static needs too much storage # - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"} name: msvc/${{ matrix.config.name }} @@ -35,7 +35,7 @@ jobs: $env:PATH="$env:PATH;$pwd\build\windows_shared_library" mkdir build cd build - cmake -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF .. + cmake -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} .. cmake --build . -j4 --config ${{ matrix.config.build_type }} ctest . -C ${{ matrix.config.build_type }} --output-on-failure From 535b5778bee6f9b6e8e7fb1fed0bc2771664bdc4 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 19 Apr 2021 15:50:03 +0200 Subject: [PATCH 13/14] add GINKGO_MIXED_PRECISION to OSX CI --- .github/workflows/osx.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 6827c5fe476..79caf2169a4 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -15,8 +15,8 @@ jobs: fail-fast: false matrix: config: - - {shared: "ON", build_type: "Debug", name: "omp/debug/shared"} - - {shared: "OFF", build_type: "Release", name: "omp/release/static"} + - {shared: "ON", build_type: "Debug", name: "omp/debug/shared", "mixed": "OFF"} + - {shared: "OFF", build_type: "Release", name: "omp/release/static", "mixed": "ON"} name: ${{ matrix.config.name }} runs-on: [macos-latest] @@ -40,7 +40,7 @@ jobs: run: | mkdir build cd build - cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} + cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} make -j8 ctest -j10 --output-on-failure From 31fe679f83e0ca9e65010b2f561288f02264b566 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 4 May 2021 11:51:11 +0200 Subject: [PATCH 14/14] simplify mixed-precision instantiation macros --- core/base/mixed_precision_types.hpp | 99 +++++++++++------------------ 1 file changed, 37 insertions(+), 62 deletions(-) diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index 079fcb8a075..d725eff82bf 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -39,70 +39,45 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef GINKGO_MIXED_PRECISION -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(float, float, float, int32); \ - template _macro(float, float, double, int32); \ - template _macro(float, double, float, int32); \ - template _macro(float, double, double, int32); \ - template _macro(double, float, float, int32); \ - template _macro(double, float, double, int32); \ - template _macro(double, double, float, int32); \ - template _macro(double, double, double, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(float, float, float, int64); \ - template _macro(float, float, double, int64); \ - template _macro(float, double, float, int64); \ - template _macro(float, double, double, int64); \ - template _macro(double, float, float, int64); \ - template _macro(double, float, double, int64); \ - template _macro(double, double, float, int64); \ - template _macro(double, double, double, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ + template _macro(float, float, float, __VA_ARGS__); \ + template _macro(float, float, double, __VA_ARGS__); \ + template _macro(float, double, float, __VA_ARGS__); \ + template _macro(float, double, double, __VA_ARGS__); \ + template _macro(double, float, float, __VA_ARGS__); \ + template _macro(double, float, double, __VA_ARGS__); \ + template _macro(double, double, float, __VA_ARGS__); \ + template _macro(double, double, double, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__) #else -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(float, float, float, int32); \ - template _macro(double, double, double, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(std::complex, std::complex, \ - std::complex, int32); \ - template _macro(float, float, float, int64); \ - template _macro(double, double, double, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64); \ - template _macro(std::complex, std::complex, \ - std::complex, int64) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ + template _macro(float, float, float, __VA_ARGS__); \ + template _macro(double, double, double, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__) #endif +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int32); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int64) + + #endif // GKO_CORE_BASE_MIXED_PRECISION_TYPES_HPP_