From 49e600f6cf978738c2e4da32d06a861e5d68c87e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 8 May 2021 23:12:31 +0200 Subject: [PATCH] parallelize COO OpenMP SpMV --- omp/components/atomic.hpp | 73 ++++++++++++++++++++++++++ omp/matrix/coo_kernels.cpp | 102 +++++++++++++++++++++++++++++-------- 2 files changed, 155 insertions(+), 20 deletions(-) create mode 100644 omp/components/atomic.hpp diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp new file mode 100644 index 00000000000..07dd0972ed6 --- /dev/null +++ b/omp/components/atomic.hpp @@ -0,0 +1,73 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_OMP_COMPONENTS_ATOMIC_HPP_
+#define GKO_OMP_COMPONENTS_ATOMIC_HPP_
+
+
+#include <type_traits>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+
+
+template <typename ValueType,
+          std::enable_if_t<!is_complex<ValueType>()> * = nullptr>
+void atomic_add(ValueType &out, ValueType val)
+{
+#pragma omp atomic
+    out += val;
+}
+
+template <typename ValueType,
+          std::enable_if_t<is_complex<ValueType>()> * = nullptr>
+void atomic_add(ValueType &out, ValueType val)
+{
+    auto values = reinterpret_cast<remove_complex<ValueType> *>(&out);
+#pragma omp atomic
+    values[0] += real(val);
+#pragma omp atomic
+    values[1] += imag(val);
+}
+
+
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_OMP_COMPONENTS_ATOMIC_HPP_
diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp
index 676051af076..18855024163 100644
--- a/omp/matrix/coo_kernels.cpp
+++ b/omp/matrix/coo_kernels.cpp
@@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "omp/components/atomic.hpp"
 #include "omp/components/format_conversion.hpp"
 
 
@@ -103,15 +104,45 @@ void spmv2(std::shared_ptr<const OmpExecutor> exec,
            const matrix::Coo<ValueType, IndexType> *a,
            const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
 {
-    auto coo_val = a->get_const_values();
-    auto coo_col = a->get_const_col_idxs();
-    auto coo_row = a->get_const_row_idxs();
-    auto num_cols = b->get_size()[1];
-
-#pragma omp parallel for
-    for (size_type j = 0; j < num_cols; j++) {
-        for (size_type i = 0; i < a->get_num_stored_elements(); i++) {
-            c->at(coo_row[i], j) += coo_val[i] * b->at(coo_col[i], j);
+    const auto coo_val = a->get_const_values();
+    const auto coo_col = a->get_const_col_idxs();
+    const auto coo_row = a->get_const_row_idxs();
+    const auto num_rhs = b->get_size()[1];
+    const auto sentinel_row = a->get_size()[0] + 1;
+    const auto nnz = a->get_num_stored_elements();
+    const auto num_threads = omp_get_max_threads();
+    const auto work_per_thread = ceildiv(nnz, num_threads);
+
+#pragma omp parallel num_threads(num_threads)
+    {
+        const auto thread_id = static_cast<size_type>(omp_get_thread_num());
+        const auto begin = work_per_thread * thread_id;
+        const auto end = std::min(begin + work_per_thread, nnz);
+        if (begin < end) {
+            const auto first = begin > 0 ? coo_row[begin - 1] : sentinel_row;
+            const auto last = end < nnz ? coo_row[end] : sentinel_row;
+            auto nz = begin;
+            for (; nz < end && coo_row[nz] == first; nz++) {
+                const auto row = first;
+                const auto col = coo_col[nz];
+                for (size_type rhs = 0; rhs < num_rhs; rhs++) {
+                    atomic_add(c->at(row, rhs), coo_val[nz] * b->at(col, rhs));
+                }
+            }
+            for (; nz < end && coo_row[nz] != last; nz++) {
+                const auto row = coo_row[nz];
+                const auto col = coo_col[nz];
+                for (size_type rhs = 0; rhs < num_rhs; rhs++) {
+                    c->at(row, rhs) += coo_val[nz] * b->at(col, rhs);
+                }
+            }
+            for (; nz < end; nz++) {
+                const auto row = last;
+                const auto col = coo_col[nz];
+                for (size_type rhs = 0; rhs < num_rhs; rhs++) {
+                    atomic_add(c->at(row, rhs), coo_val[nz] * b->at(col, rhs));
+                }
+            }
+        }
+                }
+            }
         }
     }
 }
@@ -126,17 +157,48 @@ void advanced_spmv2(std::shared_ptr<const OmpExecutor> exec,
                     const matrix::Dense<ValueType> *b,
                     matrix::Dense<ValueType> *c)
 {
-    auto coo_val = a->get_const_values();
-    auto coo_col = a->get_const_col_idxs();
-    auto coo_row = a->get_const_row_idxs();
-    auto alpha_val = alpha->at(0, 0);
-    auto num_cols = b->get_size()[1];
-
-#pragma omp parallel for
-    for (size_type j = 0; j < num_cols; j++) {
-        for (size_type i = 0; i < a->get_num_stored_elements(); i++) {
-            c->at(coo_row[i], j) +=
-                alpha_val * coo_val[i] * b->at(coo_col[i], j);
+    const auto coo_val = a->get_const_values();
+    const auto coo_col = a->get_const_col_idxs();
+    const auto coo_row = a->get_const_row_idxs();
+    const auto num_rhs = b->get_size()[1];
+    const auto sentinel_row = a->get_size()[0] + 1;
+    const auto nnz = a->get_num_stored_elements();
+    const auto num_threads = omp_get_max_threads();
+    const auto work_per_thread = ceildiv(nnz, num_threads);
+    const auto scale = alpha->at(0, 0);
+
+#pragma omp parallel num_threads(num_threads)
+    {
+        const auto thread_id = static_cast<size_type>(omp_get_thread_num());
+        const auto begin = work_per_thread * thread_id;
+        const auto end = std::min(begin + work_per_thread, nnz);
+        if (begin < end) {
+            const auto first = begin > 0 ? coo_row[begin - 1] : sentinel_row;
+            const auto last = end < nnz ? coo_row[end] : sentinel_row;
+            auto nz = begin;
+            for (; nz < end && coo_row[nz] == first; nz++) {
+                const auto row = first;
+                const auto col = coo_col[nz];
+                for (size_type rhs = 0; rhs < num_rhs; rhs++) {
+                    atomic_add(c->at(row, rhs),
+                               scale * coo_val[nz] * b->at(col, rhs));
+                }
+            }
+            for (; nz < end && coo_row[nz] != last; nz++) {
+                const auto row = coo_row[nz];
+                const auto col = coo_col[nz];
+                for (size_type rhs = 0; rhs < num_rhs; rhs++) {
+                    c->at(row, rhs) += scale * coo_val[nz] * b->at(col, rhs);
+                }
+            }
+            for (; nz < end; nz++) {
+                const auto row = last;
+                const auto col = coo_col[nz];
+                for (size_type rhs = 0; rhs < num_rhs; rhs++) {
+                    atomic_add(c->at(row, rhs),
+                               scale * coo_val[nz] * b->at(col, rhs));
+                }
+            }
+        }
+                }
+            }
         }
     }
 }