From 49e600f6cf978738c2e4da32d06a861e5d68c87e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 8 May 2021 23:12:31 +0200 Subject: [PATCH] parallelize COO OpenMP SpMV --- omp/components/atomic.hpp | 73 ++++++++++++++++++++++++++ omp/matrix/coo_kernels.cpp | 102 +++++++++++++++++++++++++++++-------- 2 files changed, 155 insertions(+), 20 deletions(-) create mode 100644 omp/components/atomic.hpp diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp new file mode 100644 index 00000000000..07dd0972ed6 --- /dev/null +++ b/omp/components/atomic.hpp @@ -0,0 +1,73 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_OMP_COMPONENTS_ATOMIC_HPP_
+#define GKO_OMP_COMPONENTS_ATOMIC_HPP_
+
+
+#include <type_traits>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+
+
+template <typename ValueType,
+          std::enable_if_t<!is_complex<ValueType>()> * = nullptr>
+void atomic_add(ValueType &out, ValueType val)
+{
+#pragma omp atomic
+    out += val;
+}
+
+template <typename ValueType,
+          std::enable_if_t<is_complex<ValueType>()> * = nullptr>
+void atomic_add(ValueType &out, ValueType val)
+{
+    auto values = reinterpret_cast<remove_complex<ValueType> *>(&out);
+#pragma omp atomic
+    values[0] += real(val);
+#pragma omp atomic
+    values[1] += imag(val);
+}
+
+
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_OMP_COMPONENTS_ATOMIC_HPP_
diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp
index 676051af076..18855024163 100644
--- a/omp/matrix/coo_kernels.cpp
+++ b/omp/matrix/coo_kernels.cpp
@@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "omp/components/atomic.hpp"
 #include "omp/components/format_conversion.hpp"
 
 
@@ -103,15 +104,45 @@ void spmv2(std::shared_ptr<const OmpExecutor> exec,
            const matrix::Coo<ValueType, IndexType> *a,
            const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
 {
-    auto coo_val = a->get_const_values();
-    auto coo_col = a->get_const_col_idxs();
-    auto coo_row = a->get_const_row_idxs();
-    auto num_cols = b->get_size()[1];
-
-#pragma omp parallel for
-    for (size_type j = 0; j < num_cols; j++) {
-        for (size_type i = 0; i < a->get_num_stored_elements(); i++) {
-            c->at(coo_row[i], j) += coo_val[i] * b->at(coo_col[i], j);
+    const auto coo_val = a->get_const_values();
+    const auto coo_col = a->get_const_col_idxs();
+    const auto coo_row = a->get_const_row_idxs();
+    const auto num_rhs = b->get_size()[1];
+    const auto sentinel_row = a->get_size()[0] + 1;
+    const auto nnz = a->get_num_stored_elements();
+    const auto num_threads = omp_get_max_threads();
+    const auto work_per_thread = ceildiv(nnz, num_threads);
+
+#pragma omp parallel num_threads(num_threads)
+    {
+        const auto thread_id = static_cast<size_type>(omp_get_thread_num());
+        const auto begin = work_per_thread * thread_id;
+        const auto end = std::min(begin + work_per_thread, nnz);
+        if (begin < end) {
+            const auto first = begin > 0 ? coo_row[begin - 1] : sentinel_row;
+            const auto last = end < nnz ? coo_row[end] : sentinel_row;
+            auto nz = begin;
+            for (; nz < end && coo_row[nz] == first; nz++) {
+                const auto row = first;
+                const auto col = coo_col[nz];
+                for (size_type rhs = 0; rhs < num_rhs; rhs++) {
+                    atomic_add(c->at(row, rhs), coo_val[nz] * b->at(col, rhs));
+                }
+            }
+            for (; nz < end && coo_row[nz] != last; nz++) {
+                const auto row = coo_row[nz];
+                const auto col = coo_col[nz];
+                for (size_type rhs = 0; rhs < num_rhs; rhs++) {
+                    c->at(row, rhs) += coo_val[nz] * b->at(col, rhs);
+                }
+            }
+            for (; nz < end; nz++) {
+                const auto row = last;
+                const auto col = coo_col[nz];
+                for (size_type rhs = 0; rhs < num_rhs; rhs++) {
+                    atomic_add(c->at(row, rhs), coo_val[nz] * b->at(col, rhs));
+                }
+            }
+        }
+                }
+            }
         }
     }
 }
@@ -126,17 +157,48 @@ void advanced_spmv2(std::shared_ptr<const OmpExecutor> exec,
                     const matrix::Dense<ValueType> *b,
                     matrix::Dense<ValueType> *c)
 {
-    auto coo_val = a->get_const_values();
-    auto coo_col = a->get_const_col_idxs();
-    auto coo_row = a->get_const_row_idxs();
-    auto alpha_val = alpha->at(0, 0);
-    auto num_cols = b->get_size()[1];
-
-#pragma omp parallel for
-    for (size_type j = 0; j < num_cols; j++) {
-        for (size_type i = 0; i < a->get_num_stored_elements(); i++) {
-            c->at(coo_row[i], j) +=
-                alpha_val * coo_val[i] * b->at(coo_col[i], j);
+    const auto coo_val = a->get_const_values();
+    const auto coo_col = a->get_const_col_idxs();
+    const auto coo_row = a->get_const_row_idxs();
+    const auto num_rhs = b->get_size()[1];
+    const auto sentinel_row = a->get_size()[0] + 1;
+    const auto nnz = a->get_num_stored_elements();
+    const auto num_threads = omp_get_max_threads();
+    const auto work_per_thread = ceildiv(nnz, num_threads);
+    const auto scale = alpha->at(0, 0);
+
+#pragma omp parallel num_threads(num_threads)
+    {
+        const auto thread_id = static_cast<size_type>(omp_get_thread_num());
+        const auto begin = work_per_thread * thread_id;
+        const auto end = std::min(begin + work_per_thread, nnz);
+        if (begin < end) {
+            const auto first = begin > 0 ? coo_row[begin - 1] : sentinel_row;
+            const auto last = end < nnz ? coo_row[end] : sentinel_row;
+            auto nz = begin;
+            for (; nz < end && coo_row[nz] == first; nz++) {
+                const auto row = first;
+                const auto col = coo_col[nz];
+                for (size_type rhs = 0; rhs < num_rhs; rhs++) {
+                    atomic_add(c->at(row, rhs),
+                               scale * coo_val[nz] * b->at(col, rhs));
+                }
+            }
+            for (; nz < end && coo_row[nz] != last; nz++) {
+                const auto row = coo_row[nz];
+                const auto col = coo_col[nz];
+                for (size_type rhs = 0; rhs < num_rhs; rhs++) {
+                    c->at(row, rhs) += scale * coo_val[nz] * b->at(col, rhs);
+                }
+            }
+            for (; nz < end; nz++) {
+                const auto row = last;
+                const auto col = coo_col[nz];
+                for (size_type rhs = 0; rhs < num_rhs; rhs++) {
+                    atomic_add(c->at(row, rhs),
+                               scale * coo_val[nz] * b->at(col, rhs));
+                }
+            }
+        }
+                }
+            }
         }
     }
 }