ginkgo-project · upsj · Jul 6, 2021 · Jul 2, 2021 · Jul 3, 2021 · Jul 4, 2021
diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp
@@ -88,9 +88,9 @@ void convert_matrix(const gko::LinOp *matrix_from, const char *format_to,
             matrix_to->copy_from(matrix_from);
         }
         add_or_set_member(conversion_case[conversion_name], "time",
-                          timer->compute_average_time(), allocator);
+                          ic.compute_average_time(), allocator);
         add_or_set_member(conversion_case[conversion_name], "repetitions",
-                          timer->get_num_repetitions(), allocator);
+                          ic.get_num_repetitions(), allocator);
 
         // compute and write benchmark data
         add_or_set_member(conversion_case[conversion_name], "completed", true,
@@ -156,8 +156,7 @@ int main(int argc, char *argv[])
             try {
                 auto matrix_from =
                     share(formats::matrix_factory.at(format_from)(exec, data));
-                for (const auto &format : formats::matrix_factory) {
-                    const auto format_to = std::get<0>(format);
+                for (const auto &format_to : formats) {
                     if (format_from == format_to) {
                         continue;
                     }

diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh
@@ -16,6 +16,11 @@ if [ ! "${EXECUTOR}" ]; then
     echo "EXECUTOR    environment variable not set - assuming \"${EXECUTOR}\"" 1>&2
 fi
 
+if [ ! "${REPETITIONS}" ]; then
+    REPETITIONS=10
+    echo "REPETITIONS    environment variable not set - assuming ${REPETITIONS}" 1>&2
+fi
+
 if [ ! "${SEGMENTS}" ]; then
     echo "SEGMENTS    environment variable not set - running entire suite" 1>&2
     SEGMENTS=1
@@ -35,6 +40,11 @@ if [ ! "${FORMATS}" ]; then
     FORMATS="csr,coo,ell,hybrid,sellp"
 fi
 
+if [ ! "${ELL_IMBALANCE_LIMIT}" ]; then
+    echo "ELL_IMBALANCE_LIMIT    environment variable not set - assuming 100" 1>&2
+    ELL_IMBALANCE_LIMIT=100
+fi
+
 if [ ! "${SOLVERS}" ]; then
     SOLVERS="bicgstab,cg,cgs,fcg,gmres,cb_gmres_reduce1,idr"
     echo "SOLVERS    environment variable not set - assuming \"${SOLVERS}\"" 1>&2
@@ -67,7 +77,7 @@ fi
 
 if [ ! "${SOLVERS_JACOBI_MAX_BS}" ]; then
     SOLVERS_JACOBI_MAX_BS="32"
-    "SOLVERS_JACOBI_MAX_BS environment variable not set - assuming \"${SOLVERS_JACOBI_MAX_BS}\"" 1>&2
+    echo "SOLVERS_JACOBI_MAX_BS environment variable not set - assuming \"${SOLVERS_JACOBI_MAX_BS}\"" 1>&2
 fi
 
 if [ ! "${BENCHMARK_PRECISION}" ]; then
@@ -191,6 +201,17 @@ compute_matrix_statistics() {
 }
 
 
+remove_ell_worstcase() {
+    local IMBALANCE=$(jq '.[0].problem.row_distribution | .max / (.mean + 1) | floor' $1)
+    # if the imbalance is too large, remove ELL formats from the list.
+    if [[ "${IMBALANCE}" -gt "${ELL_IMBALANCE_LIMIT}" ]]; then
+        echo -n $FORMATS | tr ',' '\n' | grep -vE '^ell' | tr '\n' ',' | sed 's/,$//'
+    else
+        echo -n $FORMATS
+    fi
+}
+
+
 # Runs the conversion benchmarks for all matrix formats by using file $1 as the
 # input, and updating it with the results. Backups are created after each
 # benchmark run, to prevent data loss in case of a crash. Once the benchmarking
@@ -199,9 +220,11 @@ compute_matrix_statistics() {
 run_conversion_benchmarks() {
     [ "${DRY_RUN}" == "true" ] && return
     cp "$1" "$1.imd" # make sure we're not loosing the original input
+    local LOCAL_FORMATS=$(remove_ell_worstcase "$1")
     ./conversions/conversions${BENCH_SUFFIX} --backup="$1.bkp" --double_buffer="$1.bkp2" \
-                --executor="${EXECUTOR}" --formats="${FORMATS}" \
+                --executor="${EXECUTOR}" --formats="${LOCAL_FORMATS}" \
                 --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
+                --repetitions="${REPETITIONS}" \
                 <"$1.imd" 2>&1 >"$1"
     keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
 }
@@ -214,10 +237,12 @@ run_conversion_benchmarks() {
 # taken as the final result.
 run_spmv_benchmarks() {
     [ "${DRY_RUN}" == "true" ] && return
+    local LOCAL_FORMATS=$(remove_ell_worstcase "$1")
     cp "$1" "$1.imd" # make sure we're not loosing the original input
     ./spmv/spmv${BENCH_SUFFIX} --backup="$1.bkp" --double_buffer="$1.bkp2" \
-                --executor="${EXECUTOR}" --formats="${FORMATS}" \
+                --executor="${EXECUTOR}" --formats="${LOCAL_FORMATS}" \
                 --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
+                --repetitions="${REPETITIONS}" \
                 <"$1.imd" 2>&1 >"$1"
     keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
 }
@@ -239,6 +264,7 @@ run_solver_benchmarks() {
                     --gpu_timer=${GPU_TIMER} \
                     --jacobi_max_block_size=${SOLVERS_JACOBI_MAX_BS} --device_id="${DEVICE_ID}" \
                     --gmres_restart="${SOLVERS_GMRES_RESTART}" \
+                    --repetitions="${REPETITIONS}" \
                     <"$1.imd" 2>&1 >"$1"
     keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
 }
@@ -265,6 +291,7 @@ run_preconditioner_benchmarks() {
                 --jacobi_max_block_size="${bsize}" \
                 --jacobi_storage="${prec}" \
                 --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
+                --repetitions="${REPETITIONS}" \
                 <"$1.imd" 2>&1 >"$1"
             keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
         done

diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp
@@ -130,7 +130,7 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
             for (auto _ : ic_tuning.run()) {
                 system_matrix->apply(lend(b), lend(x_clone));
             }
-            tuning_case["time"].PushBack(tuning_timer->compute_average_time(),
+            tuning_case["time"].PushBack(ic_tuning.compute_average_time(),
                                          allocator);
             tuning_case["values"].PushBack(val, allocator);
         }

diff --git a/omp/components/format_conversion.hpp b/omp/components/format_conversion.hpp
@@ -40,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/types.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace omp {
@@ -84,16 +87,25 @@ inline void convert_unsorted_idxs_to_ptrs(const IndexType *idxs,
 template <typename IndexType>
 inline void convert_sorted_idxs_to_ptrs(const IndexType *idxs,
                                         size_type num_nonzeros, IndexType *ptrs,
-                                        size_type length)
+                                        size_type num_rows)
 {
     ptrs[0] = 0;
-    ptrs[length - 1] = num_nonzeros;
 
-#pragma omp parallel for schedule( \
-    static, ceildiv(num_nonzeros, omp_get_max_threads()))
-    for (size_type i = 0; i < num_nonzeros - 1; i++) {
-        for (size_type j = idxs[i] + 1; j <= idxs[i + 1]; j++) {
-            ptrs[j] = i + 1;
+    if (num_nonzeros == 0) {
+#pragma omp parallel for
+        for (size_type row = 0; row < num_rows; row++) {
+            ptrs[row + 1] = 0;
+        }
+    } else {
+        // add virtual sentinel values 0 and num_rows to handle empty first and
+        // last rows
+#pragma omp parallel for
+        for (size_type i = 0; i <= num_nonzeros; i++) {
+            auto begin_row = i == 0 ? size_type{} : idxs[i - 1];
+            auto end_row = i == num_nonzeros ? num_rows : idxs[i];
+            for (auto row = begin_row; row < end_row; row++) {
+                ptrs[row + 1] = i;
+            }
         }
     }
 }

diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp
@@ -204,9 +204,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename IndexType>
 void convert_row_idxs_to_ptrs(std::shared_ptr<const OmpExecutor> exec,
                               const IndexType *idxs, size_type num_nonzeros,
-                              IndexType *ptrs, size_type length)
+                              IndexType *ptrs, size_type num_rows)
 {
-    convert_sorted_idxs_to_ptrs(idxs, num_nonzeros, ptrs, length);
+    convert_sorted_idxs_to_ptrs(idxs, num_nonzeros, ptrs, num_rows);
 }
 
 
@@ -222,8 +222,7 @@ void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
 
     const auto source_row_idxs = source->get_const_row_idxs();
 
-    convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs,
-                             num_rows + 1);
+    convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs, num_rows);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

diff --git a/omp/matrix/hybrid_kernels.cpp b/omp/matrix/hybrid_kernels.cpp
@@ -112,58 +112,30 @@ void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
     const auto coo_row = source->get_const_coo_row_idxs();
     const auto coo_nnz = source->get_coo_num_stored_elements();
     const auto num_rows = source->get_size()[0];
-    auto coo_row_ptrs = Array<IndexType>(exec, num_rows + 1);
-    auto coo_row_ptrs_val = coo_row_ptrs.get_data();
-    convert_sorted_idxs_to_ptrs(coo_row, coo_nnz, coo_row_ptrs_val,
-                                num_rows + 1);
-
-    // Compute the row offset of Coo without zeros
-    auto coo_offset = Array<IndexType>(exec, num_rows);
-    auto coo_offset_val = coo_offset.get_data();
-    for (size_type i = 0; i < num_rows; i++) {
-        IndexType nonzeros = 0;
-#pragma omp parallel for reduction(+ : nonzeros)
-        for (size_type j = coo_row_ptrs_val[i]; j < coo_row_ptrs_val[i + 1];
-             j++) {
-            nonzeros += coo_val[j] != zero<ValueType>();
-        }
-        coo_offset_val[i] = nonzeros;
-    }
-
-    // Compute row pointer of Csr
-    csr_row_ptrs[0] = 0;
-#pragma omp parallel for
-    for (size_type i = 0; i < num_rows; i++) {
-        csr_row_ptrs[i + 1] = coo_offset_val[i];
-    }
-
-    for (size_type col = 0; col < max_nnz_per_row; col++) {
-#pragma omp parallel for
-        for (size_type row = 0; row < num_rows; row++) {
-            csr_row_ptrs[row + 1] +=
-                (ell->val_at(row, col) != zero<ValueType>());
-        }
-    }
+    auto coo_row_ptrs_array = Array<IndexType>(exec, num_rows + 1);
+    auto coo_row_ptrs = coo_row_ptrs_array.get_data();
+    convert_sorted_idxs_to_ptrs(coo_row, coo_nnz, coo_row_ptrs, num_rows);
 
-    auto workspace = Array<IndexType>(exec, num_rows + 1);
-    auto workspace_val = workspace.get_data();
-    for (size_type i = 1; i < num_rows + 1; i <<= 1) {
+    // Compute the row sizes of Coo without zeros
 #pragma omp parallel for
-        for (size_type j = i; j < num_rows + 1; j++) {
-            workspace_val[j] = csr_row_ptrs[j] + csr_row_ptrs[j - i];
+    for (size_type row = 0; row < num_rows; row++) {
+        IndexType nonzeros{};
+        for (auto j = coo_row_ptrs[row]; j < coo_row_ptrs[row + 1]; j++) {
+            nonzeros += coo_val[j] != zero<ValueType>();
         }
-#pragma omp parallel for
-        for (size_type j = i; j < num_rows + 1; j++) {
-            csr_row_ptrs[j] = workspace_val[j];
+        for (size_type col = 0; col < max_nnz_per_row; col++) {
+            nonzeros += (ell->val_at(row, col) != zero<ValueType>());
         }
+        csr_row_ptrs[row] = nonzeros;
     }
 
-// Fill in Csr
+    components::prefix_sum(exec, csr_row_ptrs, num_rows + 1);
+
+    // Fill in Csr
 #pragma omp parallel for
     for (IndexType row = 0; row < num_rows; row++) {
         // Ell part
-        size_type csr_idx = csr_row_ptrs[row];
-        size_type coo_idx = coo_offset_val[row];
+        auto csr_idx = csr_row_ptrs[row];
         for (IndexType col = 0; col < max_nnz_per_row; col++) {
             const auto val = ell->val_at(row, col);
             if (val != zero<ValueType>()) {
@@ -173,11 +145,10 @@ void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
             }
         }
         // Coo part
-        for (auto coo_idx = coo_row_ptrs_val[row];
-             coo_idx < coo_row_ptrs_val[row + 1]; coo_idx++) {
-            if (coo_val[coo_idx] != zero<ValueType>()) {
-                csr_val[csr_idx] = coo_val[coo_idx];
-                csr_col_idxs[csr_idx] = coo_col[coo_idx];
+        for (auto j = coo_row_ptrs[row]; j < coo_row_ptrs[row + 1]; j++) {
+            if (coo_val[j] != zero<ValueType>()) {
+                csr_val[csr_idx] = coo_val[j];
+                csr_col_idxs[csr_idx] = coo_col[j];
                 csr_idx++;
             }
         }

diff --git a/omp/test/matrix/hybrid_kernels.cpp b/omp/test/matrix/hybrid_kernels.cpp
@@ -55,6 +55,7 @@ namespace {
 
 class Hybrid : public ::testing::Test {
 protected:
+    using value_type = double;
     using Mtx = gko::matrix::Hybrid<>;
     using Vec = gko::matrix::Dense<>;
     using ComplexVec = gko::matrix::Dense<std::complex<double>>;
@@ -77,10 +78,17 @@ class Hybrid : public ::testing::Test {
     template <typename MtxType = Vec>
     std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
                                      int min_nnz_row)
+    {
+        return gen_mtx<MtxType>(num_rows, num_cols, min_nnz_row, num_cols);
+    }
+
+    template <typename MtxType = Vec>
+    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
+                                     int min_nnz_row, int max_nnz_row)
     {
         return gko::test::generate_random_matrix<MtxType>(
             num_rows, num_cols,
-            std::uniform_int_distribution<>(min_nnz_row, num_cols),
+            std::uniform_int_distribution<>(min_nnz_row, max_nnz_row),
             std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
     }
 
@@ -230,6 +238,49 @@ TEST_F(Hybrid, CountNonzerosIsEquivalentToRef)
 }
 
 
+TEST_F(Hybrid, ConvertEmptyCooToCsrIsEquivalentToRef)
+{
+    auto balanced_mtx =
+        Mtx::create(ref, std::make_shared<Mtx::column_limit>(4));
+    balanced_mtx->copy_from(gen_mtx(400, 200, 4, 4).get());
+    auto dbalanced_mtx =
+        Mtx::create(omp, std::make_shared<Mtx::column_limit>(4));
+    dbalanced_mtx->copy_from(balanced_mtx.get());
+    auto csr_mtx = gko::matrix::Csr<>::create(ref);
+    auto dcsr_mtx = gko::matrix::Csr<>::create(omp);
+
+    balanced_mtx->convert_to(csr_mtx.get());
+    dbalanced_mtx->convert_to(dcsr_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Hybrid, ConvertWithEmptyFirstAndLastRowToCsrIsEquivalentToRef)
+{
+    // create a dense matrix for easier manipulation
+    auto dense_mtx = gen_mtx(400, 200, 0, 4);
+    // set first and last row to zero
+    for (gko::size_type col = 0; col < dense_mtx->get_size()[1]; col++) {
+        dense_mtx->at(0, col) = gko::zero<value_type>();
+        dense_mtx->at(dense_mtx->get_size()[0] - 1, col) =
+            gko::zero<value_type>();
+    }
+    // now convert them to hybrid matrices
+    auto balanced_mtx = Mtx::create(ref);
+    balanced_mtx->copy_from(dense_mtx.get());
+    auto dbalanced_mtx = Mtx::create(omp);
+    dbalanced_mtx->copy_from(balanced_mtx.get());
+    auto csr_mtx = gko::matrix::Csr<>::create(ref);
+    auto dcsr_mtx = gko::matrix::Csr<>::create(omp);
+
+    balanced_mtx->convert_to(csr_mtx.get());
+    dbalanced_mtx->convert_to(dcsr_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14);
+}
+
+
 TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef)
 {
     set_up_apply_data(1, std::make_shared<Mtx::column_limit>(2));