Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Various benchmark improvements #812

Merged
merged 10 commits into from
Jul 6, 2021
7 changes: 3 additions & 4 deletions benchmark/conversions/conversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ void convert_matrix(const gko::LinOp *matrix_from, const char *format_to,
matrix_to->copy_from(matrix_from);
}
add_or_set_member(conversion_case[conversion_name], "time",
timer->compute_average_time(), allocator);
ic.compute_average_time(), allocator);
add_or_set_member(conversion_case[conversion_name], "repetitions",
timer->get_num_repetitions(), allocator);
ic.get_num_repetitions(), allocator);

// compute and write benchmark data
add_or_set_member(conversion_case[conversion_name], "completed", true,
Expand Down Expand Up @@ -156,8 +156,7 @@ int main(int argc, char *argv[])
try {
auto matrix_from =
share(formats::matrix_factory.at(format_from)(exec, data));
for (const auto &format : formats::matrix_factory) {
const auto format_to = std::get<0>(format);
for (const auto &format_to : formats) {
if (format_from == format_to) {
continue;
}
Expand Down
33 changes: 30 additions & 3 deletions benchmark/run_all_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ if [ ! "${EXECUTOR}" ]; then
echo "EXECUTOR environment variable not set - assuming \"${EXECUTOR}\"" 1>&2
fi

if [ ! "${REPETITIONS}" ]; then
REPETITIONS=10
echo "REPETITIONS environment variable not set - assuming ${REPETITIONS}" 1>&2
fi

if [ ! "${SEGMENTS}" ]; then
echo "SEGMENTS environment variable not set - running entire suite" 1>&2
SEGMENTS=1
Expand All @@ -35,6 +40,11 @@ if [ ! "${FORMATS}" ]; then
FORMATS="csr,coo,ell,hybrid,sellp"
fi

if [ ! "${ELL_IMBALANCE_LIMIT}" ]; then
echo "ELL_IMBALANCE_LIMIT environment variable not set - assuming 100" 1>&2
ELL_IMBALANCE_LIMIT=100
fi

if [ ! "${SOLVERS}" ]; then
SOLVERS="bicgstab,cg,cgs,fcg,gmres,cb_gmres_reduce1,idr"
echo "SOLVERS environment variable not set - assuming \"${SOLVERS}\"" 1>&2
Expand Down Expand Up @@ -67,7 +77,7 @@ fi

if [ ! "${SOLVERS_JACOBI_MAX_BS}" ]; then
SOLVERS_JACOBI_MAX_BS="32"
"SOLVERS_JACOBI_MAX_BS environment variable not set - assuming \"${SOLVERS_JACOBI_MAX_BS}\"" 1>&2
echo "SOLVERS_JACOBI_MAX_BS environment variable not set - assuming \"${SOLVERS_JACOBI_MAX_BS}\"" 1>&2
fi

if [ ! "${BENCHMARK_PRECISION}" ]; then
Expand Down Expand Up @@ -191,6 +201,17 @@ compute_matrix_statistics() {
}


remove_ell_worstcase() {
local IMBALANCE=$(jq '.[0].problem.row_distribution | .max / (.mean + 1) | floor' $1)
# if the imbalance is too large, remove ELL formats from the list.
if [[ "${IMBALANCE}" -gt "${ELL_IMBALANCE_LIMIT}" ]]; then
echo -n $FORMATS | tr ',' '\n' | grep -vE '^ell' | tr '\n' ',' | sed 's/,$//'
else
echo -n $FORMATS
fi
}


# Runs the conversion benchmarks for all matrix formats by using file $1 as the
# input, and updating it with the results. Backups are created after each
# benchmark run, to prevent data loss in case of a crash. Once the benchmarking
Expand All @@ -199,9 +220,11 @@ compute_matrix_statistics() {
run_conversion_benchmarks() {
[ "${DRY_RUN}" == "true" ] && return
cp "$1" "$1.imd" # make sure we're not loosing the original input
local LOCAL_FORMATS=$(remove_ell_worstcase "$1")
./conversions/conversions${BENCH_SUFFIX} --backup="$1.bkp" --double_buffer="$1.bkp2" \
--executor="${EXECUTOR}" --formats="${FORMATS}" \
--executor="${EXECUTOR}" --formats="${LOCAL_FORMATS}" \
--device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
--repetitions="${REPETITIONS}" \
<"$1.imd" 2>&1 >"$1"
keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
}
Expand All @@ -214,10 +237,12 @@ run_conversion_benchmarks() {
# taken as the final result.
run_spmv_benchmarks() {
[ "${DRY_RUN}" == "true" ] && return
local LOCAL_FORMATS=$(remove_ell_worstcase "$1")
cp "$1" "$1.imd" # make sure we're not loosing the original input
./spmv/spmv${BENCH_SUFFIX} --backup="$1.bkp" --double_buffer="$1.bkp2" \
--executor="${EXECUTOR}" --formats="${FORMATS}" \
--executor="${EXECUTOR}" --formats="${LOCAL_FORMATS}" \
--device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
--repetitions="${REPETITIONS}" \
<"$1.imd" 2>&1 >"$1"
keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
}
Expand All @@ -239,6 +264,7 @@ run_solver_benchmarks() {
--gpu_timer=${GPU_TIMER} \
--jacobi_max_block_size=${SOLVERS_JACOBI_MAX_BS} --device_id="${DEVICE_ID}" \
--gmres_restart="${SOLVERS_GMRES_RESTART}" \
--repetitions="${REPETITIONS}" \
<"$1.imd" 2>&1 >"$1"
keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
}
Expand All @@ -265,6 +291,7 @@ run_preconditioner_benchmarks() {
--jacobi_max_block_size="${bsize}" \
--jacobi_storage="${prec}" \
--device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
--repetitions="${REPETITIONS}" \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't run multiple SpMVs, so I would use the default number of repetitions.

<"$1.imd" 2>&1 >"$1"
keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
done
Expand Down
2 changes: 1 addition & 1 deletion benchmark/spmv/spmv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
for (auto _ : ic_tuning.run()) {
system_matrix->apply(lend(b), lend(x_clone));
}
tuning_case["time"].PushBack(tuning_timer->compute_average_time(),
tuning_case["time"].PushBack(ic_tuning.compute_average_time(),
allocator);
tuning_case["values"].PushBack(val, allocator);
}
Expand Down
26 changes: 19 additions & 7 deletions omp/components/format_conversion.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <ginkgo/core/base/types.hpp>


#include "core/components/prefix_sum.hpp"


namespace gko {
namespace kernels {
namespace omp {
Expand Down Expand Up @@ -84,16 +87,25 @@ inline void convert_unsorted_idxs_to_ptrs(const IndexType *idxs,
template <typename IndexType>
inline void convert_sorted_idxs_to_ptrs(const IndexType *idxs,
size_type num_nonzeros, IndexType *ptrs,
size_type length)
size_type num_rows)
{
ptrs[0] = 0;
ptrs[length - 1] = num_nonzeros;

#pragma omp parallel for schedule( \
static, ceildiv(num_nonzeros, omp_get_max_threads()))
for (size_type i = 0; i < num_nonzeros - 1; i++) {
for (size_type j = idxs[i] + 1; j <= idxs[i + 1]; j++) {
ptrs[j] = i + 1;
if (num_nonzeros == 0) {
#pragma omp parallel for
for (size_type row = 0; row < num_rows; row++) {
ptrs[row + 1] = 0;
}
} else {
// add virtual sentinel values 0 and num_rows to handle empty first and
// last rows
#pragma omp parallel for
for (size_type i = 0; i <= num_nonzeros; i++) {
auto begin_row = i == 0 ? size_type{} : idxs[i - 1];
auto end_row = i == num_nonzeros ? num_rows : idxs[i];
for (auto row = begin_row; row < end_row; row++) {
ptrs[row + 1] = i;
}
}
}
}
Expand Down
7 changes: 3 additions & 4 deletions omp/matrix/coo_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,9 +204,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename IndexType>
void convert_row_idxs_to_ptrs(std::shared_ptr<const OmpExecutor> exec,
const IndexType *idxs, size_type num_nonzeros,
IndexType *ptrs, size_type length)
IndexType *ptrs, size_type num_rows)
{
convert_sorted_idxs_to_ptrs(idxs, num_nonzeros, ptrs, length);
convert_sorted_idxs_to_ptrs(idxs, num_nonzeros, ptrs, num_rows);
}


Expand All @@ -222,8 +222,7 @@ void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,

const auto source_row_idxs = source->get_const_row_idxs();

convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs,
num_rows + 1);
convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs, num_rows);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
Expand Down
67 changes: 19 additions & 48 deletions omp/matrix/hybrid_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,58 +112,30 @@ void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
const auto coo_row = source->get_const_coo_row_idxs();
const auto coo_nnz = source->get_coo_num_stored_elements();
const auto num_rows = source->get_size()[0];
auto coo_row_ptrs = Array<IndexType>(exec, num_rows + 1);
auto coo_row_ptrs_val = coo_row_ptrs.get_data();
convert_sorted_idxs_to_ptrs(coo_row, coo_nnz, coo_row_ptrs_val,
num_rows + 1);

// Compute the row offset of Coo without zeros
auto coo_offset = Array<IndexType>(exec, num_rows);
auto coo_offset_val = coo_offset.get_data();
for (size_type i = 0; i < num_rows; i++) {
IndexType nonzeros = 0;
#pragma omp parallel for reduction(+ : nonzeros)
for (size_type j = coo_row_ptrs_val[i]; j < coo_row_ptrs_val[i + 1];
j++) {
nonzeros += coo_val[j] != zero<ValueType>();
}
coo_offset_val[i] = nonzeros;
}

// Compute row pointer of Csr
csr_row_ptrs[0] = 0;
#pragma omp parallel for
for (size_type i = 0; i < num_rows; i++) {
csr_row_ptrs[i + 1] = coo_offset_val[i];
}

for (size_type col = 0; col < max_nnz_per_row; col++) {
#pragma omp parallel for
for (size_type row = 0; row < num_rows; row++) {
csr_row_ptrs[row + 1] +=
(ell->val_at(row, col) != zero<ValueType>());
}
}
auto coo_row_ptrs_array = Array<IndexType>(exec, num_rows + 1);
auto coo_row_ptrs = coo_row_ptrs_array.get_data();
convert_sorted_idxs_to_ptrs(coo_row, coo_nnz, coo_row_ptrs, num_rows);

auto workspace = Array<IndexType>(exec, num_rows + 1);
auto workspace_val = workspace.get_data();
for (size_type i = 1; i < num_rows + 1; i <<= 1) {
// Compute the row sizes of Coo without zeros
#pragma omp parallel for
for (size_type j = i; j < num_rows + 1; j++) {
workspace_val[j] = csr_row_ptrs[j] + csr_row_ptrs[j - i];
for (size_type row = 0; row < num_rows; row++) {
IndexType nonzeros{};
for (auto j = coo_row_ptrs[row]; j < coo_row_ptrs[row + 1]; j++) {
nonzeros += coo_val[j] != zero<ValueType>();
}
#pragma omp parallel for
for (size_type j = i; j < num_rows + 1; j++) {
csr_row_ptrs[j] = workspace_val[j];
for (size_type col = 0; col < max_nnz_per_row; col++) {
nonzeros += (ell->val_at(row, col) != zero<ValueType>());
}
csr_row_ptrs[row] = nonzeros;
}

// Fill in Csr
components::prefix_sum(exec, csr_row_ptrs, num_rows + 1);

// Fill in Csr
#pragma omp parallel for
for (IndexType row = 0; row < num_rows; row++) {
// Ell part
size_type csr_idx = csr_row_ptrs[row];
size_type coo_idx = coo_offset_val[row];
auto csr_idx = csr_row_ptrs[row];
for (IndexType col = 0; col < max_nnz_per_row; col++) {
const auto val = ell->val_at(row, col);
if (val != zero<ValueType>()) {
Expand All @@ -173,11 +145,10 @@ void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
}
}
// Coo part
for (auto coo_idx = coo_row_ptrs_val[row];
coo_idx < coo_row_ptrs_val[row + 1]; coo_idx++) {
if (coo_val[coo_idx] != zero<ValueType>()) {
csr_val[csr_idx] = coo_val[coo_idx];
csr_col_idxs[csr_idx] = coo_col[coo_idx];
for (auto j = coo_row_ptrs[row]; j < coo_row_ptrs[row + 1]; j++) {
if (coo_val[j] != zero<ValueType>()) {
csr_val[csr_idx] = coo_val[j];
csr_col_idxs[csr_idx] = coo_col[j];
csr_idx++;
}
}
Expand Down
53 changes: 52 additions & 1 deletion omp/test/matrix/hybrid_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ namespace {

class Hybrid : public ::testing::Test {
protected:
using value_type = double;
using Mtx = gko::matrix::Hybrid<>;
using Vec = gko::matrix::Dense<>;
using ComplexVec = gko::matrix::Dense<std::complex<double>>;
Expand All @@ -77,10 +78,17 @@ class Hybrid : public ::testing::Test {
template <typename MtxType = Vec>
std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
int min_nnz_row)
{
return gen_mtx<MtxType>(num_rows, num_cols, min_nnz_row, num_cols);
}

template <typename MtxType = Vec>
std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
int min_nnz_row, int max_nnz_row)
{
return gko::test::generate_random_matrix<MtxType>(
num_rows, num_cols,
std::uniform_int_distribution<>(min_nnz_row, num_cols),
std::uniform_int_distribution<>(min_nnz_row, max_nnz_row),
std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
}

Expand Down Expand Up @@ -230,6 +238,49 @@ TEST_F(Hybrid, CountNonzerosIsEquivalentToRef)
}


TEST_F(Hybrid, ConvertEmptyCooToCsrIsEquivalentToRef)
{
auto balanced_mtx =
Mtx::create(ref, std::make_shared<Mtx::column_limit>(4));
balanced_mtx->copy_from(gen_mtx(400, 200, 4, 4).get());
auto dbalanced_mtx =
Mtx::create(omp, std::make_shared<Mtx::column_limit>(4));
dbalanced_mtx->copy_from(balanced_mtx.get());
auto csr_mtx = gko::matrix::Csr<>::create(ref);
auto dcsr_mtx = gko::matrix::Csr<>::create(omp);

balanced_mtx->convert_to(csr_mtx.get());
dbalanced_mtx->convert_to(dcsr_mtx.get());

GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14);
}


TEST_F(Hybrid, ConvertWithEmptyFirstAndLastRowToCsrIsEquivalentToRef)
{
// create a dense matrix for easier manipulation
auto dense_mtx = gen_mtx(400, 200, 0, 4);
// set first and last row to zero
for (gko::size_type col = 0; col < dense_mtx->get_size()[1]; col++) {
dense_mtx->at(0, col) = gko::zero<value_type>();
dense_mtx->at(dense_mtx->get_size()[0] - 1, col) =
gko::zero<value_type>();
}
// now convert them to hybrid matrices
auto balanced_mtx = Mtx::create(ref);
balanced_mtx->copy_from(dense_mtx.get());
auto dbalanced_mtx = Mtx::create(omp);
dbalanced_mtx->copy_from(balanced_mtx.get());
auto csr_mtx = gko::matrix::Csr<>::create(ref);
auto dcsr_mtx = gko::matrix::Csr<>::create(omp);

balanced_mtx->convert_to(csr_mtx.get());
dbalanced_mtx->convert_to(dcsr_mtx.get());

GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14);
}


TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef)
{
set_up_apply_data(1, std::make_shared<Mtx::column_limit>(2));
Expand Down