Skip to content

Commit

Permalink
Merge improved solver memory estimates and consistent iteration counts
Browse files Browse the repository at this point in the history
This PR adds memory volume estimates for our iterative solvers and
makes the iteration counts reported to stopping criteria and loggers consistent with
outer iterations of the core solver loop.

Related PR: #691
  • Loading branch information
upsj authored Mar 2, 2021
2 parents cab4358 + d11ec4a commit 118d031
Show file tree
Hide file tree
Showing 11 changed files with 158 additions and 80 deletions.
19 changes: 14 additions & 5 deletions core/solver/bicg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,15 @@ void Bicg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const

int iter = -1;

/* Memory movement summary:
* 28n * values + matrix/preconditioner storage + conj storage
* 2x SpMV: 4n * values + storage + conj storage
* 2x Preconditioner: 4n * values + storage + conj storage
* 2x dot 4n
* 1x step 1 (axpys) 6n
* 1x step 2 (axpys) 9n
* 1x norm2 residual n
*/
while (true) {
get_preconditioner()->apply(r.get(), z.get());
conj_trans_preconditioner->apply(r2.get(), z2.get());
Expand All @@ -201,21 +210,21 @@ void Bicg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
break;
}

exec->run(bicg::make_step_1(p.get(), z.get(), p2.get(), z2.get(),
rho.get(), prev_rho.get(), &stop_status));
// tmp = rho / prev_rho
// p = z + tmp * p
// p2 = z2 + tmp * p2
exec->run(bicg::make_step_1(p.get(), z.get(), p2.get(), z2.get(),
rho.get(), prev_rho.get(), &stop_status));
system_matrix_->apply(p.get(), q.get());
conj_trans_A->apply(p2.get(), q2.get());
p2->compute_dot(q.get(), beta.get());
exec->run(bicg::make_step_2(dense_x, r.get(), r2.get(), p.get(),
q.get(), q2.get(), beta.get(), rho.get(),
&stop_status));
// tmp = rho / beta
// x = x + tmp * p
// r = r - tmp * q
// r2 = r2 - tmp * q2
exec->run(bicg::make_step_2(dense_x, r.get(), r2.get(), p.get(),
q.get(), q2.get(), beta.get(), rho.get(),
&stop_status));
swap(prev_rho, rho);
}
}
Expand Down
29 changes: 19 additions & 10 deletions core/solver/bicgstab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,18 @@ void Bicgstab<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
rr->copy_from(r.get());

int iter = -1;

/* Memory movement summary:
* 31n * values + 2 * matrix/preconditioner storage
* 2x SpMV: 4n * values + 2 * storage
* 2x Preconditioner: 4n * values + 2 * storage
* 3x dot 6n
* 1x norm2 n
* 1x step 1 (fused axpys) 4n
* 1x step 2 (axpy) 3n
* 1x step 3 (fused axpys) 7n
* 2x norm2 residual 2n
*/
while (true) {
++iter;
this->template log<log::Logger::iteration_complete>(this, iter, r.get(),
Expand All @@ -152,21 +164,20 @@ void Bicgstab<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
break;
}

// tmp = rho / prev_rho * alpha / omega
// p = r + tmp * (p - omega * v)
exec->run(bicgstab::make_step_1(r.get(), p.get(), v.get(), rho.get(),
prev_rho.get(), alpha.get(),
omega.get(), &stop_status));
// tmp = rho / prev_rho * alpha / omega
// p = r + tmp * (p - omega * v)

get_preconditioner()->apply(p.get(), y.get());
system_matrix_->apply(y.get(), v.get());
rr->compute_dot(v.get(), beta.get());
exec->run(bicgstab::make_step_2(r.get(), s.get(), v.get(), rho.get(),
alpha.get(), beta.get(), &stop_status));
// alpha = rho / beta
// s = r - alpha * v
exec->run(bicgstab::make_step_2(r.get(), s.get(), v.get(), rho.get(),
alpha.get(), beta.get(), &stop_status));

++iter;
auto all_converged =
stop_criterion->update()
.num_iterations(iter)
Expand All @@ -178,8 +189,6 @@ void Bicgstab<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
exec->run(bicgstab::make_finalize(dense_x, y.get(), alpha.get(),
&stop_status));
}
this->template log<log::Logger::iteration_complete>(this, iter,
r.get());
if (all_converged) {
break;
}
Expand All @@ -188,12 +197,12 @@ void Bicgstab<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
system_matrix_->apply(z.get(), t.get());
s->compute_dot(t.get(), gamma.get());
t->compute_dot(t.get(), beta.get());
exec->run(bicgstab::make_step_3(
dense_x, r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(),
beta.get(), gamma.get(), omega.get(), &stop_status));
// omega = gamma / beta
// x = x + alpha * y + omega * z
// r = s - omega * t
exec->run(bicgstab::make_step_3(
dense_x, r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(),
beta.get(), gamma.get(), omega.get(), &stop_status));
swap(prev_rho, rho);
}
} // namespace solver
Expand Down
17 changes: 13 additions & 4 deletions core/solver/cg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,15 @@ void Cg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
x, r.get());

int iter = -1;
/* Memory movement summary:
* 18n * values + matrix/preconditioner storage
* 1x SpMV: 2n * values + storage
* 1x Preconditioner: 2n * values + storage
* 2x dot 4n
* 1x step 1 (axpy) 3n
* 1x step 2 (axpys) 6n
* 1x norm2 residual n
*/
while (true) {
get_preconditioner()->apply(r.get(), z.get());
r->compute_dot(z.get(), rho.get());
Expand All @@ -144,17 +153,17 @@ void Cg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
break;
}

exec->run(cg::make_step_1(p.get(), z.get(), rho.get(), prev_rho.get(),
&stop_status));
// tmp = rho / prev_rho
// p = z + tmp * p
exec->run(cg::make_step_1(p.get(), z.get(), rho.get(), prev_rho.get(),
&stop_status));
system_matrix_->apply(p.get(), q.get());
p->compute_dot(q.get(), beta.get());
exec->run(cg::make_step_2(dense_x, r.get(), p.get(), q.get(),
beta.get(), rho.get(), &stop_status));
// tmp = rho / beta
// x = x + tmp * p
// r = r - tmp * q
exec->run(cg::make_step_2(dense_x, r.get(), p.get(), q.get(),
beta.get(), rho.get(), &stop_status));
swap(prev_rho, rho);
}
}
Expand Down
30 changes: 18 additions & 12 deletions core/solver/cgs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,34 +138,40 @@ void Cgs<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
r_tld->copy_from(r.get());

int iter = 0;
/* Memory movement summary:
* 28n * values + 2 * matrix/preconditioner storage
* 2x SpMV: 4n * values + 2 * storage
* 2x Preconditioner: 4n * values + 2 * storage
* 2x dot 4n
* 1x step 1 (fused axpys) 5n
* 1x step 2 (fused axpys) 4n
* 1x step 3 (axpys) 6n
* 1x norm2 residual n
*/
while (true) {
r->compute_dot(r_tld.get(), rho.get());
// beta = rho / rho_prev
// u = r + beta * q
// p = u + beta * ( q + beta * p )
exec->run(cgs::make_step_1(r.get(), u.get(), p.get(), q.get(),
beta.get(), rho.get(), rho_prev.get(),
&stop_status));
// beta = rho / rho_prev
// u = r + beta * q;
// p = u + beta * ( q + beta * p );
get_preconditioner()->apply(p.get(), t.get());
system_matrix_->apply(t.get(), v_hat.get());
r_tld->compute_dot(v_hat.get(), gamma.get());
// alpha = rho / gamma
// q = u - alpha * v_hat
// t = u + q
exec->run(cgs::make_step_2(u.get(), v_hat.get(), q.get(), t.get(),
alpha.get(), rho.get(), gamma.get(),
&stop_status));

++iter;
this->template log<log::Logger::iteration_complete>(this, iter, r.get(),
dense_x);

// alpha = rho / gamma
// q = u - alpha * v_hat
// t = u + q
get_preconditioner()->apply(t.get(), u_hat.get());
system_matrix_->apply(u_hat.get(), t.get());
// r = r - alpha * t
// x = x + alpha * u_hat
exec->run(cgs::make_step_3(t.get(), u_hat.get(), r.get(), dense_x,
alpha.get(), &stop_status));
// r = r -alpha * t
// x = x + alpha * u_hat

++iter;
this->template log<log::Logger::iteration_complete>(this, iter, r.get(),
Expand Down
17 changes: 13 additions & 4 deletions core/solver/fcg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,15 @@ void Fcg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
x, r.get());

int iter = -1;
/* Memory movement summary:
* 21n * values + matrix/preconditioner storage
* 1x SpMV: 2n * values + storage
* 1x Preconditioner: 2n * values + storage
* 3x dot 6n
* 1x step 1 (axpy) 3n
* 1x step 2 (fused axpys) 7n
* 1x norm2 residual n
*/
while (true) {
get_preconditioner()->apply(r.get(), z.get());
r->compute_dot(z.get(), rho.get());
Expand All @@ -147,19 +156,19 @@ void Fcg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
break;
}

exec->run(fcg::make_step_1(p.get(), z.get(), rho_t.get(),
prev_rho.get(), &stop_status));
// tmp = rho_t / prev_rho
// p = z + tmp * p
exec->run(fcg::make_step_1(p.get(), z.get(), rho_t.get(),
prev_rho.get(), &stop_status));
system_matrix_->apply(p.get(), q.get());
p->compute_dot(q.get(), beta.get());
exec->run(fcg::make_step_2(dense_x, r.get(), t.get(), p.get(), q.get(),
beta.get(), rho.get(), &stop_status));
// tmp = rho / beta
// [prev_r = r] in registers
// x = x + tmp * p
// r = r - tmp * q
// t = r - [prev_r]
exec->run(fcg::make_step_2(dense_x, r.get(), t.get(), p.get(), q.get(),
beta.get(), rho.get(), &stop_status));
swap(prev_rho, rho);
}
}
Expand Down
62 changes: 39 additions & 23 deletions core/solver/gmres.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,24 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
auto after_preconditioner =
matrix::Dense<ValueType>::create_with_config_of(dense_x);

/* Memory movement summary for average iteration with krylov_dim d:
* (5/2d+21/2+14/d)n * values + (1+1/d) * matrix/preconditioner storage
* 1x SpMV: 2n * values + storage
* 1x Preconditioner: 2n * values + storage
* MGS: (5/2d+11/2)n = sum k=0 to d-1 of (5k+8)n/d
* 1x dots 2(k+1)n in iteration k (0-based)
* 1x axpys 3(k+1)n in iteration k (0-based)
* 1x norm2 n
* 1x scal 2n
* Restart: (1+14/d)n (every dth iteration)
* 1x gemv (d+1)n
* 1x Preconditioner 2n * values + storage
* 1x axpy 3n
* 1x copy 2n
* 1x Advanced SpMV 3n * values + storage
* 1x norm2 n
* 1x scal 2n
*/
while (true) {
++total_iter;
this->template log<log::Logger::iteration_complete>(
Expand All @@ -175,32 +193,31 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const

if (restart_iter == krylov_dim_) {
// Restart
// Solve upper triangular.
// y = hessenberg \ residual_norm_collection
// before_preconditioner = krylov_bases * y
exec->run(gmres::make_step_2(residual_norm_collection.get(),
krylov_bases.get(), hessenberg.get(),
y.get(), before_preconditioner.get(),
&final_iter_nums));
// Solve upper triangular.
// y = hessenberg \ residual_norm_collection
// before_preconditioner = krylov_bases * y

// x = x + get_preconditioner() * before_preconditioner
get_preconditioner()->apply(before_preconditioner.get(),
after_preconditioner.get());
dense_x->add_scaled(one_op.get(), after_preconditioner.get());
// Solve x
// x = x + get_preconditioner() * before_preconditioner
residual->copy_from(dense_b);
// residual = dense_b
residual->copy_from(dense_b);
// residual = residual - Ax
system_matrix_->apply(neg_one_op.get(), dense_x, one_op.get(),
residual.get());
// residual = residual - Ax
exec->run(gmres::make_initialize_2(
residual.get(), residual_norm.get(),
residual_norm_collection.get(), krylov_bases.get(),
&final_iter_nums, krylov_dim_));
// residual_norm = norm(residual)
// residual_norm_collection = {residual_norm, unchanged}
// krylov_bases(:, 1) = residual / residual_norm
// final_iter_nums = {0, ..., 0}
exec->run(gmres::make_initialize_2(
residual.get(), residual_norm.get(),
residual_norm_collection.get(), krylov_bases.get(),
&final_iter_nums, krylov_dim_));
restart_iter = 0;
}
auto this_krylov = krylov_bases->create_submatrix(
Expand All @@ -212,9 +229,9 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
span{system_matrix_->get_size()[0] * (restart_iter + 1),
system_matrix_->get_size()[0] * (restart_iter + 2)},
span{0, dense_b->get_size()[1]});
// preconditioned_vector = get_preconditioner() * this_krylov
get_preconditioner()->apply(this_krylov.get(),
preconditioned_vector.get());
// preconditioned_vector = get_preconditioner() * this_krylov

// Do Arnoldi and givens rotation
auto hessenberg_iter = hessenberg->create_submatrix(
Expand All @@ -223,14 +240,9 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
dense_b->get_size()[1] * (restart_iter + 1)});

// Start of arnoldi
system_matrix_->apply(preconditioned_vector.get(), next_krylov.get());
// next_krylov = A * preconditioned_vector
system_matrix_->apply(preconditioned_vector.get(), next_krylov.get());

exec->run(gmres::make_step_1(
dense_b->get_size()[0], givens_sin.get(), givens_cos.get(),
residual_norm.get(), residual_norm_collection.get(),
krylov_bases.get(), hessenberg_iter.get(), restart_iter,
&final_iter_nums, &stop_status));
// final_iter_nums += 1 (unconverged)
// next_krylov_basis is alias for (restart_iter + 1)-th krylov_bases
// for i in 0:restart_iter(include)
Expand Down Expand Up @@ -267,6 +279,11 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
// residual_norm_collection(restart_iter) = cos(restart_iter) * this_rnc
// residual_norm = abs(next_rnc)
// residual_norm_collection(restart_iter + 1) = next_rnc
exec->run(gmres::make_step_1(
dense_b->get_size()[0], givens_sin.get(), givens_cos.get(),
residual_norm.get(), residual_norm_collection.get(),
krylov_bases.get(), hessenberg_iter.get(), restart_iter,
&final_iter_nums, &stop_status));

restart_iter++;
}
Expand All @@ -279,18 +296,17 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
span{0, restart_iter},
span{0, dense_b->get_size()[1] * (restart_iter)});

// Solve upper triangular.
// y = hessenberg \ residual_norm_collection
// before_preconditioner = krylov_bases * y
exec->run(gmres::make_step_2(
residual_norm_collection.get(), krylov_bases_small.get(),
hessenberg_small.get(), y.get(), before_preconditioner.get(),
&final_iter_nums));
// Solve upper triangular.
// y = hessenberg \ residual_norm_collection
// before_preconditioner = krylov_bases * y
// x = x + get_preconditioner() * before_preconditioner
get_preconditioner()->apply(before_preconditioner.get(),
after_preconditioner.get());
dense_x->add_scaled(one_op.get(), after_preconditioner.get());
// Solve x
// x = x + get_preconditioner() * before_preconditioner
}


Expand Down
Loading

0 comments on commit 118d031

Please sign in to comment.