From 8093a6f422822daa31749183032559ed44704465 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 7 Jan 2021 18:37:06 +0100 Subject: [PATCH 01/16] move solver kernel comments before calls Co-authored-by: Hartwig Anzt --- core/solver/bicg.cpp | 10 ++++----- core/solver/bicgstab.cpp | 14 ++++++------- core/solver/cg.cpp | 8 ++++---- core/solver/cgs.cpp | 17 ++++++++-------- core/solver/fcg.cpp | 8 ++++---- core/solver/gmres.cpp | 44 +++++++++++++++++++--------------------- core/solver/idr.cpp | 33 +++++++++++++++--------------- 7 files changed, 65 insertions(+), 69 deletions(-) diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp index bbc4eab0e03..300b5eb4774 100644 --- a/core/solver/bicg.cpp +++ b/core/solver/bicg.cpp @@ -201,21 +201,21 @@ void Bicg::apply_impl(const LinOp *b, LinOp *x) const break; } - exec->run(bicg::make_step_1(p.get(), z.get(), p2.get(), z2.get(), - rho.get(), prev_rho.get(), &stop_status)); // tmp = rho / prev_rho // p = z + tmp * p // p2 = z2 + tmp * p2 + exec->run(bicg::make_step_1(p.get(), z.get(), p2.get(), z2.get(), + rho.get(), prev_rho.get(), &stop_status)); system_matrix_->apply(p.get(), q.get()); conj_trans_A->apply(p2.get(), q2.get()); p2->compute_dot(q.get(), beta.get()); - exec->run(bicg::make_step_2(dense_x, r.get(), r2.get(), p.get(), - q.get(), q2.get(), beta.get(), rho.get(), - &stop_status)); // tmp = rho / beta // x = x + tmp * p // r = r - tmp * q // r2 = r2 - tmp * q2 + exec->run(bicg::make_step_2(dense_x, r.get(), r2.get(), p.get(), + q.get(), q2.get(), beta.get(), rho.get(), + &stop_status)); swap(prev_rho, rho); } } diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index 0ee788387db..c545da6b313 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -152,19 +152,19 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const break; } + // tmp = rho / prev_rho * alpha / omega + // p = r + tmp * (p - omega * v) exec->run(bicgstab::make_step_1(r.get(), p.get(), v.get(), rho.get(), prev_rho.get(), alpha.get(), omega.get(), &stop_status)); - // tmp = rho / prev_rho * alpha / omega - // p = r + tmp * (p - omega * v) get_preconditioner()->apply(p.get(), y.get()); system_matrix_->apply(y.get(), v.get()); rr->compute_dot(v.get(), beta.get()); - exec->run(bicgstab::make_step_2(r.get(), s.get(), v.get(), rho.get(), - alpha.get(), beta.get(), &stop_status)); // alpha = rho / beta // s = r - alpha * v + exec->run(bicgstab::make_step_2(r.get(), s.get(), v.get(), rho.get(), + alpha.get(), beta.get(), &stop_status)); ++iter; auto all_converged = @@ -188,12 +188,12 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const system_matrix_->apply(z.get(), t.get()); s->compute_dot(t.get(), gamma.get()); t->compute_dot(t.get(), beta.get()); - exec->run(bicgstab::make_step_3( - dense_x, r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(), - beta.get(), gamma.get(), omega.get(), &stop_status)); // omega = gamma / beta // x = x + alpha * y + omega * z // r = s - omega * t + exec->run(bicgstab::make_step_3( + dense_x, r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(), + beta.get(), gamma.get(), omega.get(), &stop_status)); swap(prev_rho, rho); } } // namespace solver diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp index dd0558acf75..dd0d56c6f22 100644 --- a/core/solver/cg.cpp +++ b/core/solver/cg.cpp @@ -144,17 +144,17 @@ void Cg::apply_impl(const LinOp *b, LinOp *x) const break; } - exec->run(cg::make_step_1(p.get(), z.get(), rho.get(), prev_rho.get(), - &stop_status)); // tmp = rho / prev_rho // p = z + tmp * p + exec->run(cg::make_step_1(p.get(), z.get(), rho.get(), prev_rho.get(), + &stop_status)); system_matrix_->apply(p.get(), q.get()); p->compute_dot(q.get(), beta.get()); - exec->run(cg::make_step_2(dense_x, r.get(), p.get(), q.get(), - beta.get(), rho.get(), &stop_status)); // tmp = rho / beta // x = x + tmp * p // r = r - tmp * q + exec->run(cg::make_step_2(dense_x, r.get(), p.get(), q.get(), + beta.get(), rho.get(), &stop_status)); swap(prev_rho, rho); } } diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp index febacbb03bf..229e5de622d 100644 --- a/core/solver/cgs.cpp +++ b/core/solver/cgs.cpp @@ -140,15 +140,18 @@ void Cgs::apply_impl(const LinOp *b, LinOp *x) const int iter = 0; while (true) { r->compute_dot(r_tld.get(), rho.get()); + // beta = rho / rho_prev + // u = r + beta * q + // p = u + beta * ( q + beta * p ) exec->run(cgs::make_step_1(r.get(), u.get(), p.get(), q.get(), beta.get(), rho.get(), rho_prev.get(), &stop_status)); - // beta = rho / rho_prev - // u = r + beta * q; - // p = u + beta * ( q + beta * p ); get_preconditioner()->apply(p.get(), t.get()); system_matrix_->apply(t.get(), v_hat.get()); r_tld->compute_dot(v_hat.get(), gamma.get()); + // alpha = rho / gamma + // q = u - alpha * v_hat + // t = u + q exec->run(cgs::make_step_2(u.get(), v_hat.get(), q.get(), t.get(), alpha.get(), rho.get(), gamma.get(), &stop_status)); @@ -156,16 +159,12 @@ void Cgs::apply_impl(const LinOp *b, LinOp *x) const ++iter; this->template log(this, iter, r.get(), dense_x); - - // alpha = rho / gamma - // q = u - alpha * v_hat - // t = u + q get_preconditioner()->apply(t.get(), u_hat.get()); system_matrix_->apply(u_hat.get(), t.get()); + // r = r - alpha * t + // x = x + alpha * u_hat exec->run(cgs::make_step_3(t.get(), u_hat.get(), r.get(), dense_x, alpha.get(), &stop_status)); - // r = r -alpha * t - // x = x + alpha * u_hat ++iter; this->template log(this, iter, r.get(), diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp index dea9aeff494..f46e37efe13 100644 --- a/core/solver/fcg.cpp +++ b/core/solver/fcg.cpp @@ -147,19 +147,19 @@ void Fcg::apply_impl(const LinOp *b, LinOp *x) const break; } - exec->run(fcg::make_step_1(p.get(), z.get(), rho_t.get(), - prev_rho.get(), &stop_status)); // tmp = rho_t / prev_rho // p = z + tmp * p + exec->run(fcg::make_step_1(p.get(), z.get(), rho_t.get(), + prev_rho.get(), &stop_status)); system_matrix_->apply(p.get(), q.get()); p->compute_dot(q.get(), beta.get()); - exec->run(fcg::make_step_2(dense_x, r.get(), t.get(), p.get(), q.get(), - beta.get(), rho.get(), &stop_status)); // tmp = rho / beta // [prev_r = r] in registers // x = x + tmp * p // r = r - tmp * q // t = r - [prev_r] + exec->run(fcg::make_step_2(dense_x, r.get(), t.get(), p.get(), q.get(), + beta.get(), rho.get(), &stop_status)); swap(prev_rho, rho); } } diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index 3ee99cd3ed3..fb6d4fe012c 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -175,32 +175,31 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const if (restart_iter == krylov_dim_) { // Restart + // Solve upper triangular. + // y = hessenberg \ residual_norm_collection + // before_preconditioner = krylov_bases * y exec->run(gmres::make_step_2(residual_norm_collection.get(), krylov_bases.get(), hessenberg.get(), y.get(), before_preconditioner.get(), &final_iter_nums)); - // Solve upper triangular. - // y = hessenberg \ residual_norm_collection - // before_preconditioner = krylov_bases * y + // x = x + get_preconditioner() * before_preconditioner get_preconditioner()->apply(before_preconditioner.get(), after_preconditioner.get()); dense_x->add_scaled(one_op.get(), after_preconditioner.get()); - // Solve x - // x = x + get_preconditioner() * before_preconditioner - residual->copy_from(dense_b); // residual = dense_b + residual->copy_from(dense_b); + // residual = residual - Ax system_matrix_->apply(neg_one_op.get(), dense_x, one_op.get(), residual.get()); - // residual = residual - Ax - exec->run(gmres::make_initialize_2( - residual.get(), residual_norm.get(), - residual_norm_collection.get(), krylov_bases.get(), - &final_iter_nums, krylov_dim_)); // residual_norm = norm(residual) // residual_norm_collection = {residual_norm, unchanged} // krylov_bases(:, 1) = residual / residual_norm // final_iter_nums = {0, ..., 0} + exec->run(gmres::make_initialize_2( + residual.get(), residual_norm.get(), + residual_norm_collection.get(), krylov_bases.get(), + &final_iter_nums, krylov_dim_)); restart_iter = 0; } auto this_krylov = krylov_bases->create_submatrix( @@ -212,9 +211,9 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const span{system_matrix_->get_size()[0] * (restart_iter + 1), system_matrix_->get_size()[0] * (restart_iter + 2)}, span{0, dense_b->get_size()[1]}); + // preconditioned_vector = get_preconditioner() * this_krylov get_preconditioner()->apply(this_krylov.get(), preconditioned_vector.get()); - // preconditioned_vector = get_preconditioner() * this_krylov // Do Arnoldi and givens rotation auto hessenberg_iter = hessenberg->create_submatrix( @@ -223,14 +222,9 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const dense_b->get_size()[1] * (restart_iter + 1)}); // Start of arnoldi - system_matrix_->apply(preconditioned_vector.get(), next_krylov.get()); // next_krylov = A * preconditioned_vector + system_matrix_->apply(preconditioned_vector.get(), next_krylov.get()); - exec->run(gmres::make_step_1( - dense_b->get_size()[0], givens_sin.get(), givens_cos.get(), - residual_norm.get(), residual_norm_collection.get(), - krylov_bases.get(), hessenberg_iter.get(), restart_iter, - &final_iter_nums, &stop_status)); // final_iter_nums += 1 (unconverged) // next_krylov_basis is alias for (restart_iter + 1)-th krylov_bases // for i in 0:restart_iter(include) @@ -267,6 +261,11 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const // residual_norm_collection(restart_iter) = cos(restart_iter) * this_rnc // residual_norm = abs(next_rnc) // residual_norm_collection(restart_iter + 1) = next_rnc + exec->run(gmres::make_step_1( + dense_b->get_size()[0], givens_sin.get(), givens_cos.get(), + residual_norm.get(), residual_norm_collection.get(), + krylov_bases.get(), hessenberg_iter.get(), restart_iter, + &final_iter_nums, &stop_status)); restart_iter++; } @@ -279,18 +278,17 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const span{0, restart_iter}, span{0, dense_b->get_size()[1] * (restart_iter)}); + // Solve upper triangular. + // y = hessenberg \ residual_norm_collection + // before_preconditioner = krylov_bases * y exec->run(gmres::make_step_2( residual_norm_collection.get(), krylov_bases_small.get(), hessenberg_small.get(), y.get(), before_preconditioner.get(), &final_iter_nums)); - // Solve upper triangular. - // y = hessenberg \ residual_norm_collection - // before_preconditioner = krylov_bases * y + // x = x + get_preconditioner() * before_preconditioner get_preconditioner()->apply(before_preconditioner.get(), after_preconditioner.get()); dense_x->add_scaled(one_op.get(), after_preconditioner.get()); - // Solve x - // x = x + get_preconditioner() * before_preconditioner } diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index 88683136a00..d4428abbe08 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -185,32 +185,28 @@ void Idr::iterate(const LinOp *b, LinOp *x) const break; } - subspace_vectors->apply(residual.get(), f.get()); // f = P^H * residual + subspace_vectors->apply(residual.get(), f.get()); for (size_type k = 0; k < subspace_dim_; k++) { + // c = M \ f = (c_1, ..., c_s)^T + // v = residual - c_k * g_k - ... - c_s * g_s exec->run(idr::make_step_1(nrhs, k, m.get(), f.get(), residual.get(), g.get(), c.get(), v.get(), &stop_status)); - // c = M \ f = (c_1, ..., c_s)^T - // v = residual - c_k * g_k - ... - c_s * g_s get_preconditioner()->apply(v.get(), helper.get()); + // u_k = omega * preconditioned_vector + c_k * u_k + ... + c_s * u_s exec->run(idr::make_step_2(nrhs, k, omega.get(), helper.get(), c.get(), u.get(), &stop_status)); - // u_k = omega * preconditioned_vector + c_k * u_k + ... + c_s * u_s auto u_k = u->create_submatrix(span{0, problem_size}, span{k * nrhs, (k + 1) * nrhs}); - system_matrix_->apply(u_k.get(), helper.get()); // g_k = Au_k + system_matrix_->apply(u_k.get(), helper.get()); - exec->run(idr::make_step_3(nrhs, k, subspace_vectors.get(), g.get(), - helper.get(), u.get(), m.get(), f.get(), - alpha.get(), residual.get(), dense_x, - &stop_status)); // for i = 1 to k - 1 do // alpha = p^H_i * g_k / m_i,i // g_k -= alpha * g_i @@ -223,6 +219,10 @@ void Idr::iterate(const LinOp *b, LinOp *x) const // residual -= beta * g_k // dense_x += beta * u_k // f = (0,...,0,f_k+1 - beta * m_k+1,k,...,f_s - beta * m_s,k) + exec->run(idr::make_step_3(nrhs, k, subspace_vectors.get(), g.get(), + helper.get(), u.get(), m.get(), f.get(), + alpha.get(), residual.get(), dense_x, + &stop_status)); } get_preconditioner()->apply(residual.get(), helper.get()); @@ -232,14 +232,6 @@ void Idr::iterate(const LinOp *b, LinOp *x) const t->compute_dot(t.get(), tht.get()); residual->compute_norm2(residual_norm.get()); - exec->run(idr::make_compute_omega(nrhs, kappa_, tht.get(), - residual_norm.get(), omega.get(), - &stop_status)); - - t->scale(subspace_neg_one_op.get()); - residual->add_scaled(omega.get(), t.get()); - dense_x->add_scaled(omega.get(), helper.get()); - // omega = (t^H * residual) / (t^H * t) // rho = (t^H * residual) / (norm(t) * norm(residual)) // if abs(rho) < kappa then @@ -247,6 +239,13 @@ void Idr::iterate(const LinOp *b, LinOp *x) const // end if // residual -= omega * t // dense_x += omega * v + exec->run(idr::make_compute_omega(nrhs, kappa_, tht.get(), + residual_norm.get(), omega.get(), + &stop_status)); + + t->scale(subspace_neg_one_op.get()); + residual->add_scaled(omega.get(), t.get()); + dense_x->add_scaled(omega.get(), helper.get()); } } From e43843d4ec335c44881a7d7edfeefbfacc55bdf6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 7 Jan 2021 18:37:53 +0100 Subject: [PATCH 02/16] use C++ headers for time.h --- cuda/solver/idr_kernels.cu | 2 +- hip/solver/idr_kernels.hip.cpp | 2 +- omp/solver/idr_kernels.cpp | 1 + reference/solver/idr_kernels.cpp | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu index a493b2c8105..13dd40cd762 100644 --- a/cuda/solver/idr_kernels.cu +++ b/cuda/solver/idr_kernels.cu @@ -33,8 +33,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/solver/idr_kernels.hpp" +#include #include -#include #include diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp index 5c6b3bd6e92..a467504e95c 100644 --- a/hip/solver/idr_kernels.hip.cpp +++ b/hip/solver/idr_kernels.hip.cpp @@ -33,8 +33,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/solver/idr_kernels.hpp" +#include #include -#include #include diff --git a/omp/solver/idr_kernels.cpp b/omp/solver/idr_kernels.cpp index debfaf86eb8..44dc5d870be 100644 --- a/omp/solver/idr_kernels.cpp +++ b/omp/solver/idr_kernels.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/reference/solver/idr_kernels.cpp b/reference/solver/idr_kernels.cpp index c1e40c1786c..18a152cbf91 100644 --- a/reference/solver/idr_kernels.cpp +++ b/reference/solver/idr_kernels.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include From 707b305182281fc9292771e06024b32ca8a46ba4 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 7 Jan 2021 18:38:18 +0100 Subject: [PATCH 03/16] add memory movement estimates to solvers Co-authored-by: Hartwig Anzt --- core/solver/bicg.cpp | 8 ++++++++ core/solver/bicgstab.cpp | 11 +++++++++++ core/solver/cg.cpp | 8 ++++++++ core/solver/cgs.cpp | 9 +++++++++ core/solver/fcg.cpp | 8 ++++++++ core/solver/gmres.cpp | 11 +++++++++++ core/solver/idr.cpp | 16 ++++++++++++++++ 7 files changed, 71 insertions(+) diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp index 300b5eb4774..ea23dca52fd 100644 --- a/core/solver/bicg.cpp +++ b/core/solver/bicg.cpp @@ -184,6 +184,14 @@ void Bicg::apply_impl(const LinOp *b, LinOp *x) const int iter = -1; + /* Memory movement summary: + * 27n * values + 2 * matrix/preconditioner storage + * 2x SpMV: 4n * values + 2 * storage + * 2x Preconditioner: 4n * values + 2 * storage + * 2x dot 4n + * 1x step 1 (axpys) 6n + * 1x step 2 (axpys) 9n + */ while (true) { get_preconditioner()->apply(r.get(), z.get()); conj_trans_preconditioner->apply(r2.get(), z2.get()); diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index c545da6b313..4d2216ca57b 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -137,6 +137,17 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const rr->copy_from(r.get()); int iter = -1; + + /* Memory movement summary: + * 29n * values + 2 * matrix/preconditioner storage + * 2x SpMV: 4n * values + 2 * storage + * 2x Preconditioner: 4n * values + 2 * storage + * 3x dot 6n + * 1x norm2 n + * 1x step 1 (fused axpys) 4n + * 1x step 2 (axpy) 3n + * 1x step 3 (fused axpys) 7n + */ while (true) { ++iter; this->template log(this, iter, r.get(), diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp index dd0d56c6f22..13ecfc89e82 100644 --- a/core/solver/cg.cpp +++ b/core/solver/cg.cpp @@ -128,6 +128,14 @@ void Cg::apply_impl(const LinOp *b, LinOp *x) const x, r.get()); int iter = -1; + /* Memory movement summary: + * 17n * values + matrix/preconditioner storage + * 1x SpMV: 2n * values + storage + * 1x Preconditioner: 2n * values + storage + * 2x dot 4n + * 1x step 1 (axpy) 3n + * 1x step 2 (axpys) 6n + */ while (true) { get_preconditioner()->apply(r.get(), z.get()); r->compute_dot(z.get(), rho.get()); diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp index 229e5de622d..4689af5bb4f 100644 --- a/core/solver/cgs.cpp +++ b/core/solver/cgs.cpp @@ -138,6 +138,15 @@ void Cgs::apply_impl(const LinOp *b, LinOp *x) const r_tld->copy_from(r.get()); int iter = 0; + /* Memory movement summary: + * 27n * values + 2 * matrix/preconditioner storage + * 2x SpMV: 4n * values + 2 * storage + * 2x Preconditioner: 4n * values + 2 * storage + * 2x dot 4n + * 1x step 1 (fused axpys) 5n + * 1x step 2 (fused axpys) 4n + * 1x step 3 (axpys) 6n + */ while (true) { r->compute_dot(r_tld.get(), rho.get()); // beta = rho / rho_prev diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp index f46e37efe13..74cb939ef31 100644 --- a/core/solver/fcg.cpp +++ b/core/solver/fcg.cpp @@ -130,6 +130,14 @@ void Fcg::apply_impl(const LinOp *b, LinOp *x) const x, r.get()); int iter = -1; + /* Memory movement summary: + * 20n * values + matrix/preconditioner storage + * 1x SpMV: 2n * values + storage + * 1x Preconditioner: 2n * values + storage + * 3x dot 6n + * 1x step 1 (axpy) 3n + * 1x step 2 (fused axpys) 7n + */ while (true) { get_preconditioner()->apply(r.get(), z.get()); r->compute_dot(z.get(), rho.get()); diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index fb6d4fe012c..1a231dd005f 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -159,6 +159,17 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const auto after_preconditioner = matrix::Dense::create_with_config_of(dense_x); + /* Memory movement summary for iteration in Krylov subspace of dimension d + * (== krylov_dim / 2 on average), ignoring restarts: + * (5d+7)n * values + matrix/preconditioner storage + * 1x SpMV: 2n * values + storage + * 1x Preconditioner: 2n * values + storage + * MGS: (5d+3)n + * dx dot 2dn + * dx axpys 3dn + * 1x norm2 n + * 1x scal 2n + */ while (true) { ++total_iter; this->template log( diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index d4428abbe08..d862f6c1f78 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -171,6 +171,22 @@ void Idr::iterate(const LinOp *b, LinOp *x) const int total_iter = -1; + /* Memory movement summary for iteration with subspace dimension d + * (3d²+12(d+1))n * values + d * matrix/preconditioner storage + * dx SpMV: 2dn * values + d * storage + * dx Preconditioner: 2dn * values + d * storage + * 1x multidot (gemm) (d+1)n + * dx step 1 (fused axpys) d(d/2+2)n on average + * dx step 2 (fused axpys) d(d/2+2)n on average + * dx step 3: d(2d+4)n on average + * 1x orthogonalize g+u (3k+2)n in kth iteration + * 1x multidot (gemm) kn in (d-k)th iteration + * 2x axpy 6n + * 2x dot 4n + * 1x norm2 n + * 1x scale 2n + * 2x axpy 4n + */ while (true) { ++total_iter; this->template log( From 4ef264501ae730549acb88a0d470ceda7e8b5050 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 10 Jan 2021 19:59:54 +0100 Subject: [PATCH 04/16] update GMRES estimates to include precise counts Co-authored-by: Aditya Kashi --- core/solver/gmres.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index 1a231dd005f..26525b4318e 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -159,16 +159,18 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const auto after_preconditioner = matrix::Dense::create_with_config_of(dense_x); - /* Memory movement summary for iteration in Krylov subspace of dimension d - * (== krylov_dim / 2 on average), ignoring restarts: - * (5d+7)n * values + matrix/preconditioner storage + /* Memory movement summary for average iteration with krylov_dim d: + * (5/2d+15/2)n * values + matrix/preconditioner storage * 1x SpMV: 2n * values + storage * 1x Preconditioner: 2n * values + storage - * MGS: (5d+3)n - * dx dot 2dn - * dx axpys 3dn - * 1x norm2 n - * 1x scal 2n + * MGS: (5/2d+5/2)n = sum k=0 to d-1 of (5k+5)n/d + * 1x dots 2(k+1)n in iteration k (0-based) + * 1x axpys 3(k+1)n in iteration k (0-based) + * 1x norm2 n + * 1x scal 2n + * Restart (average): n = approx (d+1)n/d + * 1x gemm (d+1)n + * plus some O(n) terms we can ignore for larger d */ while (true) { ++total_iter; From 6597f0e46ab313fddc08f7b54064bcd6c4d4eeee Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 10 Jan 2021 20:43:00 +0100 Subject: [PATCH 05/16] count SpMV operations as iterations in IDR --- core/solver/idr.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index d862f6c1f78..e679f69d02c 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -205,6 +205,10 @@ void Idr::iterate(const LinOp *b, LinOp *x) const subspace_vectors->apply(residual.get(), f.get()); for (size_type k = 0; k < subspace_dim_; k++) { + ++total_iter; + this->template log( + this, total_iter, residual.get(), dense_x); + // c = M \ f = (c_1, ..., c_s)^T // v = residual - c_k * g_k - ... - c_s * g_s exec->run(idr::make_step_1(nrhs, k, m.get(), f.get(), From 99933f2021fb9730b1702e7c941eb0628d89106c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 10 Jan 2021 20:44:03 +0100 Subject: [PATCH 06/16] update IDR counts --- core/solver/idr.cpp | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index e679f69d02c..0939ab5984a 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -171,21 +171,21 @@ void Idr::iterate(const LinOp *b, LinOp *x) const int total_iter = -1; - /* Memory movement summary for iteration with subspace dimension d - * (3d²+12(d+1))n * values + d * matrix/preconditioner storage - * dx SpMV: 2dn * values + d * storage - * dx Preconditioner: 2dn * values + d * storage - * 1x multidot (gemm) (d+1)n - * dx step 1 (fused axpys) d(d/2+2)n on average - * dx step 2 (fused axpys) d(d/2+2)n on average - * dx step 3: d(2d+4)n on average - * 1x orthogonalize g+u (3k+2)n in kth iteration - * 1x multidot (gemm) kn in (d-k)th iteration - * 2x axpy 6n - * 2x dot 4n - * 1x norm2 n - * 1x scale 2n - * 2x axpy 4n + /* Memory movement summary for iteration with subspace dimension s + * (11/2s^2+27/2d+...)n * values + (s+1) * matrix/preconditioner storage + * dx SpMV: 2(s+1)n * values + (s+1) * storage + * dx Preconditioner: 2(s+1)n * values + (s+1) * storage + * 1x multidot (gemm) (s+1)n + * dx step 1 (fused axpys) s(s/2+5/2)n = approx 1 + sum k=[0,s) of (s-k+1)n + * dx step 2 (fused axpys) s(s/2+5/2)n = approx 1 + sum k=[0,s) of (s-k+1)n + * dx step 3: s(9/2s+7/2)n = sum k=[0,s) of (8k+s-k+1+6)n + * 1x orthogonalize g+u 8kn in iteration k (0-based) + * 1x multidot (gemm) (s-k+1)n in iteration k (0-based) + * 2x axpy 6n + * 1x dot 2n + * 2x norm2 2n + * 1x scale 2n + * 2x axpy 4n */ while (true) { ++total_iter; @@ -210,14 +210,14 @@ void Idr::iterate(const LinOp *b, LinOp *x) const this, total_iter, residual.get(), dense_x); // c = M \ f = (c_1, ..., c_s)^T - // v = residual - c_k * g_k - ... - c_s * g_s + // v = residual - sum i=[k,s) of (c_i * g_i) exec->run(idr::make_step_1(nrhs, k, m.get(), f.get(), residual.get(), g.get(), c.get(), v.get(), &stop_status)); get_preconditioner()->apply(v.get(), helper.get()); - // u_k = omega * preconditioned_vector + c_k * u_k + ... + c_s * u_s + // u_k = omega * precond_vector + sum i=[k,s) of (c_i * u_i) exec->run(idr::make_step_2(nrhs, k, omega.get(), helper.get(), c.get(), u.get(), &stop_status)); @@ -227,18 +227,18 @@ void Idr::iterate(const LinOp *b, LinOp *x) const // g_k = Au_k system_matrix_->apply(u_k.get(), helper.get()); - // for i = 1 to k - 1 do + // for i = [0,k) // alpha = p^H_i * g_k / m_i,i // g_k -= alpha * g_i // u_k -= alpha * u_i // end for - // for i = k to s do + // for i = [k,s) // m_i,k = p^H_i * g_k // end for // beta = f_k / m_k,k // residual -= beta * g_k // dense_x += beta * u_k - // f = (0,...,0,f_k+1 - beta * m_k+1,k,...,f_s - beta * m_s,k) + // f = (0,...,0,f_k+1 - beta * m_k+1,k,...,f_s-1 - beta * m_s-1,k) exec->run(idr::make_step_3(nrhs, k, subspace_vectors.get(), g.get(), helper.get(), u.get(), m.get(), f.get(), alpha.get(), residual.get(), dense_x, From 5108f864b52daca95aea5bafb4033ba35a3cd11a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 11 Jan 2021 16:12:40 +0100 Subject: [PATCH 07/16] Review updates Co-authored-by: Yuhsiang Tsai Co-authored-by: Aditya Kashi --- core/solver/bicg.cpp | 9 ++++++--- core/solver/bicgstab.cpp | 3 +++ core/solver/cgs.cpp | 3 +++ core/solver/idr.cpp | 10 +++++++--- 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp index ea23dca52fd..0c80443169b 100644 --- a/core/solver/bicg.cpp +++ b/core/solver/bicg.cpp @@ -185,9 +185,12 @@ void Bicg::apply_impl(const LinOp *b, LinOp *x) const int iter = -1; /* Memory movement summary: - * 27n * values + 2 * matrix/preconditioner storage - * 2x SpMV: 4n * values + 2 * storage - * 2x Preconditioner: 4n * values + 2 * storage + * Per iteration: + * 13.5n * values + matrix/preconditioner storage + * Two iterations: + * 27n * values + matrix/preconditioner storage + conj storage + * 2x SpMV: 4n * values + storage + conj storage + * 2x Preconditioner: 4n * values + storage + conj storage * 2x dot 4n * 1x step 1 (axpys) 6n * 1x step 2 (axpys) 9n diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index 4d2216ca57b..238acc48e45 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -139,6 +139,9 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const int iter = -1; /* Memory movement summary: + * Per iteration: + * 14.5n * values + matrix/preconditioner storage + * Two iterations: * 29n * values + 2 * matrix/preconditioner storage * 2x SpMV: 4n * values + 2 * storage * 2x Preconditioner: 4n * values + 2 * storage diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp index 4689af5bb4f..af8cc1fee7b 100644 --- a/core/solver/cgs.cpp +++ b/core/solver/cgs.cpp @@ -139,6 +139,9 @@ void Cgs::apply_impl(const LinOp *b, LinOp *x) const int iter = 0; /* Memory movement summary: + * Per iteration: + * 13.5n * values + matrix/preconditioner storage + * Two iterations: * 27n * values + 2 * matrix/preconditioner storage * 2x SpMV: 4n * values + 2 * storage * 2x Preconditioner: 4n * values + 2 * storage diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index 0939ab5984a..7f61f290a35 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -172,14 +172,17 @@ void Idr::iterate(const LinOp *b, LinOp *x) const int total_iter = -1; /* Memory movement summary for iteration with subspace dimension s - * (11/2s^2+27/2d+...)n * values + (s+1) * matrix/preconditioner storage + * Per iteration: + * (11/2s+10+5/(s+1))n * values + matrix/preconditioner storage + * For (s+1) iterations: + * (11/2s^2+31/2s+15)n * values + (s+1) * matrix/preconditioner storage * dx SpMV: 2(s+1)n * values + (s+1) * storage * dx Preconditioner: 2(s+1)n * values + (s+1) * storage * 1x multidot (gemm) (s+1)n * dx step 1 (fused axpys) s(s/2+5/2)n = approx 1 + sum k=[0,s) of (s-k+1)n * dx step 2 (fused axpys) s(s/2+5/2)n = approx 1 + sum k=[0,s) of (s-k+1)n - * dx step 3: s(9/2s+7/2)n = sum k=[0,s) of (8k+s-k+1+6)n - * 1x orthogonalize g+u 8kn in iteration k (0-based) + * dx step 3: s(9/2s+11/2)n = sum k=[0,s) of (8k+2+s-k+1+6)n + * 1x orthogonalize g+u (8k+2)n in iteration k (0-based) * 1x multidot (gemm) (s-k+1)n in iteration k (0-based) * 2x axpy 6n * 1x dot 2n @@ -232,6 +235,7 @@ void Idr::iterate(const LinOp *b, LinOp *x) const // g_k -= alpha * g_i // u_k -= alpha * u_i // end for + // store g_k to g // for i = [k,s) // m_i,k = p^H_i * g_k // end for From 927c201f4e88d330a99563be1280d0d5857e6f2d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 11 Jan 2021 16:22:22 +0100 Subject: [PATCH 08/16] exact gmres restart estimate --- core/solver/gmres.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index 26525b4318e..91224a33c05 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -160,7 +160,7 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const matrix::Dense::create_with_config_of(dense_x); /* Memory movement summary for average iteration with krylov_dim d: - * (5/2d+15/2)n * values + matrix/preconditioner storage + * (5/2d+15/2+12/d)n * values + (1+1/d) * matrix/preconditioner storage * 1x SpMV: 2n * values + storage * 1x Preconditioner: 2n * values + storage * MGS: (5/2d+5/2)n = sum k=0 to d-1 of (5k+5)n/d @@ -168,9 +168,13 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const * 1x axpys 3(k+1)n in iteration k (0-based) * 1x norm2 n * 1x scal 2n - * Restart (average): n = approx (d+1)n/d + * Restart: (1+12/d)n (every dth iteration) * 1x gemm (d+1)n - * plus some O(n) terms we can ignore for larger d + * 1x Preconditioner 2n * values + storage + * 1x axpy 3n + * 1x Advanced SpMV 3n * values + storage + * 1x norm2 n + * 2x scal 2n */ while (true) { ++total_iter; From b7be195c28d4f9c9a2715c45e7373fea3dd4a297 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 20 Jan 2021 17:42:17 +0100 Subject: [PATCH 09/16] update GMRES estimates Co-authored-by: Aditya Kashi Co-authored-by: Yuhsiang Tsai --- core/solver/gmres.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index 91224a33c05..730d539df46 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -160,21 +160,22 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const matrix::Dense::create_with_config_of(dense_x); /* Memory movement summary for average iteration with krylov_dim d: - * (5/2d+15/2+12/d)n * values + (1+1/d) * matrix/preconditioner storage + * (5/2d+21/2+14/d)n * values + (1+1/d) * matrix/preconditioner storage * 1x SpMV: 2n * values + storage * 1x Preconditioner: 2n * values + storage - * MGS: (5/2d+5/2)n = sum k=0 to d-1 of (5k+5)n/d + * MGS: (5/2d+11/2)n = sum k=0 to d-1 of (5k+8)n/d * 1x dots 2(k+1)n in iteration k (0-based) * 1x axpys 3(k+1)n in iteration k (0-based) * 1x norm2 n * 1x scal 2n - * Restart: (1+12/d)n (every dth iteration) + * Restart: (1+14/d)n (every dth iteration) * 1x gemm (d+1)n * 1x Preconditioner 2n * values + storage * 1x axpy 3n + * 1x copy 2n * 1x Advanced SpMV 3n * values + storage * 1x norm2 n - * 2x scal 2n + * 1x scal 2n */ while (true) { ++total_iter; From df862178aedb459012fdff734525e34efee2366a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 22 Jan 2021 11:42:58 +0100 Subject: [PATCH 10/16] update IDR estimates Co-authored-by: Yuhsiang Tsai --- core/solver/gmres.cpp | 2 +- core/solver/idr.cpp | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index 730d539df46..a5727cdb151 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -169,7 +169,7 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const * 1x norm2 n * 1x scal 2n * Restart: (1+14/d)n (every dth iteration) - * 1x gemm (d+1)n + * 1x gemv (d+1)n * 1x Preconditioner 2n * values + storage * 1x axpy 3n * 1x copy 2n diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index 7f61f290a35..9cdc35e1295 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -173,22 +173,22 @@ void Idr::iterate(const LinOp *b, LinOp *x) const /* Memory movement summary for iteration with subspace dimension s * Per iteration: - * (11/2s+10+5/(s+1))n * values + matrix/preconditioner storage + * (11/2s+10+7/(s+1))n * values + matrix/preconditioner storage * For (s+1) iterations: - * (11/2s^2+31/2s+15)n * values + (s+1) * matrix/preconditioner storage - * dx SpMV: 2(s+1)n * values + (s+1) * storage - * dx Preconditioner: 2(s+1)n * values + (s+1) * storage - * 1x multidot (gemm) (s+1)n - * dx step 1 (fused axpys) s(s/2+5/2)n = approx 1 + sum k=[0,s) of (s-k+1)n - * dx step 2 (fused axpys) s(s/2+5/2)n = approx 1 + sum k=[0,s) of (s-k+1)n - * dx step 3: s(9/2s+11/2)n = sum k=[0,s) of (8k+2+s-k+1+6)n + * (11/2s^2+31/2s+17)n * values + (s+1) * matrix/preconditioner storage + * (s+1)x SpMV: 2(s+1)n * values + (s+1) * storage + * (s+1)x Preconditioner: 2(s+1)n * values + (s+1) * storage + * 1x multidot (gemv) (s+1)n + * sx step 1 (fused axpys) s(s/2+5/2)n = sum k=[0,s) of (s-k+2)n + * sx step 2 (fused axpys) s(s/2+5/2)n = sum k=[0,s) of (s-k+2)n + * sx step 3: s(9/2s+11/2)n = sum k=[0,s) of (8k+2+s-k+1+6)n * 1x orthogonalize g+u (8k+2)n in iteration k (0-based) - * 1x multidot (gemm) (s-k+1)n in iteration k (0-based) + * 1x multidot (gemv) (s-k+1)n in iteration k (0-based) * 2x axpy 6n * 1x dot 2n * 2x norm2 2n * 1x scale 2n - * 2x axpy 4n + * 2x axpy 6n */ while (true) { ++total_iter; From bb8bb2ee5e5a43303361b3d5b15a8c05aa4f853a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 Feb 2021 13:01:54 +0100 Subject: [PATCH 11/16] Revert "count SpMV operations as iterations in IDR" This reverts commit b59befece0db8b3abc34b1e28709c50bf83a6f53. --- core/solver/idr.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index 9cdc35e1295..8dcde2e00d9 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -173,9 +173,7 @@ void Idr::iterate(const LinOp *b, LinOp *x) const /* Memory movement summary for iteration with subspace dimension s * Per iteration: - * (11/2s+10+7/(s+1))n * values + matrix/preconditioner storage - * For (s+1) iterations: - * (11/2s^2+31/2s+17)n * values + (s+1) * matrix/preconditioner storage + * (11/2s^2+31/2s+18)n * values + (s+1) * matrix/preconditioner storage * (s+1)x SpMV: 2(s+1)n * values + (s+1) * storage * (s+1)x Preconditioner: 2(s+1)n * values + (s+1) * storage * 1x multidot (gemv) (s+1)n @@ -189,6 +187,7 @@ void Idr::iterate(const LinOp *b, LinOp *x) const * 2x norm2 2n * 1x scale 2n * 2x axpy 6n + * 1x norm2 residual n */ while (true) { ++total_iter; @@ -208,10 +207,6 @@ void Idr::iterate(const LinOp *b, LinOp *x) const subspace_vectors->apply(residual.get(), f.get()); for (size_type k = 0; k < subspace_dim_; k++) { - ++total_iter; - this->template log( - this, total_iter, residual.get(), dense_x); - // c = M \ f = (c_1, ..., c_s)^T // v = residual - sum i=[k,s) of (c_i * g_i) exec->run(idr::make_step_1(nrhs, k, m.get(), f.get(), From b88b9fbbe0923e9ccb593347ee419f77d4c60ede Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 Feb 2021 13:26:12 +0100 Subject: [PATCH 12/16] add stopping criterion norm2 call --- core/solver/bicg.cpp | 5 +++-- core/solver/bicgstab.cpp | 5 +++-- core/solver/cg.cpp | 3 ++- core/solver/cgs.cpp | 5 +++-- core/solver/fcg.cpp | 3 ++- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp index 0c80443169b..9e58930e63b 100644 --- a/core/solver/bicg.cpp +++ b/core/solver/bicg.cpp @@ -186,14 +186,15 @@ void Bicg::apply_impl(const LinOp *b, LinOp *x) const /* Memory movement summary: * Per iteration: - * 13.5n * values + matrix/preconditioner storage + * 14n * values + matrix/preconditioner storage * Two iterations: - * 27n * values + matrix/preconditioner storage + conj storage + * 28n * values + matrix/preconditioner storage + conj storage * 2x SpMV: 4n * values + storage + conj storage * 2x Preconditioner: 4n * values + storage + conj storage * 2x dot 4n * 1x step 1 (axpys) 6n * 1x step 2 (axpys) 9n + * 1x norm2 residual n */ while (true) { get_preconditioner()->apply(r.get(), z.get()); diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index 238acc48e45..e05a3150dc5 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -140,9 +140,9 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const /* Memory movement summary: * Per iteration: - * 14.5n * values + matrix/preconditioner storage + * 15.5n * values + matrix/preconditioner storage * Two iterations: - * 29n * values + 2 * matrix/preconditioner storage + * 31n * values + 2 * matrix/preconditioner storage * 2x SpMV: 4n * values + 2 * storage * 2x Preconditioner: 4n * values + 2 * storage * 3x dot 6n @@ -150,6 +150,7 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const * 1x step 1 (fused axpys) 4n * 1x step 2 (axpy) 3n * 1x step 3 (fused axpys) 7n + * 2x norm2 residual 2n */ while (true) { ++iter; diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp index 13ecfc89e82..5a76392abff 100644 --- a/core/solver/cg.cpp +++ b/core/solver/cg.cpp @@ -129,12 +129,13 @@ void Cg::apply_impl(const LinOp *b, LinOp *x) const int iter = -1; /* Memory movement summary: - * 17n * values + matrix/preconditioner storage + * 18n * values + matrix/preconditioner storage * 1x SpMV: 2n * values + storage * 1x Preconditioner: 2n * values + storage * 2x dot 4n * 1x step 1 (axpy) 3n * 1x step 2 (axpys) 6n + * 1x norm2 residual n */ while (true) { get_preconditioner()->apply(r.get(), z.get()); diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp index af8cc1fee7b..da7db6ee70f 100644 --- a/core/solver/cgs.cpp +++ b/core/solver/cgs.cpp @@ -140,15 +140,16 @@ void Cgs::apply_impl(const LinOp *b, LinOp *x) const int iter = 0; /* Memory movement summary: * Per iteration: - * 13.5n * values + matrix/preconditioner storage + * 14n * values + matrix/preconditioner storage * Two iterations: - * 27n * values + 2 * matrix/preconditioner storage + * 28n * values + 2 * matrix/preconditioner storage * 2x SpMV: 4n * values + 2 * storage * 2x Preconditioner: 4n * values + 2 * storage * 2x dot 4n * 1x step 1 (fused axpys) 5n * 1x step 2 (fused axpys) 4n * 1x step 3 (axpys) 6n + * 1x norm2 residual n */ while (true) { r->compute_dot(r_tld.get(), rho.get()); diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp index 74cb939ef31..1e38cdae02d 100644 --- a/core/solver/fcg.cpp +++ b/core/solver/fcg.cpp @@ -131,12 +131,13 @@ void Fcg::apply_impl(const LinOp *b, LinOp *x) const int iter = -1; /* Memory movement summary: - * 20n * values + matrix/preconditioner storage + * 21n * values + matrix/preconditioner storage * 1x SpMV: 2n * values + storage * 1x Preconditioner: 2n * values + storage * 3x dot 6n * 1x step 1 (axpy) 3n * 1x step 2 (fused axpys) 7n + * 1x norm2 residual n */ while (true) { get_preconditioner()->apply(r.get(), z.get()); From 95a2f062f14851caf74713524b4899735fe2d513 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 Feb 2021 14:08:08 +0100 Subject: [PATCH 13/16] fix bicg iteration count in estimate --- core/solver/bicg.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp index 9e58930e63b..7cb51f2f404 100644 --- a/core/solver/bicg.cpp +++ b/core/solver/bicg.cpp @@ -185,9 +185,6 @@ void Bicg::apply_impl(const LinOp *b, LinOp *x) const int iter = -1; /* Memory movement summary: - * Per iteration: - * 14n * values + matrix/preconditioner storage - * Two iterations: * 28n * values + matrix/preconditioner storage + conj storage * 2x SpMV: 4n * values + storage + conj storage * 2x Preconditioner: 4n * values + storage + conj storage From 05a0ee6cc0f7ed194a297ce7a336603d0a9396ce Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 Feb 2021 16:00:26 +0100 Subject: [PATCH 14/16] make cgs iterations match stopping checks --- core/solver/cgs.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp index da7db6ee70f..99438ba4a1f 100644 --- a/core/solver/cgs.cpp +++ b/core/solver/cgs.cpp @@ -139,9 +139,6 @@ void Cgs::apply_impl(const LinOp *b, LinOp *x) const int iter = 0; /* Memory movement summary: - * Per iteration: - * 14n * values + matrix/preconditioner storage - * Two iterations: * 28n * values + 2 * matrix/preconditioner storage * 2x SpMV: 4n * values + 2 * storage * 2x Preconditioner: 4n * values + 2 * storage @@ -169,9 +166,6 @@ void Cgs::apply_impl(const LinOp *b, LinOp *x) const alpha.get(), rho.get(), gamma.get(), &stop_status)); - ++iter; - this->template log(this, iter, r.get(), - dense_x); get_preconditioner()->apply(t.get(), u_hat.get()); system_matrix_->apply(u_hat.get(), t.get()); // r = r - alpha * t From 61693d2eed2113d28bef9fa687634a2ced2bb817 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 5 Feb 2021 16:00:44 +0100 Subject: [PATCH 15/16] fix BiCGSTAB half iteration log --- core/solver/bicgstab.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index e05a3150dc5..1a3e38b9fba 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -194,7 +194,7 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const &stop_status)); } this->template log(this, iter, - r.get()); + s.get()); if (all_converged) { break; } From d11ec4af6acf3b2075e547467d88feba8ba92589 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 1 Mar 2021 19:33:35 +0100 Subject: [PATCH 16/16] replace BiCGSTAB half-iteration by full iteration but still keep the half-iteration stop check in place --- core/solver/bicgstab.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index 1a3e38b9fba..f3828797fdb 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -139,9 +139,6 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const int iter = -1; /* Memory movement summary: - * Per iteration: - * 15.5n * values + matrix/preconditioner storage - * Two iterations: * 31n * values + 2 * matrix/preconditioner storage * 2x SpMV: 4n * values + 2 * storage * 2x Preconditioner: 4n * values + 2 * storage @@ -181,7 +178,6 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const exec->run(bicgstab::make_step_2(r.get(), s.get(), v.get(), rho.get(), alpha.get(), beta.get(), &stop_status)); - ++iter; auto all_converged = stop_criterion->update() .num_iterations(iter) @@ -193,8 +189,6 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const exec->run(bicgstab::make_finalize(dense_x, y.get(), alpha.get(), &stop_status)); } - this->template log(this, iter, - s.get()); if (all_converged) { break; }