From ecad3b4a401c0c9723a8ee736efb6ca7505c8e36 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Thu, 7 Sep 2023 15:25:35 -0600 Subject: [PATCH 1/5] Switch to three levels of parallelism --- src/bvals/comms/boundary_communication.cpp | 60 +++++++++++++++++----- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp index 2deabe4ef81c..bb25451cc520 100644 --- a/src/bvals/comms/boundary_communication.cpp +++ b/src/bvals/comms/boundary_communication.cpp @@ -82,6 +82,9 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { PARTHENON_DEBUG_REQUIRE(bnd_info.size() == nbound, "Need same size for boundary info"); auto &sending_nonzero_flags = cache.sending_non_zero_flags; auto &sending_nonzero_flags_h = cache.sending_non_zero_flags_h; + for (int ibuf = 0; ibuf < cache.buf_vec.size(); ++ibuf) + sending_nonzero_flags_h(ibuf) = true; + Kokkos::parallel_for( "SendBoundBufs", Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), @@ -98,15 +101,27 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { int idx_offset = 0; for (int iel = 0; iel < bnd_info(b).ntopological_elements; ++iel) { auto &idxer = bnd_info(b).idxer[iel]; + const int Ni = idxer.EndIdx<5>() - idxer.StartIdx<5>() + 1; Kokkos::parallel_reduce( - Kokkos::TeamThreadRange<>(team_member, idxer.size()), - [&](const int idx, bool &lnon_zero) { - const auto [t, u, v, k, j, i] = idxer(idx); - const Real &val = bnd_info(b).var(iel, t, u, v, k, j, i); - bnd_info(b).buf(idx + idx_offset) = val; - lnon_zero = lnon_zero || (std::abs(val) >= threshold); - }, - Kokkos::LOr(non_zero[iel])); + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx, bool &lnon_zero) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + 
[&](int m) { + buf[m] = var[m]; + }); + + bool mnon_zero = false; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m, bool &llnon_zero) { + llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold); + }, Kokkos::LOr(mnon_zero)); + + lnon_zero = lnon_zero || mnon_zero; + }, Kokkos::LOr(non_zero[iel])); idx_offset += idxer.size(); } Kokkos::single(Kokkos::PerTeam(team_member), [&]() { @@ -229,20 +244,37 @@ TaskStatus SetBounds(std::shared_ptr> &md) { int idx_offset = 0; for (int iel = 0; iel < bnd_info(b).ntopological_elements; ++iel) { auto &idxer = bnd_info(b).idxer[iel]; + const int Ni = idxer.EndIdx<5>() - idxer.StartIdx<5>() + 1; if (bnd_info(b).buf_allocated && bnd_info(b).allocated) { - Kokkos::parallel_for(Kokkos::TeamThreadRange<>(team_member, idxer.size()), + Kokkos::parallel_for(Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), [&](const int idx) { const auto [t, u, v, k, j, i] = idxer(idx); - if (idxer.IsActive(k, j, i)) - bnd_info(b).var(iel, t, u, v, k, j, i) = - bnd_info(b).buf(idx + idx_offset); + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + // Have to do this because of some weird issue about structure bindings being captured + const int kk = k; + const int jj = j; + const int ii = i; + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { + if (idxer.IsActive(kk, jj, ii + m)) + var[m] = buf[m]; + }); }); } else if (bnd_info(b).allocated) { const Real default_val = bnd_info(b).var.sparse_default_val; - Kokkos::parallel_for(Kokkos::TeamThreadRange<>(team_member, idxer.size()), + Kokkos::parallel_for(Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), [&](const int idx) { const auto [t, u, v, k, j, i] = idxer(idx); - bnd_info(b).var(iel, t, u, v, k, j, i) = default_val; + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + const int kk = k; + const int jj = j; + const int ii = i; + 
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { + if (idxer.IsActive(kk, jj, ii + m)) + var[m] = default_val; + }); }); } idx_offset += idxer.size(); From 6e7467e4abee43a2c5ff6d1a37eac22010b4ef7a Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Thu, 7 Sep 2023 15:26:32 -0600 Subject: [PATCH 2/5] Remove unnecessary initialization of host nonzero flags --- src/bvals/comms/boundary_communication.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp index bb25451cc520..6108e1da3bbf 100644 --- a/src/bvals/comms/boundary_communication.cpp +++ b/src/bvals/comms/boundary_communication.cpp @@ -82,8 +82,6 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { PARTHENON_DEBUG_REQUIRE(bnd_info.size() == nbound, "Need same size for boundary info"); auto &sending_nonzero_flags = cache.sending_non_zero_flags; auto &sending_nonzero_flags_h = cache.sending_non_zero_flags_h; - for (int ibuf = 0; ibuf < cache.buf_vec.size(); ++ibuf) - sending_nonzero_flags_h(ibuf) = true; Kokkos::parallel_for( "SendBoundBufs", From 3d410234a6974025fb156542f92f18b711b669bd Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Thu, 7 Sep 2023 15:40:06 -0600 Subject: [PATCH 3/5] Add template in a couple places, format, lint --- src/bvals/comms/boundary_communication.cpp | 92 +++++++++++----------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp index 6108e1da3bbf..54275ae40db7 100644 --- a/src/bvals/comms/boundary_communication.cpp +++ b/src/bvals/comms/boundary_communication.cpp @@ -82,7 +82,7 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { PARTHENON_DEBUG_REQUIRE(bnd_info.size() == nbound, "Need same size for boundary info"); auto &sending_nonzero_flags = cache.sending_non_zero_flags; auto &sending_nonzero_flags_h = cache.sending_non_zero_flags_h; - + Kokkos::parallel_for( 
"SendBoundBufs", Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), @@ -99,27 +99,28 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { int idx_offset = 0; for (int iel = 0; iel < bnd_info(b).ntopological_elements; ++iel) { auto &idxer = bnd_info(b).idxer[iel]; - const int Ni = idxer.EndIdx<5>() - idxer.StartIdx<5>() + 1; + const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1; Kokkos::parallel_reduce( Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), - [&](const int idx, bool &lnon_zero) { + [&](const int idx, bool &lnon_zero) { const auto [t, u, v, k, j, i] = idxer(idx * Ni); - Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); - Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), - [&](int m) { - buf[m] = var[m]; - }); - + [&](int m) { buf[m] = var[m]; }); + bool mnon_zero = false; - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange<>(team_member, Ni), - [&](int m, bool &llnon_zero) { - llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold); - }, Kokkos::LOr(mnon_zero)); - + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m, bool &llnon_zero) { + llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold); + }, + Kokkos::LOr(mnon_zero)); + lnon_zero = lnon_zero || mnon_zero; - }, Kokkos::LOr(non_zero[iel])); + }, + Kokkos::LOr(non_zero[iel])); idx_offset += idxer.size(); } Kokkos::single(Kokkos::PerTeam(team_member), [&]() { @@ -242,38 +243,41 @@ TaskStatus SetBounds(std::shared_ptr> &md) { int idx_offset = 0; for (int iel = 0; iel < bnd_info(b).ntopological_elements; ++iel) { auto &idxer = bnd_info(b).idxer[iel]; - const int Ni = idxer.EndIdx<5>() - idxer.StartIdx<5>() + 1; + const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1; if (bnd_info(b).buf_allocated && 
bnd_info(b).allocated) { - Kokkos::parallel_for(Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), - [&](const int idx) { - const auto [t, u, v, k, j, i] = idxer(idx); - Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); - Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); - // Have to do this because of some weird issue about structure bindings being captured - const int kk = k; - const int jj = j; - const int ii = i; - Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), - [&](int m) { - if (idxer.IsActive(kk, jj, ii + m)) - var[m] = buf[m]; - }); - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + const auto [t, u, v, k, j, i] = idxer(idx); + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + // Have to do this because of some weird issue about structure bindings + // being captured + const int kk = k; + const int jj = j; + const int ii = i; + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { + if (idxer.IsActive(kk, jj, ii + m)) + var[m] = buf[m]; + }); + }); } else if (bnd_info(b).allocated) { const Real default_val = bnd_info(b).var.sparse_default_val; - Kokkos::parallel_for(Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), - [&](const int idx) { - const auto [t, u, v, k, j, i] = idxer(idx); - Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); - const int kk = k; - const int jj = j; - const int ii = i; - Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), - [&](int m) { - if (idxer.IsActive(kk, jj, ii + m)) - var[m] = default_val; - }); - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + const auto [t, u, v, k, j, i] = idxer(idx); + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + const int kk = k; + const int jj = j; + const int ii = i; + 
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { + if (idxer.IsActive(kk, jj, ii + m)) + var[m] = default_val; + }); + }); } idx_offset += idxer.size(); } From 82d60c822de6d453ddcfbd31dc4a21b235d5e994 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Thu, 7 Sep 2023 16:53:43 -0600 Subject: [PATCH 4/5] Bug fix --- src/bvals/comms/boundary_communication.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp index 54275ae40db7..54fd5945028e 100644 --- a/src/bvals/comms/boundary_communication.cpp +++ b/src/bvals/comms/boundary_communication.cpp @@ -248,7 +248,7 @@ TaskStatus SetBounds(std::shared_ptr> &md) { Kokkos::parallel_for( Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), [&](const int idx) { - const auto [t, u, v, k, j, i] = idxer(idx); + const auto [t, u, v, k, j, i] = idxer(idx * Ni); Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); // Have to do this because of some weird issue about structure bindings @@ -267,7 +267,7 @@ TaskStatus SetBounds(std::shared_ptr> &md) { Kokkos::parallel_for( Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), [&](const int idx) { - const auto [t, u, v, k, j, i] = idxer(idx); + const auto [t, u, v, k, j, i] = idxer(idx * Ni); Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); const int kk = k; const int jj = j; From 7f1d629cf0762fa2c2e7e543083b4aa7028088fc Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Thu, 7 Sep 2023 17:20:01 -0600 Subject: [PATCH 5/5] Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index befb895a8a5a..d3405ddcb800 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ - [[PR 890]](https://github.com/parthenon-hpc-lab/parthenon/pull/890) Fix bugs in sparse communication and prolongation ### Infrastructure (changes irrelevant to 
downstream codes) +- [[PR 938]](https://github.com/parthenon-hpc-lab/parthenon/pull/938) Restructure buffer packing/unpacking kernel hierarchical parallelism - [[PR 904]](https://github.com/parthenon-hpc-lab/parthenon/pull/904) Move to prolongation/restriction in one for AMR and communicate non-cell centered fields - [[PR 918]](https://github.com/parthenon-hpc-lab/parthenon/pull/918) Refactor RegionSize - [[PR 901]](https://github.com/parthenon-hpc-lab/parthenon/pull/901) Implement shared element ownership model