Skip to content

Commit

Permalink
Merge branch 'develop' into lfroberts36/speedup-buffer-kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
jdolence authored Oct 6, 2023
2 parents 3c71701 + 2a8a2f5 commit 5be9d17
Show file tree
Hide file tree
Showing 10 changed files with 140 additions and 78 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,18 @@
- [[PR 885]](https://github.com/parthenon-hpc-lab/parthenon/pull/885) Expose PackDescriptor and use uids in SparsePacks

### Fixed (not changing behavior/API/variables/...)
- [[PR 947]](https://github.com/parthenon-hpc-lab/parthenon/pull/947) Add missing ForceRemeshComm dependencies
- [[PR 928]](https://github.com/parthenon-hpc-lab/parthenon/pull/928) Fix boundary comms during refinement next to refined blocks
- [[PR 937]](https://github.com/parthenon-hpc-lab/parthenon/pull/937) Fix multiple line continuations
- [[PR 933]](https://github.com/parthenon-hpc-lab/parthenon/pull/933) Remove extraneous debug check
- [[PR 917]](https://github.com/parthenon-hpc-lab/parthenon/pull/917) Update Iterative Tasking Infrastructure
- [[PR 890]](https://github.com/parthenon-hpc-lab/parthenon/pull/890) Fix bugs in sparse communication and prolongation

### Infrastructure (changes irrelevant to downstream codes)
- [[PR 938]](https://github.com/parthenon-hpc-lab/parthenon/pull/938) Restructure buffer packing/unpacking kernel hierarchical parallelism
- [[PR 944]](https://github.com/parthenon-hpc-lab/parthenon/pull/944) Move sparse pack identifier creation to descriptor
- [[PR 904]](https://github.com/parthenon-hpc-lab/parthenon/pull/904) Move to prolongation/restriction in one for AMR and communicate non-cell centered fields
- [[PR 918]](https://github.com/parthenon-hpc-lab/parthenon/pull/918) Refactor RegionSize
- [[PR 918]](https://github.com/parthenon-hpc-lab/parthenon/pull/918) Refactor RegionSize
- [[PR 901]](https://github.com/parthenon-hpc-lab/parthenon/pull/901) Implement shared element ownership model

### Removed (removing behavior/API/variables/...)
Expand Down
8 changes: 6 additions & 2 deletions src/bvals/bvals.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

#include "basic_types.hpp"
Expand Down Expand Up @@ -78,7 +79,9 @@ class BoundaryBase {
static int BufferID(int dim, bool multilevel);
static int FindBufferID(int ox1, int ox2, int ox3, int fi1, int fi2);

void SearchAndSetNeighbors(MeshBlockTree &tree, int *ranklist, int *nslist);
void
SearchAndSetNeighbors(MeshBlockTree &tree, int *ranklist, int *nslist,
const std::unordered_set<LogicalLocation> &newly_refined = {});

protected:
// 1D refined or unrefined=2
Expand All @@ -90,7 +93,8 @@ class BoundaryBase {
RegionSize block_size_;
ParArrayND<Real> sarea_[2];

void SetNeighborOwnership();
void
SetNeighborOwnership(const std::unordered_set<LogicalLocation> &newly_refined = {});

private:
// calculate 3x shared static data members when constructing only the 1st class instance
Expand Down
17 changes: 10 additions & 7 deletions src/bvals/bvals_base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <string> // c_str()

#include "globals.hpp"
#include "mesh/logical_location.hpp"
#include "mesh/mesh.hpp"
#include "utils/buffer_utils.hpp"
#include "utils/error_checking.hpp"
Expand Down Expand Up @@ -300,8 +301,9 @@ int BoundaryBase::CreateBvalsMPITag(int lid, int bufid) {

// TODO(felker): break up this long function

void BoundaryBase::SearchAndSetNeighbors(MeshBlockTree &tree, int *ranklist,
int *nslist) {
void BoundaryBase::SearchAndSetNeighbors(
MeshBlockTree &tree, int *ranklist, int *nslist,
const std::unordered_set<LogicalLocation> &newly_refined) {
Kokkos::Profiling::pushRegion("SearchAndSetNeighbors");
MeshBlockTree *neibt;
int myox1, myox2 = 0, myox3 = 0, myfx1, myfx2, myfx3;
Expand Down Expand Up @@ -368,7 +370,7 @@ void BoundaryBase::SearchAndSetNeighbors(MeshBlockTree &tree, int *ranklist,
}
}
if (block_size_.nx(X2DIR) == 1) {
SetNeighborOwnership();
SetNeighborOwnership(newly_refined);
Kokkos::Profiling::popRegion(); // SearchAndSetNeighbors
return;
}
Expand Down Expand Up @@ -503,7 +505,7 @@ void BoundaryBase::SearchAndSetNeighbors(MeshBlockTree &tree, int *ranklist,
}

if (block_size_.nx(X3DIR) == 1) {
SetNeighborOwnership();
SetNeighborOwnership(newly_refined);
Kokkos::Profiling::popRegion(); // SearchAndSetNeighbors
return;
}
Expand Down Expand Up @@ -626,11 +628,12 @@ void BoundaryBase::SearchAndSetNeighbors(MeshBlockTree &tree, int *ranklist,
}
}

SetNeighborOwnership();
SetNeighborOwnership(newly_refined);
Kokkos::Profiling::popRegion(); // SearchAndSetNeighbors
}

void BoundaryBase::SetNeighborOwnership() {
void BoundaryBase::SetNeighborOwnership(
const std::unordered_set<LogicalLocation> &newly_refined) {
// Set neighbor block ownership
std::set<LogicalLocation> allowed_neighbors;
allowed_neighbors.insert(loc); // Insert the location of this block
Expand All @@ -642,7 +645,7 @@ void BoundaryBase::SetNeighborOwnership() {
RootGridInfo rg_info = pmy_mesh_->GetRootGridInfo();
for (int n = 0; n < nneighbor; ++n) {
neighbor[n].ownership =
DetermineOwnership(neighbor[n].loc, allowed_neighbors, rg_info);
DetermineOwnership(neighbor[n].loc, allowed_neighbors, rg_info, newly_refined);
neighbor[n].ownership.initialized = true;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/bvals/comms/boundary_communication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ template <BoundaryType bound_type>
TaskStatus StartReceiveBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
Kokkos::Profiling::pushRegion("Task_StartReceiveBoundBufs");
Mesh *pmesh = md->GetMeshPointer();
auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_send, false);
auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false);
if (cache.buf_vec.size() == 0)
InitializeBufferCache<bound_type>(md, &(pmesh->boundary_comm_map), &cache, ReceiveKey,
false);
Expand Down
55 changes: 21 additions & 34 deletions src/interface/sparse_pack_base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,21 +276,28 @@ template SparsePackBase SparsePackBase::Build<MeshData<Real>>(MeshData<Real> *,
template <class T>
SparsePackBase &SparsePackCache::Get(T *pmd, const PackDescriptor &desc,
const std::vector<bool> &include_block) {
std::string ident = GetIdentifier(desc, include_block);
if (pack_map.count(ident) > 0) {
auto &pack = pack_map[ident].first;
if (pack_map.count(desc.identifier) > 0) {
auto &cache_tuple = pack_map[desc.identifier];
auto &pack = std::get<0>(cache_tuple);
auto alloc_status_in = SparsePackBase::GetAllocStatus(pmd, desc, include_block);
auto &alloc_status = pack_map[ident].second;
auto &alloc_status = std::get<1>(cache_tuple);
if (alloc_status.size() != alloc_status_in.size())
return BuildAndAdd(pmd, desc, ident, include_block);
return BuildAndAdd(pmd, desc, include_block);
for (int i = 0; i < alloc_status_in.size(); ++i) {
if (alloc_status[i] != alloc_status_in[i])
return BuildAndAdd(pmd, desc, ident, include_block);
return BuildAndAdd(pmd, desc, include_block);
}
auto &include_status = std::get<2>(cache_tuple);
if (include_status.size() != include_block.size())
return BuildAndAdd(pmd, desc, include_block);
for (int i = 0; i < include_block.size(); ++i) {
if (include_status[i] != include_block[i])
return BuildAndAdd(pmd, desc, include_block);
}
// Cached version is not stale, so just return a reference to it
return pack_map[ident].first;
return std::get<0>(cache_tuple);
}
return BuildAndAdd(pmd, desc, ident, include_block);
return BuildAndAdd(pmd, desc, include_block);
}
template SparsePackBase &SparsePackCache::Get<MeshData<Real>>(MeshData<Real> *,
const PackDescriptor &,
Expand All @@ -301,37 +308,17 @@ SparsePackCache::Get<MeshBlockData<Real>>(MeshBlockData<Real> *, const PackDescr

template <class T>
SparsePackBase &SparsePackCache::BuildAndAdd(T *pmd, const PackDescriptor &desc,
const std::string &ident,
const std::vector<bool> &include_block) {
if (pack_map.count(ident) > 0) pack_map.erase(ident);
pack_map[ident] = {SparsePackBase::Build(pmd, desc, include_block),
SparsePackBase::GetAllocStatus(pmd, desc, include_block)};
return pack_map[ident].first;
if (pack_map.count(desc.identifier) > 0) pack_map.erase(desc.identifier);
pack_map[desc.identifier] = {SparsePackBase::Build(pmd, desc, include_block),
SparsePackBase::GetAllocStatus(pmd, desc, include_block),
include_block};
return std::get<0>(pack_map[desc.identifier]);
}
template SparsePackBase &
SparsePackCache::BuildAndAdd<MeshData<Real>>(MeshData<Real> *, const PackDescriptor &,
const std::string &,
const std::vector<bool> &);
template SparsePackBase &SparsePackCache::BuildAndAdd<MeshBlockData<Real>>(
MeshBlockData<Real> *, const PackDescriptor &, const std::string &,
const std::vector<bool> &);

std::string SparsePackCache::GetIdentifier(const PackDescriptor &desc,
const std::vector<bool> &include_block) const {
std::string identifier("");
for (const auto &vgroup : desc.var_groups) {
for (const auto &[vid, uid] : vgroup) {
identifier += std::to_string(uid) + "_";
}
identifier += "|";
}
identifier += std::to_string(desc.with_fluxes);
identifier += std::to_string(desc.coarse);
identifier += std::to_string(desc.flat);
for (const auto b : include_block) {
identifier += std::to_string(b);
}
return identifier;
}
MeshBlockData<Real> *, const PackDescriptor &, const std::vector<bool> &);

} // namespace parthenon
26 changes: 20 additions & 6 deletions src/interface/sparse_pack_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class SparsePackBase {
friend class SparsePackCache;

using alloc_t = std::vector<int>;
using include_t = std::vector<bool>;
using pack_t = ParArray3D<ParArray3D<Real, VariableState>>;
using bounds_t = ParArray3D<int>;
using bounds_h_t = typename ParArray3D<int>::HostMirror;
Expand Down Expand Up @@ -114,12 +115,10 @@ class SparsePackCache {

template <class T>
SparsePackBase &BuildAndAdd(T *pmd, const impl::PackDescriptor &desc,
const std::string &ident,
const std::vector<bool> &include_block);

std::string GetIdentifier(const impl::PackDescriptor &desc,
const std::vector<bool> &include_block) const;
std::unordered_map<std::string, std::pair<SparsePackBase, SparsePackBase::alloc_t>>
std::unordered_map<std::string, std::tuple<SparsePackBase, SparsePackBase::alloc_t,
SparsePackBase::include_t>>
pack_map;

friend class SparsePackBase;
Expand All @@ -136,15 +135,16 @@ struct PackDescriptor {
// default constructor needed for certain use cases
PackDescriptor()
: nvar_groups(0), var_group_names({}), var_groups({}), with_fluxes(false),
coarse(false), flat(false) {}
coarse(false), flat(false), identifier("") {}

template <class GROUP_t, class SELECTOR_t>
PackDescriptor(StateDescriptor *psd, const std::vector<GROUP_t> &var_groups_in,
const SELECTOR_t &selector, const std::set<PDOpt> &options)
: nvar_groups(var_groups_in.size()), var_group_names(MakeGroupNames(var_groups_in)),
var_groups(BuildUids(var_groups_in.size(), psd, selector)),
with_fluxes(options.count(PDOpt::WithFluxes)),
coarse(options.count(PDOpt::Coarse)), flat(options.count(PDOpt::Flatten)) {
coarse(options.count(PDOpt::Coarse)), flat(options.count(PDOpt::Flatten)),
identifier(GetIdentifier()) {
PARTHENON_REQUIRE(!(with_fluxes && coarse),
"Probably shouldn't be making a coarse pack with fine fluxes.");
}
Expand All @@ -155,8 +155,22 @@ struct PackDescriptor {
const bool with_fluxes;
const bool coarse;
const bool flat;
const std::string identifier;

private:
std::string GetIdentifier() {
std::string ident("");
for (const auto &vgroup : var_groups) {
for (const auto &[vid, uid] : vgroup) {
ident += std::to_string(uid) + "_";
}
ident += "|";
}
ident += std::to_string(with_fluxes);
ident += std::to_string(coarse);
ident += std::to_string(flat);
return ident;
}
template <class FUNC_t>
std::vector<PackDescriptor::VariableGroup_t>
BuildUids(int nvgs, const StateDescriptor *const psd, const FUNC_t &selector) {
Expand Down
11 changes: 7 additions & 4 deletions src/interface/variable.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//========================================================================================
// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved.
// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved.
//
// This program was produced under U.S. Government contract 89233218CNA000001 for Los
// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
Expand Down Expand Up @@ -81,7 +81,8 @@ void Variable<T>::CopyFluxesAndBdryVar(const Variable<T> *src) {
}
}

if (IsSet(Metadata::FillGhost) || IsSet(Metadata::Independent)) {
if (IsSet(Metadata::FillGhost) || IsSet(Metadata::Independent) ||
IsSet(Metadata::ForceRemeshComm)) {
// no need to check mesh->multilevel, if false, we're just making a shallow copy of
// an empty ParArrayND
coarse_s = src->coarse_s;
Expand Down Expand Up @@ -172,7 +173,8 @@ void Variable<T>::AllocateFluxesAndCoarse(std::weak_ptr<MeshBlock> wpmb) {
}

// Create the boundary object
if (IsSet(Metadata::FillGhost) || IsSet(Metadata::Independent)) {
if (IsSet(Metadata::FillGhost) || IsSet(Metadata::Independent) ||
IsSet(Metadata::ForceRemeshComm)) {
if (wpmb.expired()) return;
std::shared_ptr<MeshBlock> pmb = wpmb.lock();

Expand Down Expand Up @@ -205,7 +207,8 @@ std::int64_t Variable<T>::Deallocate() {
}
}

if (IsSet(Metadata::FillGhost) || IsSet(Metadata::Independent)) {
if (IsSet(Metadata::FillGhost) || IsSet(Metadata::Independent) ||
IsSet(Metadata::ForceRemeshComm)) {
mem_size += coarse_s.size() * sizeof(T);
coarse_s.Reset();
}
Expand Down
44 changes: 35 additions & 9 deletions src/mesh/amr_loadbalance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -702,13 +702,18 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
oldtonew[mb_idx] = ntot - 1;

current_level = 0;
std::unordered_set<LogicalLocation> newly_refined;
for (int n = 0; n < ntot; n++) {
// "on" = "old n" = "old gid" = "old global MeshBlock ID"
int on = newtoold[n];
if (newloc[n].level() > current_level) // set the current max level
current_level = newloc[n].level();
if (newloc[n].level() >= loclist[on].level()) { // same or refined
newcost[n] = costlist[on];
// Keep a list of all blocks refined for below
if (newloc[n].level() > loclist[on].level()) {
newly_refined.insert(newloc[n]);
}
} else {
double acost = 0.0;
for (int l = 0; l < nleaf; l++)
Expand Down Expand Up @@ -917,28 +922,49 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput
}
}
prolongation_cache.CopyToDevice();

refinement::ProlongateShared(resolved_packages.get(), prolongation_cache,
block_list[0]->cellbounds, block_list[0]->c_cellbounds);
refinement::ProlongateInternal(resolved_packages.get(), prolongation_cache,
block_list[0]->cellbounds, block_list[0]->c_cellbounds);

// update the lists
loclist = std::move(newloc);
ranklist = std::move(newrank);
costlist = std::move(newcost);

// A block newly refined and prolongated may have neighbors which were
// already refined to the new level.
// If so, the prolongated versions of shared elements will not reflect
// the true, finer versions present in the neighbor block.
// We must create any new fine buffers and fill them from these neighbors
// in order to maintain a consistent global state.
// Thus we rebuild and synchronize the mesh now, but using a unique
// neighbor precedence favoring the "old" fine blocks over "new" ones
for (auto &pmb : block_list) {
pmb->pbval->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data(),
newly_refined);
}
// Make sure all old sends/receives are done before we reconfigure the mesh
#ifdef MPI_PARALLEL
if (send_reqs.size() != 0)
PARTHENON_MPI_CHECK(
MPI_Waitall(send_reqs.size(), send_reqs.data(), MPI_STATUSES_IGNORE));
#endif
Kokkos::Profiling::popRegion(); // AMR: Recv data and unpack
// Re-initialize the mesh with our temporary ownership/neighbor configurations.
// No buffers are different when we switch to the final precedence order.
Initialize(false, pin, app_in);

// update the lists
loclist = std::move(newloc);
ranklist = std::move(newrank);
costlist = std::move(newcost);
// Internal refinement relies on the fine shared values, which are only consistent after
// being updated with any previously fine versions
refinement::ProlongateInternal(resolved_packages.get(), prolongation_cache,
block_list[0]->cellbounds, block_list[0]->c_cellbounds);

// re-initialize the MeshBlocks
// Rebuild just the ownership model, this time weighting the "new" fine blocks just like
// any other blocks at their level.
for (auto &pmb : block_list) {
pmb->pbval->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data());
}
Initialize(false, pin, app_in);

Kokkos::Profiling::popRegion(); // AMR: Recv data and unpack

ResetLoadBalanceVariables();

Expand Down
Loading

0 comments on commit 5be9d17

Please sign in to comment.