Skip to content

Commit

Permalink
Merge pull request #1073 from parthenon-hpc-lab/lroberts36/fix-restar…
Browse files Browse the repository at this point in the history
…t-state-bug

Make AMR and sparse restarts bitwise exact
  • Loading branch information
lroberts36 authored May 15, 2024
2 parents 4049c6f + 3509198 commit dfda73b
Show file tree
Hide file tree
Showing 15 changed files with 70 additions and 22 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
- [[PR 1004]](https://github.com/parthenon-hpc-lab/parthenon/pull/1004) Allow parameter modification from an input file for restarts

### Fixed (not changing behavior/API/variables/...)
- [[PR 1073]](https://github.com/parthenon-hpc-lab/parthenon/pull/1073) Fix bug in AMR and sparse restarts
- [[PR 1070]](https://github.com/parthenon-hpc-lab/parthenon/pull/1070) Correctly exclude flux vars from searches by default
- [[PR 1049]](https://github.com/parthenon-hpc-lab/parthenon/pull/1049) Catch task failures from threads
- [[PR 1058]](https://github.com/parthenon-hpc-lab/parthenon/pull/1058) Vector history not being output if no scalar history present
Expand Down
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@ include(cmake/Format.cmake)
include(cmake/Lint.cmake)

# regression test reference data
set(REGRESSION_GOLD_STANDARD_VER 22 CACHE STRING "Version of gold standard to download and use")
set(REGRESSION_GOLD_STANDARD_VER 23 CACHE STRING "Version of gold standard to download and use")
set(REGRESSION_GOLD_STANDARD_HASH
"SHA512=c64f34b6841569c74c3918aa33d5c1fe6795b6ba124e25be7184c48f589ebe634062955e97969e68d252e59d8d0fce5d65e32b4252e2c2537f9f49ebfdf37ee0"
"SHA512=bb070f78ae0ecd65bd662f670eee60b4414804770b5041867652d9b5a8e411c59612457499a532068b2584acaa6d120ceb0db96bfde196a9cd129a6246b76fb3"
CACHE STRING "Hash of default gold standard file to download")
option(REGRESSION_GOLD_STANDARD_SYNC "Automatically sync gold standard files." ON)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,7 @@ def compare(
"Info",
"Params",
"SparseInfo",
"SparseDeallocCount",
"Input",
"Blocks",
]:
Expand Down
8 changes: 4 additions & 4 deletions src/mesh/mesh-amr_loadbalance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,14 +258,14 @@ MPI_Request SendSameToSame(int lid_recv, int dest_rank, Variable<Real> *var,
"AMR SameToSame communication requires blocks to have at least two ghost zones");
auto counter_subview = Kokkos::subview(var->data, std::make_pair(0, 2));
auto counter_subview_h = Kokkos::create_mirror_view(HostMemSpace(), counter_subview);
counter_subview_h(0) = pmb->pmr->DereferenceCount();
counter_subview_h(0) = pmb->pmr->DerefinementCount();
counter_subview_h(1) = var->dealloc_count;
Kokkos::deep_copy(counter_subview, counter_subview_h);

PARTHENON_MPI_CHECK(MPI_Isend(var->data.data(), var->data.size(), MPI_PARTHENON_REAL,
dest_rank, tag, comm, &req));
} else {
var->com_state[0] = pmb->pmr->DereferenceCount();
var->com_state[0] = pmb->pmr->DerefinementCount();
var->com_state[1] = var->dealloc_count;
PARTHENON_MPI_CHECK(
MPI_Isend(var->com_state, 2, MPI_INT, dest_rank, tag, comm, &req));
Expand All @@ -291,15 +291,15 @@ bool TryRecvSameToSame(int lid_recv, int send_rank, Variable<Real> *var, MeshBlo
auto counter_subview = Kokkos::subview(var->data, std::make_pair(0, 2));
auto counter_subview_h =
Kokkos::create_mirror_view_and_copy(HostMemSpace(), counter_subview);
pmb->pmr->DereferenceCount() = counter_subview_h(0);
pmb->pmr->DerefinementCount() = counter_subview_h(0);
var->dealloc_count = counter_subview_h(1);
} else {
if (pmb->IsAllocated(var->label()) &&
!var->metadata().IsSet(Metadata::ForceAllocOnNewBlocks))
pmb->DeallocateSparse(var->label());
PARTHENON_MPI_CHECK(
MPI_Recv(var->com_state, 2, MPI_INT, send_rank, tag, comm, MPI_STATUS_IGNORE));
pmb->pmr->DereferenceCount() = var->com_state[0];
pmb->pmr->DerefinementCount() = var->com_state[0];
var->dealloc_count = var->com_state[1];
}
}
Expand Down
8 changes: 5 additions & 3 deletions src/mesh/mesh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -599,12 +599,11 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, RestartReader &rr,
auto locLevelGidLidCnghostGflag = mesh_info.level_gid_lid_cnghost_gflag;
current_level = -1;
for (int i = 0; i < nbtotal; i++) {
loclist[i] = LogicalLocation(locLevelGidLidCnghostGflag[5 * i], lx123[3 * i],
lx123[3 * i + 1], lx123[3 * i + 2]);
loclist[i] = LogicalLocation(locLevelGidLidCnghostGflag[NumIDsAndFlags * i],
lx123[3 * i], lx123[3 * i + 1], lx123[3 * i + 2]);
}

// rebuild the Block Tree

for (int i = 0; i < nbtotal; i++) {
forest.AddMeshBlock(forest.GetForestLocationFromLegacyTreeLocation(loclist[i]),
false);
Expand Down Expand Up @@ -693,6 +692,9 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, RestartReader &rr,
block_list[i - nbs] =
MeshBlock::Make(i, i - nbs, loclist[i], block_size, block_bcs, this, pin, app_in,
packages, resolved_packages, gflag, costlist[i]);
if (block_list[i - nbs]->pmr)
block_list[i - nbs]->pmr->DerefinementCount() =
locLevelGidLidCnghostGflag[NumIDsAndFlags * i + 5];
}
BuildGMGBlockLists(pin, app_in);
SetMeshBlockNeighbors(GridIdentifier::leaf(), block_list, ranklist);
Expand Down
2 changes: 1 addition & 1 deletion src/mesh/mesh_refinement.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class MeshRefinement {
// be made public
Coordinates_t GetCoarseCoords() const { return coarse_coords; }

int &DereferenceCount() { return deref_count_; }
// Mutable accessor for the derefinement counter: returns a reference to deref_count_,
// so callers can both read the current value and assign a new one through it.
int &DerefinementCount() { return deref_count_; }

private:
// data
Expand Down
4 changes: 3 additions & 1 deletion src/outputs/output_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "interface/swarm_container.hpp"
#include "interface/variable.hpp"
#include "mesh/mesh.hpp"
#include "mesh/mesh_refinement.hpp"
#include "mesh/meshblock.hpp"
#include "outputs/output_utils.hpp"

Expand Down Expand Up @@ -231,13 +232,14 @@ std::vector<int64_t> ComputeLocs(Mesh *pm) {

// Collects per-block integer metadata for the blocks of pm into one flat vector by
// handing FlattenBlockInfo<int> a callback that is invoked with each MeshBlock.
// For every block the callback appends, in this order: the level of the block's legacy
// tree location, gid, lid, cnghost, gflag, and the derefinement count.
std::vector<int> ComputeIDsAndFlags(Mesh *pm) {
return FlattenBlockInfo<int>(
pm, 5, [=](MeshBlock *pmb, std::vector<int> &data, int &i) {
pm, 6, [=](MeshBlock *pmb, std::vector<int> &data, int &i) {
// The level is taken from the legacy tree location, not from pmb->loc directly.
auto loc = pmb->pmy_mesh->Forest().GetLegacyTreeLocation(pmb->loc);
data[i++] = loc.level();
data[i++] = pmb->gid;
data[i++] = pmb->lid;
data[i++] = pmb->cnghost;
data[i++] = pmb->gflag;
// Blocks with a null pmr (no MeshRefinement object) record a count of 0.
data[i++] = pmb->pmr ? pmb->pmr->DerefinementCount() : 0;
});
}

Expand Down
1 change: 1 addition & 0 deletions src/outputs/outputs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ class PHDF5Output : public OutputType {
void WriteLevelsAndLocs_(Mesh *pm, hid_t file, const HDF5::H5P &pl, hsize_t offset,
hsize_t max_blocks_global) const;
void WriteSparseInfo_(Mesh *pm, hbool_t *sparse_allocated,
const std::vector<int> &dealloc_count,
const std::vector<std::string> &sparse_names, hsize_t num_sparse,
hid_t file, const HDF5::H5P &pl, size_t offset,
hsize_t max_blocks_global) const;
Expand Down
19 changes: 13 additions & 6 deletions src/outputs/parthenon_hdf5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "outputs/outputs.hpp"
#include "outputs/parthenon_hdf5.hpp"
#include "outputs/parthenon_xdmf.hpp"
#include "outputs/restart.hpp"
#include "utils/string_utils.hpp"

namespace parthenon {
Expand Down Expand Up @@ -297,6 +298,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
// can't use std::vector here because std::vector<hbool_t> is the same as
// std::vector<bool> and it doesn't have .data() member
std::unique_ptr<hbool_t[]> sparse_allocated(new hbool_t[num_blocks_local * num_sparse]);
std::vector<int> sparse_dealloc_count(num_blocks_local * num_sparse);

// allocate space for largest size variable
int varSize_max = 0;
Expand Down Expand Up @@ -356,7 +358,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
for (size_t b_idx = 0; b_idx < num_blocks_local; ++b_idx) {
const auto &pmb = pm->block_list[b_idx];
bool is_allocated = false;

int dealloc_count = 0;
// for each variable that this local meshblock actually has
const auto vars = get_vars(pmb);
for (auto &v : vars) {
Expand All @@ -369,15 +371,16 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
[&](auto index, int topo, int t, int u, int v, int k, int j, int i) {
tmpData[index] = static_cast<OutT>(v_h(topo, t, u, v, k, j, i));
});

is_allocated = true;
dealloc_count = v->dealloc_count;
break;
}
}

if (vinfo.is_sparse) {
size_t sparse_idx = sparse_field_idx.at(vinfo.label);
sparse_allocated[b_idx * num_sparse + sparse_idx] = is_allocated;
sparse_dealloc_count[b_idx * num_sparse + sparse_idx] = dealloc_count;
}

if (!is_allocated) {
Expand Down Expand Up @@ -441,8 +444,8 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm
// write SparseInfo and SparseFields (we can't write a zero-size dataset, so only write
// this if we have sparse fields)
if (num_sparse > 0) {
WriteSparseInfo_(pm, sparse_allocated.get(), sparse_names, num_sparse, file, pl_xfer,
my_offset, max_blocks_global);
WriteSparseInfo_(pm, sparse_allocated.get(), sparse_dealloc_count, sparse_names,
num_sparse, file, pl_xfer, my_offset, max_blocks_global);
} // SparseInfo and SparseFields sections

// -------------------------------------------------------------------------------- //
Expand Down Expand Up @@ -594,8 +597,8 @@ void PHDF5Output::WriteBlocksMetadata_(Mesh *pm, hid_t file, const HDF5::H5P &pl

{
// (LOC.)level, GID, LID, cnghost, gflag, derefinement count
hsize_t loc_cnt[2] = {num_blocks_local, 5};
hsize_t glob_cnt[2] = {max_blocks_global, 5};
hsize_t loc_cnt[2] = {num_blocks_local, NumIDsAndFlags};
hsize_t glob_cnt[2] = {max_blocks_global, NumIDsAndFlags};
std::vector<int> tmpID = OutputUtils::ComputeIDsAndFlags(pm);
HDF5Write2D(gBlocks, "loc.level-gid-lid-cnghost-gflag", tmpID.data(), &loc_offset[0],
&loc_cnt[0], &glob_cnt[0], pl);
Expand Down Expand Up @@ -660,6 +663,7 @@ void PHDF5Output::WriteLevelsAndLocs_(Mesh *pm, hid_t file, const HDF5::H5P &pl,
}

void PHDF5Output::WriteSparseInfo_(Mesh *pm, hbool_t *sparse_allocated,
const std::vector<int> &dealloc_count,
const std::vector<std::string> &sparse_names,
hsize_t num_sparse, hid_t file, const HDF5::H5P &pl,
size_t offset, hsize_t max_blocks_global) const {
Expand All @@ -674,6 +678,9 @@ void PHDF5Output::WriteSparseInfo_(Mesh *pm, hbool_t *sparse_allocated,
HDF5Write2D(file, "SparseInfo", sparse_allocated, &loc_offset[0], &loc_cnt[0],
&glob_cnt[0], pl);

HDF5Write2D(file, "SparseDeallocCount", dealloc_count.data(), &loc_offset[0],
&loc_cnt[0], &glob_cnt[0], pl);

// write names of sparse fields as attribute, first convert to vector of const char*
std::vector<const char *> names(num_sparse);
for (size_t i = 0; i < num_sparse; ++i)
Expand Down
15 changes: 15 additions & 0 deletions src/outputs/restart.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ namespace parthenon {
class Mesh;
class Param;

// Number of integer entries stored per block in the block ID/flag metadata
// (level, gid, lid, cnghost, gflag, derefinement count). Used as the per-block stride
// when that flat array is indexed and as the column count when it is written.
// NOTE(review): OutputUtils::ComputeIDsAndFlags passes the per-block count as a literal;
// keep the two in sync when adding or removing an entry.
constexpr int NumIDsAndFlags{6};

class RestartReader {
public:
RestartReader() = default;
Expand All @@ -50,6 +52,8 @@ class RestartReader {
// std::vector<bool> and it doesn't have .data() member
std::unique_ptr<bool[]> allocated;

std::vector<int> dealloc_count;

int num_blocks = 0;
int num_sparse = 0;

Expand All @@ -63,6 +67,17 @@ class RestartReader {

return allocated[block * num_sparse + sparse_field_idx];
}

// Returns the deallocation counter stored for sparse field `sparse_field_idx` on block
// `block`. Counters are laid out block-major: the entry for (block, field) lives at
// block * num_sparse + sparse_field_idx.
// Throws (via PARTHENON_REQUIRE_THROWS) if no counters are present, if either index is
// outside [0, num_blocks) / [0, num_sparse), or if dealloc_count holds fewer entries
// than the requested position.
int DeallocCount(int block, int sparse_field_idx) const {
  // Guard the container that is actually read below. `allocated` being non-null says
  // nothing about whether dealloc_count was filled in by the reader.
  PARTHENON_REQUIRE_THROWS(!dealloc_count.empty(),
                           "Tried to get deallocation count but no data present");
  PARTHENON_REQUIRE_THROWS((block >= 0) && (block < num_blocks),
                           "Invalid block index in SparseInfo:: DeallocCount");
  PARTHENON_REQUIRE_THROWS((sparse_field_idx >= 0) && (sparse_field_idx < num_sparse),
                           "Invalid sparse field index in SparseInfo:: DeallocCount");

  // Compute the flat index in the container's size type so the product cannot overflow
  // int, then verify it against the real size before the unchecked operator[] access.
  using size_type = std::vector<int>::size_type;
  const size_type flat_idx =
      static_cast<size_type>(block) * static_cast<size_type>(num_sparse) +
      static_cast<size_type>(sparse_field_idx);
  PARTHENON_REQUIRE_THROWS(flat_idx < dealloc_count.size(),
                           "Deallocation count data is smaller than expected in "
                           "SparseInfo::DeallocCount");

  return dealloc_count[flat_idx];
}
};

[[nodiscard]] virtual SparseInfo GetSparseInfo() const = 0;
Expand Down
7 changes: 7 additions & 0 deletions src/outputs/restart_hdf5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ RestartReaderHDF5::SparseInfo RestartReaderHDF5::GetSparseInfo() const {
// SparseInfo exists, read its contents
auto hdl = OpenDataset<bool>("SparseInfo");
PARTHENON_REQUIRE_THROWS(hdl.rank == 2, "SparseInfo expected to have rank 2");
auto hdl_dealloc = OpenDataset<int>("SparseDeallocCount");

info.labels = HDF5ReadAttributeVec<std::string>(hdl.dataset, "SparseFields");
info.num_sparse = static_cast<int>(info.labels.size());
Expand All @@ -102,6 +103,12 @@ RestartReaderHDF5::SparseInfo RestartReaderHDF5::GetSparseInfo() const {
// Read data from file
PARTHENON_HDF5_CHECK(H5Dread(hdl.dataset, hdl.type, memspace, hdl.dataspace,
H5P_DEFAULT, static_cast<void *>(info.allocated.get())));
info.dealloc_count.resize(hdl_dealloc.count);
PARTHENON_HDF5_CHECK(H5Dread(hdl_dealloc.dataset, hdl_dealloc.type,
H5S::FromHIDCheck(H5Screate_simple(
hdl_dealloc.rank, hdl_dealloc.dims.data(), NULL)),
hdl_dealloc.dataspace, H5P_DEFAULT,
static_cast<void *>(info.dealloc_count.data())));
}

return info;
Expand Down
4 changes: 4 additions & 0 deletions src/parthenon_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,10 @@ void ParthenonManager::RestartPackages(Mesh &rm, RestartReader &resfile) {
// check if the sparse variable is allocated on this block
if (sparse_info.IsAllocated(pmb->gid, sparse_idxs.at(label))) {
pmb->AllocateSparse(label);
auto dealloc_count = sparse_info.DeallocCount(pmb->gid, sparse_idxs.at(label));
// Warning: For this to work, it is required that the controlling variable is
// stored in the restart files.
pmb->meshblock_data.Get()->GetVarPtr(label)->dealloc_count = dealloc_count;
} else {
// nothing to read for this block, advance reading index
index += nCells * vlen;
Expand Down
2 changes: 1 addition & 1 deletion tst/regression/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ if (ENABLE_HDF5)
list(APPEND TEST_PROCS ${NUM_MPI_PROC_TESTING})
list(APPEND TEST_ARGS "--driver ${PROJECT_BINARY_DIR}/example/sparse_advection/sparse_advection-example \
--driver_input ${CMAKE_CURRENT_SOURCE_DIR}/test_suites/restart/parthinput.restart \
--num_steps 3")
--num_steps 4")
list(APPEND EXTRA_TEST_LABELS "")

# Calculate pi example
Expand Down
3 changes: 1 addition & 2 deletions tst/regression/test_suites/restart/parthinput.restart
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@
problem_id = restart

<parthenon/sparse>
dealloc_count = 999999 # disable this since deallocation counter resets on
# restart and thus leads to different results
dealloc_count = 5

<parthenon/mesh>
refinement = adaptive
Expand Down
13 changes: 11 additions & 2 deletions tst/regression/test_suites/restart/restart.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ def Prepare(self, parameters, step):
"-t",
"00:00:02",
]
# Test restarting on a step that should have non-zero
# derefinement counts on some blocks
elif step == 3:
parameters.driver_cmd_line_args = [
"-r",
"gold.out0.00009.rhdf",
"parthenon/job/problem_id=silver9",
]
# now restart from the walltime based output
else:
parameters.driver_cmd_line_args = [
Expand All @@ -67,11 +75,11 @@ def Analyse(self, parameters):

success = True

def compare_files(name):
def compare_files(name, base="silver"):
delta = compare(
[
"gold.out0.%s.rhdf" % name,
"silver.out0.%s.rhdf" % name,
"{}.out0.{}.rhdf".format(base, name),
],
one=True,
)
Expand All @@ -90,6 +98,7 @@ def compare_files(name):
success &= compare_files("00005")
success &= compare_files("00009")
success &= compare_files("final")
success &= compare_files("final", "silver9")

found_line = False
for line in parameters.stdouts[1].decode("utf-8").split("\n"):
Expand Down

0 comments on commit dfda73b

Please sign in to comment.