From 00c936c96d273da9b845c25481e5b3218ba1d4e8 Mon Sep 17 00:00:00 2001 From: Jonah Miller Date: Wed, 16 Oct 2024 08:56:23 -0600 Subject: [PATCH 01/13] bump Kokkos version --- external/Kokkos | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/Kokkos b/external/Kokkos index 62d2b6c879b74..e0dc0128e04f1 160000 --- a/external/Kokkos +++ b/external/Kokkos @@ -1 +1 @@ -Subproject commit 62d2b6c879b74b6ae7bd06eb3e5e80139c4708e6 +Subproject commit e0dc0128e04f18c2bbbaefceef3616e7ddcfa3c4 From 0f386249fec65bf2b4873711610ea51b30841341 Mon Sep 17 00:00:00 2001 From: Jonah Miller Date: Wed, 16 Oct 2024 08:57:38 -0600 Subject: [PATCH 02/13] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f046faabe759..71d100d528aa7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small perfomance upgrades ### Changed (changing behavior/API/variables/...) +- [[PR1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.2.01 - [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize - [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1 - [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option From 86783d434bd465209467c889290bf8a8494ecb61 Mon Sep 17 00:00:00 2001 From: Patrick Mullen Date: Thu, 17 Oct 2024 08:38:16 -0600 Subject: [PATCH 03/13] Update Kokkos version to 4.4.1 --- CHANGELOG.md | 3 +-- external/Kokkos | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71d100d528aa7..ff9137b4dfc75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,9 +11,8 @@ - [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small perfomance upgrades ### Changed (changing behavior/API/variables/...) -- [[PR1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.2.01 +- [[PR 1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.4.1 - [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize -- [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1 - [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option - [[PR 1172]](https://github.com/parthenon-hpc-lab/parthenon/pull/1172) Make parthenon manager robust against external MPI init and finalize calls diff --git a/external/Kokkos b/external/Kokkos index e0dc0128e04f1..15dc143e5f399 160000 --- a/external/Kokkos +++ b/external/Kokkos @@ -1 +1 @@ -Subproject commit e0dc0128e04f18c2bbbaefceef3616e7ddcfa3c4 +Subproject commit 15dc143e5f39949eece972a798e175c4b463d4b8 From b4ab05f5f3c059e6a9fa208a3101c6e47bd0d3aa Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Fri, 18 Oct 2024 15:34:01 +0200 Subject: [PATCH 04/13] Fix View of View dealloc --- src/bvals/comms/bnd_info.hpp | 2 +- src/bvals/comms/bvals_utils.hpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp index e6214ceba3226..8800f6fd867f3 100644 --- a/src/bvals/comms/bnd_info.hpp +++ b/src/bvals/comms/bnd_info.hpp @@ -127,7 +127,7 @@ struct ProResInfo { int GetBufferSize(MeshBlock *pmb, const NeighborBlock &nb, std::shared_ptr> v); -using BndInfoArr_t = ParArray1D; +using BndInfoArr_t = Kokkos::View; using BndInfoArrHost_t = typename BndInfoArr_t::HostMirror; using ProResInfoArr_t = ParArray1D; diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp index f185c1207747f..616d5cf21ee05 100644 --- a/src/bvals/comms/bvals_utils.hpp +++ b/src/bvals/comms/bvals_utils.hpp @@ -215,7 +215,8 @@ inline void RebuildBufferCache(std::shared_ptr> md, int nbound, using namespace loops; using namespace loops::shorthands; BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER); - cache.bnd_info = BndInfoArr_t("bnd_info", nbound); + cache.bnd_info = + BndInfoArr_t(Kokkos::view_alloc("bnd_info", Kokkos::SequentialHostInit), nbound); cache.bnd_info_h = Kokkos::create_mirror_view(cache.bnd_info); // prolongation/restriction sub-sets From e333a7b6c33a4ede8a2c0f9a435bbac85227b59b Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Mon, 21 Oct 2024 11:18:17 +0200 Subject: [PATCH 05/13] Move view_alloc to host --- src/bvals/comms/bvals_utils.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp index 616d5cf21ee05..8a47c716aa22c 100644 --- a/src/bvals/comms/bvals_utils.hpp +++ b/src/bvals/comms/bvals_utils.hpp @@ -215,9 +215,9 @@ inline void RebuildBufferCache(std::shared_ptr> md, int nbound, using namespace loops; using namespace loops::shorthands; BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER); - cache.bnd_info = - BndInfoArr_t(Kokkos::view_alloc("bnd_info", Kokkos::SequentialHostInit), nbound); - cache.bnd_info_h = Kokkos::create_mirror_view(cache.bnd_info); + cache.bnd_info = BndInfoArr_t("bnd_info", nbound); + cache.bnd_info_h = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), cache.bnd_info); // prolongation/restriction sub-sets // TODO(JMM): Right now I exclude fluxcorrection boundaries but if From 094880e12701ba8ee80e37e8a5b0ca52b5f74326 Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Mon, 21 Oct 2024 11:34:18 +0200 Subject: [PATCH 06/13] Temp disable rocthrust on amd ci container --- cmake/machinecfg/GitHubActions.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/machinecfg/GitHubActions.cmake b/cmake/machinecfg/GitHubActions.cmake index 663dcb38d6821..1adba870bdb20 100644 --- a/cmake/machinecfg/GitHubActions.cmake +++ b/cmake/machinecfg/GitHubActions.cmake @@ -19,6 +19,7 @@ message(STATUS "Loading machine configuration for GitHub Actions CI. ") # common options set(NUM_MPI_PROC_TESTING "2" CACHE STRING "CI runs tests with 2 MPI ranks") +set(Kokkos_ENABLE_ROCTHRUST OFF CACHE BOOL "Temporarily disabled as the container needs to be updated to the `-complete` base image.") set(MACHINE_CXX_FLAGS "") if (${MACHINE_VARIANT} MATCHES "cuda") From 0e0ef22cfc1540bc3156437fed5a4a903becc071 Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Mon, 21 Oct 2024 11:42:56 +0200 Subject: [PATCH 07/13] Fix doc creation. Rollback to ubuntu22-04 --- .github/workflows/docs.yml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 4b95511d82f58..7fb6e0165ff0e 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -12,7 +12,10 @@ on: jobs: docs: name: build and deploy docs - runs-on: ubuntu-latest + # not using latest due to issues with pip user packages, see + # https://github.com/actions/runner-images/issues/10781 and + # https://github.com/actions/runner-images/issues/10636 + runs-on: ubuntu-22.04 steps: - name: Checkout code @@ -23,9 +26,9 @@ jobs: run: export DEBIAN_FRONTEND=noninteractive - name: install dependencies run: | - pip install --break-system-packages sphinx - pip install --break-system-packages sphinx-rtd-theme - pip install --break-system-packages sphinx-multiversion + pip install sphinx + pip install sphinx-rtd-theme + pip install sphinx-multiversion - name: build docs run: | echo "Repo = ${GITHUB_REPOSITORY}" From 343b0092e838425514e357685066305ab208b9d1 Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Tue, 22 Oct 2024 12:02:45 +0200 Subject: [PATCH 08/13] Fix more view of views --- src/bvals/comms/bnd_info.cpp | 3 ++- src/interface/mesh_data.hpp | 3 ++- src/interface/sparse_pack_base.cpp | 6 ++++-- src/interface/swarm_comms.cpp | 1 + src/interface/swarm_pack_base.hpp | 6 ++++-- src/interface/variable_pack.hpp | 23 ++++++++++++++--------- src/parthenon_array_generic.hpp | 4 ++++ tst/unit/test_pararrays.cpp | 6 ++++-- 8 files changed, 35 insertions(+), 17 deletions(-) diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp index 736992260913f..54a6ae2b50fd3 100644 --- a/src/bvals/comms/bnd_info.cpp +++ b/src/bvals/comms/bnd_info.cpp @@ -41,7 +41,8 @@ namespace parthenon { void ProResCache_t::Initialize(int n_regions, StateDescriptor *pkg) { prores_info = ParArray1D("prores_info", n_regions); - prores_info_h = Kokkos::create_mirror_view(prores_info); + prores_info_h = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), prores_info); int nref_funcs = pkg->NumRefinementFuncs(); // Note that assignment of Kokkos views resets them, but // buffer_subset_sizes is a std::vector. It must be cleared, then diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp index 14ae3959c32ad..7d7d1cabcbe5a 100644 --- a/src/interface/mesh_data.hpp +++ b/src/interface/mesh_data.hpp @@ -150,7 +150,8 @@ const MeshBlockPack

&PackOnMesh(M &map, BlockDataList_t &block_data_, if (make_new_pack) { ParArray1D

packs("MeshData::PackVariables::packs", nblocks); - auto packs_host = Kokkos::create_mirror_view(packs); + auto packs_host = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), packs); for (size_t i = 0; i < nblocks; i++) { const auto &pack = packing_function(block_data_[i], this_map, this_key); diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp index 2a7a5b70c41c2..4ea5a558c3f52 100644 --- a/src/interface/sparse_pack_base.cpp +++ b/src/interface/sparse_pack_base.cpp @@ -152,7 +152,8 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc, leading_dim += 2; } pack.pack_ = pack_t("data_ptr", leading_dim, pack.nblocks_, max_size); - pack.pack_h_ = Kokkos::create_mirror_view(pack.pack_); + pack.pack_h_ = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_); // For non-flat packs, shape of pack is type x block x var x k x j x i // where type here might be a flux. @@ -168,7 +169,8 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc, pack.block_props_h_ = Kokkos::create_mirror_view(pack.block_props_); pack.coords_ = coords_t("coords", desc.flat ? max_size : nblocks); - auto coords_h = Kokkos::create_mirror_view(pack.coords_); + auto coords_h = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.coords_); // Fill the views int idx = 0; diff --git a/src/interface/swarm_comms.cpp b/src/interface/swarm_comms.cpp index 3ee4e2af75129..bb5755e2cbe87 100644 --- a/src/interface/swarm_comms.cpp +++ b/src/interface/swarm_comms.cpp @@ -157,6 +157,7 @@ void Swarm::SetupPersistentMPI() { } void Swarm::CountParticlesToSend_() { + // TODO(PG->BRR) What's going on here? Why is the mask being copied? auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_); auto swarm_d = GetDeviceContext(); auto pmb = GetBlockPointer(); diff --git a/src/interface/swarm_pack_base.hpp b/src/interface/swarm_pack_base.hpp index 0733aa51f329e..52a2c3c47fc7b 100644 --- a/src/interface/swarm_pack_base.hpp +++ b/src/interface/swarm_pack_base.hpp @@ -109,7 +109,8 @@ class SwarmPackBase { // Allocate the views int leading_dim = 1; pack.pack_ = pack_t("data_ptr", leading_dim, nblocks, max_size); - auto pack_h = Kokkos::create_mirror_view(pack.pack_); + auto pack_h = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_); pack.bounds_ = bounds_t("bounds", 2, nblocks, nvar); auto bounds_h = Kokkos::create_mirror_view(pack.bounds_); @@ -154,7 +155,8 @@ class SwarmPackBase { Kokkos::deep_copy(pack.bounds_, bounds_h); pack.contexts_ = contexts_t("contexts", nblocks); - pack.contexts_h_ = Kokkos::create_mirror_view(pack.contexts_); + pack.contexts_h_ = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.contexts_); pack.max_active_indices_ = max_active_indices_t("max_active_indices", nblocks); pack.flat_index_map_ = max_active_indices_t("flat_index_map", nblocks + 1); BuildSupplemental(pmd, desc, pack); diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp index 037731093ce13..07e56091ddfe6 100644 --- a/src/interface/variable_pack.hpp +++ b/src/interface/variable_pack.hpp @@ -570,10 +570,11 @@ void FillVarView(const VariableVector &vars, int vsize, bool coarse, assert(vsize == sparse_id_out.size()); assert(vsize == vector_component_out.size()); - auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out); - auto host_sp = Kokkos::create_mirror_view(Kokkos::HostSpace(), sparse_id_out); - auto host_vc = Kokkos::create_mirror_view(Kokkos::HostSpace(), vector_component_out); - auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), allocated_out); + auto host_cv = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), cv_out); + auto host_sp = Kokkos::create_mirror_view(sparse_id_out); + auto host_vc = Kokkos::create_mirror_view(vector_component_out); + auto host_al = Kokkos::create_mirror_view(allocated_out); int vindex = 0; for (const auto &v : vars) { @@ -634,7 +635,8 @@ void FillSwarmVarView(const vpack_types::SwarmVarList &vars, ViewOfParArrays1D &cv_out, PackIndexMap *pvmap) { using vpack_types::IndexPair; - auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out); + auto host_cv = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), cv_out); int vindex = 0; for (const auto v : vars) { @@ -675,10 +677,13 @@ void FillFluxViews(const VariableVector &vars, const int ndim, PackIndexMap *pvmap) { using vpack_types::IndexPair; - auto host_f1 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f1_out); - auto host_f2 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f2_out); - auto host_f3 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f3_out); - auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), flux_allocated_out); + auto host_f1 = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f1_out); + auto host_f2 = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f2_out); + auto host_f3 = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f3_out); + auto host_al = Kokkos::create_mirror_view(flux_allocated_out); int vindex = 0; for (const auto &v : vars) { diff --git a/src/parthenon_array_generic.hpp b/src/parthenon_array_generic.hpp index d527707f9070a..ac38b6cb5e5f3 100644 --- a/src/parthenon_array_generic.hpp +++ b/src/parthenon_array_generic.hpp @@ -221,6 +221,8 @@ class ParArrayGeneric : public State { // return GetDim(1) * GetDim(2) * GetDim(3) * GetDim(4) * GetDim(5) * GetDim(6); } + // TODO(PG?) Can we use concepts here to add a + // Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView? template auto GetMirror(MemSpace const &memspace) { auto mirror = Kokkos::create_mirror_view(memspace, data_); @@ -333,6 +335,8 @@ inline auto subview(std::index_sequence, return parthenon::ParArrayGeneric(v, arr); } +// TODO(PG?) Can we use concepts here to add a +// Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView? template inline auto create_mirror_view_and_copy(Space const &space, const parthenon::ParArrayGeneric &arr) { diff --git a/tst/unit/test_pararrays.cpp b/tst/unit/test_pararrays.cpp index e79927c13b20f..d281ce9e0c8d0 100644 --- a/tst/unit/test_pararrays.cpp +++ b/tst/unit/test_pararrays.cpp @@ -452,7 +452,8 @@ TEST_CASE("ParArray state", "[ParArrayND]") { GIVEN("An array of ParArrays filled with the values contained in their state") { parthenon::ParArray1D pack("test pack", NS); - auto pack_h = Kokkos::create_mirror_view(pack); + auto pack_h = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), pack); for (int b = 0; b < NS; ++b) { state_t state(static_cast(b)); @@ -544,7 +545,8 @@ TEST_CASE("Check registry pressure", "[ParArrayND][performance]") { new (&views[n]) view_3d_t(Kokkos::view_alloc(label, Kokkos::WithoutInitializing), N, N, N); auto a_h = arrays(n).GetHostMirror(); - auto v_h = Kokkos::create_mirror_view(views(n)); + auto v_h = Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), + views(n)); for (int k = 0; k < N; k++) { for (int j = 0; j < N; j++) { for (int i = 0; i < N; i++) { From 4999a80408910c191c7c0e7fdee5aaca73afe50d Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Tue, 22 Oct 2024 19:40:46 +0200 Subject: [PATCH 09/13] Chasing more ViewOfView on host --- src/interface/variable_pack.hpp | 19 +++++++++++-------- src/kokkos_abstraction.hpp | 13 +++++++++++++ tst/unit/test_pararrays.cpp | 2 +- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp index 07e56091ddfe6..76fce5f6b8616 100644 --- a/src/interface/variable_pack.hpp +++ b/src/interface/variable_pack.hpp @@ -244,10 +244,11 @@ class PackIndexMap { }; template -using ViewOfParArrays = ParArray1D>; +using ViewOfParArrays = + Kokkos::View *, LayoutWrapper, DevMemSpace>; template -using ViewOfParArrays1D = ParArray1D>; +using ViewOfParArrays1D = Kokkos::View *, LayoutWrapper, DevMemSpace>; // forward declaration template @@ -760,10 +761,11 @@ VariableFluxPack MakeFluxPack(const VarListWithKeys &var_list, } // make the outer view - ViewOfParArrays cv("MakeFluxPack::cv", vsize * (extra_components ? 3 : 1)); - ViewOfParArrays f1("MakeFluxPack::f1", fsize); - ViewOfParArrays f2("MakeFluxPack::f2", fsize); - ViewOfParArrays f3("MakeFluxPack::f3", fsize); + ViewOfParArrays cv(ViewOfViewAlloc("MakeFluxPack::cv"), + vsize * (extra_components ? 3 : 1)); + ViewOfParArrays f1(ViewOfViewAlloc("MakeFluxPack::f1"), fsize); + ViewOfParArrays f2(ViewOfViewAlloc("MakeFluxPack::f2"), fsize); + ViewOfParArrays f3(ViewOfViewAlloc("MakeFluxPack::f3"), fsize); ParArray1D flux_allocated("MakePack::allocated", fsize); ParArray1D sparse_id("MakeFluxPack::sparse_id", vsize); ParArray1D vector_component("MakeFluxPack::vector_component", vsize); @@ -814,7 +816,8 @@ VariablePack MakePack(const VarListWithKeys &var_list, bool coarse, } // make the outer view - ViewOfParArrays cv("MakePack::cv", vsize * (extra_components ? 3 : 1)); + ViewOfParArrays cv(ViewOfViewAlloc("MakePack::cv"), + vsize * (extra_components ? 3 : 1)); ParArray1D sparse_id("MakePack::sparse_id", vsize); ParArray1D vector_component("MakePack::vector_component", vsize); ParArray1D allocated("MakePack::allocated", vsize); @@ -847,7 +850,7 @@ SwarmVariablePack MakeSwarmPack(const vpack_types::SwarmVarList &vars, } // make the outer view - ViewOfParArrays1D cv("MakePack::cv", vsize); + ViewOfParArrays1D cv(ViewOfViewAlloc("MakePack::cv"), vsize); std::array cv_size{0, 0}; if (vsize > 0) { diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 8fa89f82e95e8..b5dfc3fa75a0f 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -1035,6 +1035,19 @@ par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int il, cons reduction); } +// For ViewOfView we need to call the destructor of the inner views on +// the host and not on the device (which would happen by default). +// Thus, we need to pass `SquentialHostInit` as allocator, but only if the ViewOfView is +// on the host. If the ViewOfViews in on the device, then `SequentialHostInit` should be +// passed when calling `create_mirror_view`. +auto ViewOfViewAlloc(const std::string &label) { + if constexpr (std::is_same_v) { + return Kokkos::view_alloc(Kokkos::SequentialHostInit, label); + } else { + return Kokkos::view_alloc(label); + } +} + // reused from kokoks/core/perf_test/PerfTest_ExecSpacePartitioning.cpp // commit a0d011fb30022362c61b3bb000ae3de6906cb6a7 template diff --git a/tst/unit/test_pararrays.cpp b/tst/unit/test_pararrays.cpp index d281ce9e0c8d0..9e1816b0e6690 100644 --- a/tst/unit/test_pararrays.cpp +++ b/tst/unit/test_pararrays.cpp @@ -451,7 +451,7 @@ TEST_CASE("ParArray state", "[ParArrayND]") { } GIVEN("An array of ParArrays filled with the values contained in their state") { - parthenon::ParArray1D pack("test pack", NS); + Kokkos::View pack(parthenon::ViewOfViewAlloc("test pack"), NS); auto pack_h = Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), pack); From c799d026bd226e3c175e04ff435e5c39744361ee Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Wed, 6 Nov 2024 16:32:37 +0100 Subject: [PATCH 10/13] Fix ViewAlloc func and add doc --- README.md | 2 +- doc/sphinx/src/development.rst | 28 ++++++++++++++++++++++++++++ src/kokkos_abstraction.hpp | 3 ++- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ed6a1bb05a169..f049c44ecd7d5 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Parthenon -- a performance portable block-structured adaptive mesh refinement fr * CMake 3.16 or greater * C++17 compatible compiler -* Kokkos 4.0.1 or greater +* Kokkos 4.1.1 or greater ## Optional (enabling features) diff --git a/doc/sphinx/src/development.rst b/doc/sphinx/src/development.rst index dbab91d8d5ce6..98ac9cef90a85 100644 --- a/doc/sphinx/src/development.rst +++ b/doc/sphinx/src/development.rst @@ -62,6 +62,34 @@ parallelism interface that is needed for managing memory cached in tightly nested loops. The wrappers are documented :ref:`here `. +View of Views +------------- + +Special care needs to be taken when working with a ``View`` of ``View``. + +To repeat the Kokkos documenation: `Don't use them `__ + +But if you have to (which is the case in some places inside Parthenon) +then follow this pattern: + +.. code:: c++ + + Kokkos::View *> view_of_pararrays(parthenon::ViewOfViewAlloc("myname"), 10); + +The ``ViewOfViewAlloc`` ensures that the ``Kokkos::SequentialHostInit`` property is added, +which results in the (inner ``View`` ) deallocators being called on the host (rather than on +the device by default). + +Similarly, when you create a host mirror of said ``View`` of ``View`` add the additional +property for the same reason. + +.. code:: c++ + + auto view_of_pararrays_h = + Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), view_of_pararrays); + +Note that the ``SequentialHostInit`` was only added in Kokkos 4.4.1 (which is now the default in Parthenon). + The need for reductions within function handling ``MeshBlock`` data ------------------------------------------------------------------- diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index b5dfc3fa75a0f..37262dd0356cc 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -1040,8 +1040,9 @@ par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int il, cons // Thus, we need to pass `SquentialHostInit` as allocator, but only if the ViewOfView is // on the host. If the ViewOfViews in on the device, then `SequentialHostInit` should be // passed when calling `create_mirror_view`. +template auto ViewOfViewAlloc(const std::string &label) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { return Kokkos::view_alloc(Kokkos::SequentialHostInit, label); } else { return Kokkos::view_alloc(label); From ebbcc1704ed2c68c99808cf099eef41396b2d5ce Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Thu, 7 Nov 2024 13:41:44 +0100 Subject: [PATCH 11/13] Disable Ascent in testing --- .github/workflows/ci-extended.yml | 48 +++++++++++++++---------------- .github/workflows/ci-short.yml | 12 ++++---- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml index 8ca646cfc2ebd..0c083091cf0e2 100644 --- a/.github/workflows/ci-extended.yml +++ b/.github/workflows/ci-extended.yml @@ -32,9 +32,9 @@ jobs: parallel: ['serial', 'mpi'] runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -91,27 +91,27 @@ jobs: ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600 # Test Ascent integration (only most complex setup with MPI and on device) - - name: Ascent tests - if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }} - run: | - cmake -B build-ascent \ - -DCMAKE_BUILD_TYPE=Release \ - -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \ - -DPARTHENON_ENABLE_ASCENT=ON \ - -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent - cmake --build build-ascent - cd example/advection/ - # Pick GPU with most available memory - export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }') - mpirun -np 2 ../../build-ascent/example/advection/advection-example \ - -i parthinput.advection \ - parthenon/output5/dt=0.05 \ - parthenon/time/tlim=0.1 - # check if file exists - if [ ! -f "ascent_render_57.png" ]; then - echo "'ascent_render_57.png' does not exist." - exit 1 - fi + # - name: Ascent tests + # if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }} + # run: | + # cmake -B build-ascent \ + # -DCMAKE_BUILD_TYPE=Release \ + # -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \ + # -DPARTHENON_ENABLE_ASCENT=ON \ + # -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent + # cmake --build build-ascent + # cd example/advection/ + # # Pick GPU with most available memory + # export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }') + # mpirun -np 2 ../../build-ascent/example/advection/advection-example \ + # -i parthinput.advection \ + # parthenon/output5/dt=0.05 \ + # parthenon/time/tlim=0.1 + # # check if file exists + # if [ ! -f "ascent_render_57.png" ]; then + # echo "'ascent_render_57.png' does not exist." + # exit 1 + # fi - uses: actions/upload-artifact@v3 with: @@ -120,7 +120,7 @@ jobs: build/CMakeFiles/CMakeOutput.log build/tst/regression/outputs/advection_convergence*/advection-errors.dat build/tst/regression/outputs/advection_convergence*/advection-errors.png - example/advection/ascent_render_57.png + # example/advection/ascent_render_57.png retention-days: 3 perf-and-regression-amdgpu: diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml index ecb4052411eea..ab42a7450c836 100644 --- a/.github/workflows/ci-short.yml +++ b/.github/workflows/ci-short.yml @@ -20,9 +20,9 @@ jobs: style: runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -45,9 +45,9 @@ jobs: device: ['cuda', 'host'] runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -77,9 +77,9 @@ jobs: device: ['cuda', 'host'] runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: From 11e1e8614d07c0d85c0a04d914fde33e6115dbac Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Thu, 7 Nov 2024 14:27:39 +0100 Subject: [PATCH 12/13] Revert "Disable Ascent in testing" This reverts commit ebbcc1704ed2c68c99808cf099eef41396b2d5ce. --- .github/workflows/ci-extended.yml | 48 +++++++++++++++---------------- .github/workflows/ci-short.yml | 12 ++++---- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml index 0c083091cf0e2..8ca646cfc2ebd 100644 --- a/.github/workflows/ci-extended.yml +++ b/.github/workflows/ci-extended.yml @@ -32,9 +32,9 @@ jobs: parallel: ['serial', 'mpi'] runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 + options: --user 1001 steps: - uses: actions/checkout@v3 with: @@ -91,27 +91,27 @@ jobs: ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600 # Test Ascent integration (only most complex setup with MPI and on device) - # - name: Ascent tests - # if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }} - # run: | - # cmake -B build-ascent \ - # -DCMAKE_BUILD_TYPE=Release \ - # -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \ - # -DPARTHENON_ENABLE_ASCENT=ON \ - # -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent - # cmake --build build-ascent - # cd example/advection/ - # # Pick GPU with most available memory - # export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }') - # mpirun -np 2 ../../build-ascent/example/advection/advection-example \ - # -i parthinput.advection \ - # parthenon/output5/dt=0.05 \ - # parthenon/time/tlim=0.1 - # # check if file exists - # if [ ! -f "ascent_render_57.png" ]; then - # echo "'ascent_render_57.png' does not exist." - # exit 1 - # fi + - name: Ascent tests + if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }} + run: | + cmake -B build-ascent \ + -DCMAKE_BUILD_TYPE=Release \ + -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \ + -DPARTHENON_ENABLE_ASCENT=ON \ + -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent + cmake --build build-ascent + cd example/advection/ + # Pick GPU with most available memory + export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }') + mpirun -np 2 ../../build-ascent/example/advection/advection-example \ + -i parthinput.advection \ + parthenon/output5/dt=0.05 \ + parthenon/time/tlim=0.1 + # check if file exists + if [ ! -f "ascent_render_57.png" ]; then + echo "'ascent_render_57.png' does not exist." + exit 1 + fi - uses: actions/upload-artifact@v3 with: @@ -120,7 +120,7 @@ jobs: build/CMakeFiles/CMakeOutput.log build/tst/regression/outputs/advection_convergence*/advection-errors.dat build/tst/regression/outputs/advection_convergence*/advection-errors.png - # example/advection/ascent_render_57.png + example/advection/ascent_render_57.png retention-days: 3 perf-and-regression-amdgpu: diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml index ab42a7450c836..ecb4052411eea 100644 --- a/.github/workflows/ci-short.yml +++ b/.github/workflows/ci-short.yml @@ -20,9 +20,9 @@ jobs: style: runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 + options: --user 1001 steps: - uses: actions/checkout@v3 with: @@ -45,9 +45,9 @@ jobs: device: ['cuda', 'host'] runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 + options: --user 1001 steps: - uses: actions/checkout@v3 with: @@ -77,9 +77,9 @@ jobs: device: ['cuda', 'host'] runs-on: [self-hosted, A100] container: - image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent + image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 + options: --user 1001 steps: - uses: actions/checkout@v3 with: From e632f6b7cbab5535f8db91eb67cb81fa49146513 Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Thu, 7 Nov 2024 14:31:04 +0100 Subject: [PATCH 13/13] Disable CUDA IPC --- .github/workflows/ci-extended.yml | 4 +++- .github/workflows/ci-short.yml | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml index 8ca646cfc2ebd..1dd39c782477c 100644 --- a/.github/workflows/ci-extended.yml +++ b/.github/workflows/ci-extended.yml @@ -21,6 +21,8 @@ env: CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build MACHINE_CFG: cmake/machinecfg/CI.cmake OMPI_MCA_mpi_common_cuda_event_max: 1000 + # CUDA IPC within docker repeated seem to cause issue on the CI machine + OMPI_MCA_btl_smcuda_use_cuda_ipc: 0 # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231 OMPI_MCA_btl_vader_single_copy_mechanism: none @@ -34,7 +36,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml index ecb4052411eea..7e0fd8bf759ac 100644 --- a/.github/workflows/ci-short.yml +++ b/.github/workflows/ci-short.yml @@ -13,6 +13,8 @@ env: CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build MACHINE_CFG: cmake/machinecfg/CI.cmake OMPI_MCA_mpi_common_cuda_event_max: 1000 + # CUDA IPC within docker repeated seem to cause issue on the CI machine + OMPI_MCA_btl_smcuda_use_cuda_ipc: 0 # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231 OMPI_MCA_btl_vader_single_copy_mechanism: none @@ -22,7 +24,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -47,7 +49,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: @@ -79,7 +81,7 @@ jobs: container: image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent # map to local user id on CI machine to allow writing to build cache - options: --user 1001 + options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728 steps: - uses: actions/checkout@v3 with: