Merge e632f6b into ddc6500

parthenon-hpc-lab · Nov 7, 2024 · 560dc46 · 560dc46
2 parents ddc6500 + e632f6b
commit 560dc46
Show file tree

Hide file tree

Showing 17 changed files with 113 additions and 45 deletions.
diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml
@@ -21,6 +21,8 @@ env:
   CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
   MACHINE_CFG: cmake/machinecfg/CI.cmake
   OMPI_MCA_mpi_common_cuda_event_max: 1000
+  # CUDA IPC within docker repeatedly seems to cause issues on the CI machine
+  OMPI_MCA_btl_smcuda_use_cuda_ipc: 0
   # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
   OMPI_MCA_btl_vader_single_copy_mechanism: none
 
@@ -34,7 +36,7 @@ jobs:
     container:
       image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:

diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml
@@ -13,6 +13,8 @@ env:
   CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
   MACHINE_CFG: cmake/machinecfg/CI.cmake
   OMPI_MCA_mpi_common_cuda_event_max: 1000
+  # CUDA IPC within docker repeatedly seems to cause issues on the CI machine
+  OMPI_MCA_btl_smcuda_use_cuda_ipc: 0
   # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
   OMPI_MCA_btl_vader_single_copy_mechanism: none
 
@@ -22,7 +24,7 @@ jobs:
     container:
       image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
@@ -47,7 +49,7 @@ jobs:
     container:
       image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
@@ -79,7 +81,7 @@ jobs:
     container:
       image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,18 +3,18 @@
 ## Current develop
 
 ### Added (new features/APIs/variables/...)
-- [[PR 1185]](https://github.com/parthenon-hpc-lab/parthenon/pull/1185/files) Bugfix to particle defragmentation
+- [[PR 1185]](https://github.com/parthenon-hpc-lab/parthenon/pull/1185) Bugfix to particle defragmentation
 - [[PR 1184]](https://github.com/parthenon-hpc-lab/parthenon/pull/1184) Fix swarm block neighbor indexing in 1D, 2D
 - [[PR 1183]](https://github.com/parthenon-hpc-lab/parthenon/pull/1183) Fix particle leapfrog example initialization data
 - [[PR 1179]](https://github.com/parthenon-hpc-lab/parthenon/pull/1179) Make a global variable for whether simulation is a restart
 - [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option
 - [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small perfomance upgrades
 
 ### Changed (changing behavior/API/variables/...)
+- [[PR 1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.4.1
 - [[PR1203]](https://github.com/parthenon-hpc-lab/parthenon/pull/1203) Pin Ubuntu CI image
-- [[PR1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag
+- [[PR 1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag
 - [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize
-- [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1
 - [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option
 - [[PR 1172]](https://github.com/parthenon-hpc-lab/parthenon/pull/1172) Make parthenon manager robust against external MPI init and finalize calls
 
@@ -31,7 +31,7 @@
 
 
 ### Incompatibilities (i.e. breaking changes)
-- [[PR1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag
+- [[PR 1177]](https://github.com/parthenon-hpc-lab/parthenon/pull/1177) Make mesh-level boundary conditions usable without the "user" flag
 
 ## Release 24.08
 Date: 2024-08-30
@@ -155,12 +155,12 @@ Date: 2024-03-21
 - [[PR 973]](https://github.com/parthenon-hpc-lab/parthenon/pull/973) Multigrid performance upgrades
 
 ### Fixed (not changing behavior/API/variables/...)
-- [[PR1023]](https://github.com/parthenon-hpc-lab/parthenon/pull/1023) Fix broken param of a scalar bool
-- [[PR1012]](https://github.com/parthenon-hpc-lab/parthenon/pull/1012) Remove accidentally duplicated code
-- [[PR992]](https://github.com/parthenon-hpc-lab/parthenon/pull/992) Allow custom PR ops with sparse pools
-- [[PR988]](https://github.com/parthenon-hpc-lab/parthenon/pull/988) Fix bug in neighbor finding routine for small, periodic, refined meshes
-- [[PR986]](https://github.com/parthenon-hpc-lab/parthenon/pull/986) Fix bug in sparse boundary communication BndInfo cacheing
-- [[PR978]](https://github.com/parthenon-hpc-lab/parthenon/pull/978) remove erroneous sparse check
+- [[PR 1023]](https://github.com/parthenon-hpc-lab/parthenon/pull/1023) Fix broken param of a scalar bool
+- [[PR 1012]](https://github.com/parthenon-hpc-lab/parthenon/pull/1012) Remove accidentally duplicated code
+- [[PR 992]](https://github.com/parthenon-hpc-lab/parthenon/pull/992) Allow custom PR ops with sparse pools
+- [[PR 988]](https://github.com/parthenon-hpc-lab/parthenon/pull/988) Fix bug in neighbor finding routine for small, periodic, refined meshes
+- [[PR 986]](https://github.com/parthenon-hpc-lab/parthenon/pull/986) Fix bug in sparse boundary communication BndInfo cacheing
+- [[PR 978]](https://github.com/parthenon-hpc-lab/parthenon/pull/978) remove erroneous sparse check
 
 ### Infrastructure (changes irrelevant to downstream codes)
 - [[PR 1027]](https://github.com/parthenon-hpc-lab/parthenon/pull/1027) Refactor RestartReader as abstract class
@@ -227,7 +227,7 @@ Date: 2023-11-16
 - [[PR 901]](https://github.com/parthenon-hpc-lab/parthenon/pull/901) Implement shared element ownership model
 
 ### Removed (removing behavior/API/varaibles/...)
-- [[PR 930](https://github.com/parthenon-hpc-lab/parthenon/pull/930) Remove ParthenonManager::ParthenonInit as it is error-prone and the split functions are the recommended usage.
+- [[PR 930]](https://github.com/parthenon-hpc-lab/parthenon/pull/930) Remove ParthenonManager::ParthenonInit as it is error-prone and the split functions are the recommended usage.
 
 
 ## Release 0.8.0

diff --git a/README.md b/README.md
@@ -30,7 +30,7 @@ Parthenon -- a performance portable block-structured adaptive mesh refinement fr
 
 * CMake 3.16 or greater
 * C++17 compatible compiler
-* Kokkos 4.0.1 or greater
+* Kokkos 4.4.1 or greater
 
 ## Optional (enabling features)
 

diff --git a/cmake/machinecfg/GitHubActions.cmake b/cmake/machinecfg/GitHubActions.cmake
@@ -19,6 +19,7 @@ message(STATUS "Loading machine configuration for GitHub Actions CI. ")
 
 # common options
 set(NUM_MPI_PROC_TESTING "2" CACHE STRING "CI runs tests with 2 MPI ranks")
+set(Kokkos_ENABLE_ROCTHRUST OFF CACHE BOOL "Temporarily disabled as the container needs to be updated to the `-complete` base image.")
 
 set(MACHINE_CXX_FLAGS "")
 if (${MACHINE_VARIANT} MATCHES "cuda")

diff --git a/doc/sphinx/src/development.rst b/doc/sphinx/src/development.rst
@@ -62,6 +62,34 @@ parallelism interface that is needed for managing memory cached in
 tightly nested loops. The wrappers are documented
 :ref:`here <nested par for>`.
 
+View of Views
+-------------
+
+Special care needs to be taken when working with a ``View`` of ``View``.
+
+To repeat the Kokkos documentation: `Don't use them <https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/View.html#can-i-make-a-view-of-views>`__
+
+But if you have to (which is the case in some places inside Parthenon)
+then follow this pattern:
+
+.. code:: c++
+
+   Kokkos::View<ParArray1D<Real> *> view_of_pararrays(parthenon::ViewOfViewAlloc("myname"), 10);
+
+The ``ViewOfViewAlloc`` adds the ``Kokkos::SequentialHostInit`` property when the outer
+``View`` is allocated in host memory, which results in the (inner ``View``) deallocators
+being called on the host (rather than on the device by default).
+
+Similarly, when you create a host mirror of said ``View`` of ``View`` add the additional
+property for the same reason.
+
+.. code:: c++
+
+   auto view_of_pararrays_h =
+        Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), view_of_pararrays);
+
+Note that the ``SequentialHostInit`` was only added in Kokkos 4.4.1 (which is now the default in Parthenon).
+
 The need for reductions within function handling ``MeshBlock`` data
 -------------------------------------------------------------------
 

diff --git a/external/Kokkos b/external/Kokkos
diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp
@@ -41,7 +41,8 @@ namespace parthenon {
 
 void ProResCache_t::Initialize(int n_regions, StateDescriptor *pkg) {
   prores_info = ParArray1D<ProResInfo>("prores_info", n_regions);
-  prores_info_h = Kokkos::create_mirror_view(prores_info);
+  prores_info_h = Kokkos::create_mirror_view(
+      Kokkos::view_alloc(Kokkos::SequentialHostInit), prores_info);
   int nref_funcs = pkg->NumRefinementFuncs();
   // Note that assignment of Kokkos views resets them, but
   // buffer_subset_sizes is a std::vector. It must be cleared, then

diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp
@@ -127,7 +127,7 @@ struct ProResInfo {
 int GetBufferSize(MeshBlock *pmb, const NeighborBlock &nb,
                   std::shared_ptr<Variable<Real>> v);
 
-using BndInfoArr_t = ParArray1D<BndInfo>;
+using BndInfoArr_t = Kokkos::View<BndInfo *, LayoutWrapper, DevMemSpace>;
 using BndInfoArrHost_t = typename BndInfoArr_t::HostMirror;
 
 using ProResInfoArr_t = ParArray1D<ProResInfo>;

diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp
@@ -216,7 +216,8 @@ inline void RebuildBufferCache(std::shared_ptr<MeshData<Real>> md, int nbound,
   using namespace loops::shorthands;
   BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER);
   cache.bnd_info = BndInfoArr_t("bnd_info", nbound);
-  cache.bnd_info_h = Kokkos::create_mirror_view(cache.bnd_info);
+  cache.bnd_info_h = Kokkos::create_mirror_view(
+      Kokkos::view_alloc(Kokkos::SequentialHostInit), cache.bnd_info);
 
   // prolongation/restriction sub-sets
   // TODO(JMM): Right now I exclude fluxcorrection boundaries but if

diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp
@@ -150,7 +150,8 @@ const MeshBlockPack<P> &PackOnMesh(M &map, BlockDataList_t<Real> &block_data_,
 
   if (make_new_pack) {
     ParArray1D<P> packs("MeshData::PackVariables::packs", nblocks);
-    auto packs_host = Kokkos::create_mirror_view(packs);
+    auto packs_host =
+        Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), packs);
 
     for (size_t i = 0; i < nblocks; i++) {
       const auto &pack = packing_function(block_data_[i], this_map, this_key);

diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp
@@ -152,7 +152,8 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
     leading_dim += 2;
   }
   pack.pack_ = pack_t("data_ptr", leading_dim, pack.nblocks_, max_size);
-  pack.pack_h_ = Kokkos::create_mirror_view(pack.pack_);
+  pack.pack_h_ = Kokkos::create_mirror_view(
+      Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_);
 
   // For non-flat packs, shape of pack is type x block x var x k x j x i
   // where type here might be a flux.
@@ -168,7 +169,8 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
   pack.block_props_h_ = Kokkos::create_mirror_view(pack.block_props_);
 
   pack.coords_ = coords_t("coords", desc.flat ? max_size : nblocks);
-  auto coords_h = Kokkos::create_mirror_view(pack.coords_);
+  auto coords_h = Kokkos::create_mirror_view(
+      Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.coords_);
 
   // Fill the views
   int idx = 0;

diff --git a/src/interface/swarm_pack_base.hpp b/src/interface/swarm_pack_base.hpp
@@ -109,7 +109,8 @@ class SwarmPackBase {
     // Allocate the views
     int leading_dim = 1;
     pack.pack_ = pack_t("data_ptr", leading_dim, nblocks, max_size);
-    auto pack_h = Kokkos::create_mirror_view(pack.pack_);
+    auto pack_h = Kokkos::create_mirror_view(
+        Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_);
 
     pack.bounds_ = bounds_t("bounds", 2, nblocks, nvar);
     auto bounds_h = Kokkos::create_mirror_view(pack.bounds_);
@@ -154,7 +155,8 @@ class SwarmPackBase {
     Kokkos::deep_copy(pack.bounds_, bounds_h);
 
     pack.contexts_ = contexts_t("contexts", nblocks);
-    pack.contexts_h_ = Kokkos::create_mirror_view(pack.contexts_);
+    pack.contexts_h_ = Kokkos::create_mirror_view(
+        Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.contexts_);
     pack.max_active_indices_ = max_active_indices_t("max_active_indices", nblocks);
     pack.flat_index_map_ = max_active_indices_t("flat_index_map", nblocks + 1);
     BuildSupplemental(pmd, desc, pack);

diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp
@@ -244,10 +244,11 @@ class PackIndexMap {
 };
 
 template <typename T>
-using ViewOfParArrays = ParArray1D<ParArray3D<T, VariableState>>;
+using ViewOfParArrays =
+    Kokkos::View<ParArray3D<T, VariableState> *, LayoutWrapper, DevMemSpace>;
 
 template <typename T>
-using ViewOfParArrays1D = ParArray1D<ParArray1D<T>>;
+using ViewOfParArrays1D = Kokkos::View<ParArray1D<T> *, LayoutWrapper, DevMemSpace>;
 
 // forward declaration
 template <typename T>
@@ -570,10 +571,11 @@ void FillVarView(const VariableVector<T> &vars, int vsize, bool coarse,
   assert(vsize == sparse_id_out.size());
   assert(vsize == vector_component_out.size());
 
-  auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out);
-  auto host_sp = Kokkos::create_mirror_view(Kokkos::HostSpace(), sparse_id_out);
-  auto host_vc = Kokkos::create_mirror_view(Kokkos::HostSpace(), vector_component_out);
-  auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), allocated_out);
+  auto host_cv =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), cv_out);
+  auto host_sp = Kokkos::create_mirror_view(sparse_id_out);
+  auto host_vc = Kokkos::create_mirror_view(vector_component_out);
+  auto host_al = Kokkos::create_mirror_view(allocated_out);
 
   int vindex = 0;
   for (const auto &v : vars) {
@@ -634,7 +636,8 @@ void FillSwarmVarView(const vpack_types::SwarmVarList<T> &vars,
                       ViewOfParArrays1D<T> &cv_out, PackIndexMap *pvmap) {
   using vpack_types::IndexPair;
 
-  auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out);
+  auto host_cv =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), cv_out);
 
   int vindex = 0;
   for (const auto v : vars) {
@@ -675,10 +678,13 @@ void FillFluxViews(const VariableVector<T> &vars, const int ndim,
                    PackIndexMap *pvmap) {
   using vpack_types::IndexPair;
 
-  auto host_f1 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f1_out);
-  auto host_f2 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f2_out);
-  auto host_f3 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f3_out);
-  auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), flux_allocated_out);
+  auto host_f1 =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f1_out);
+  auto host_f2 =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f2_out);
+  auto host_f3 =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f3_out);
+  auto host_al = Kokkos::create_mirror_view(flux_allocated_out);
 
   int vindex = 0;
   for (const auto &v : vars) {
@@ -755,10 +761,11 @@ VariableFluxPack<T> MakeFluxPack(const VarListWithKeys<T> &var_list,
   }
 
   // make the outer view
-  ViewOfParArrays<T> cv("MakeFluxPack::cv", vsize * (extra_components ? 3 : 1));
-  ViewOfParArrays<T> f1("MakeFluxPack::f1", fsize);
-  ViewOfParArrays<T> f2("MakeFluxPack::f2", fsize);
-  ViewOfParArrays<T> f3("MakeFluxPack::f3", fsize);
+  ViewOfParArrays<T> cv(ViewOfViewAlloc("MakeFluxPack::cv"),
+                        vsize * (extra_components ? 3 : 1));
+  ViewOfParArrays<T> f1(ViewOfViewAlloc("MakeFluxPack::f1"), fsize);
+  ViewOfParArrays<T> f2(ViewOfViewAlloc("MakeFluxPack::f2"), fsize);
+  ViewOfParArrays<T> f3(ViewOfViewAlloc("MakeFluxPack::f3"), fsize);
   ParArray1D<bool> flux_allocated("MakePack::allocated", fsize);
   ParArray1D<int> sparse_id("MakeFluxPack::sparse_id", vsize);
   ParArray1D<int> vector_component("MakeFluxPack::vector_component", vsize);
@@ -809,7 +816,8 @@ VariablePack<T> MakePack(const VarListWithKeys<T> &var_list, bool coarse,
   }
 
   // make the outer view
-  ViewOfParArrays<T> cv("MakePack::cv", vsize * (extra_components ? 3 : 1));
+  ViewOfParArrays<T> cv(ViewOfViewAlloc("MakePack::cv"),
+                        vsize * (extra_components ? 3 : 1));
   ParArray1D<int> sparse_id("MakePack::sparse_id", vsize);
   ParArray1D<int> vector_component("MakePack::vector_component", vsize);
   ParArray1D<bool> allocated("MakePack::allocated", vsize);
@@ -842,7 +850,7 @@ SwarmVariablePack<T> MakeSwarmPack(const vpack_types::SwarmVarList<T> &vars,
   }
 
   // make the outer view
-  ViewOfParArrays1D<T> cv("MakePack::cv", vsize);
+  ViewOfParArrays1D<T> cv(ViewOfViewAlloc("MakePack::cv"), vsize);
 
   std::array<int, 2> cv_size{0, 0};
   if (vsize > 0) {

diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp
@@ -1035,6 +1035,20 @@ par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int il, cons
       reduction);
 }
 
+// For a ViewOfView we need to call the destructors of the inner views on
+// the host and not on the device (which would happen by default).
+// Thus, we need to pass `SequentialHostInit` as allocator, but only if the ViewOfView is
+// on the host. If the ViewOfView is on the device, then `SequentialHostInit` should be
+// passed when calling `create_mirror_view`.
+template <typename T = DevMemSpace>
+auto ViewOfViewAlloc(const std::string &label) {
+  if constexpr (std::is_same_v<T, HostMemSpace>) {
+    return Kokkos::view_alloc(Kokkos::SequentialHostInit, label);
+  } else {
+    return Kokkos::view_alloc(label);
+  }
+}
+
 // reused from kokoks/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
 // commit a0d011fb30022362c61b3bb000ae3de6906cb6a7
 template <class ExecSpace>

diff --git a/src/parthenon_array_generic.hpp b/src/parthenon_array_generic.hpp
@@ -221,6 +221,8 @@ class ParArrayGeneric : public State {
     // return GetDim(1) * GetDim(2) * GetDim(3) * GetDim(4) * GetDim(5) * GetDim(6);
   }
 
+  // TODO(PG?) Can we use concepts here to add a
+  // Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView?
   template <typename MemSpace>
   auto GetMirror(MemSpace const &memspace) {
     auto mirror = Kokkos::create_mirror_view(memspace, data_);
@@ -333,6 +335,8 @@ inline auto subview(std::index_sequence<I...>,
   return parthenon::ParArrayGeneric<decltype(v), SU>(v, arr);
 }
 
+// TODO(PG?) Can we use concepts here to add a
+// Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView?
 template <class Space, class U, class SU>
 inline auto create_mirror_view_and_copy(Space const &space,
                                         const parthenon::ParArrayGeneric<U, SU> &arr) {