From 00c936c96d273da9b845c25481e5b3218ba1d4e8 Mon Sep 17 00:00:00 2001
From: Jonah Miller <jonahm@lanl.gov>
Date: Wed, 16 Oct 2024 08:56:23 -0600
Subject: [PATCH 01/13] bump Kokkos version

---
 external/Kokkos | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/external/Kokkos b/external/Kokkos
index 62d2b6c879b74..e0dc0128e04f1 160000
--- a/external/Kokkos
+++ b/external/Kokkos
@@ -1 +1 @@
-Subproject commit 62d2b6c879b74b6ae7bd06eb3e5e80139c4708e6
+Subproject commit e0dc0128e04f18c2bbbaefceef3616e7ddcfa3c4

From 0f386249fec65bf2b4873711610ea51b30841341 Mon Sep 17 00:00:00 2001
From: Jonah Miller <jonahm@lanl.gov>
Date: Wed, 16 Oct 2024 08:57:38 -0600
Subject: [PATCH 02/13] changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1f046faabe759..71d100d528aa7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@
 - [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small perfomance upgrades
 
 ### Changed (changing behavior/API/variables/...)
+- [[PR1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.2.01
 - [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize
 - [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1
 - [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option

From 86783d434bd465209467c889290bf8a8494ecb61 Mon Sep 17 00:00:00 2001
From: Patrick Mullen <pdmullen@pn2293621.lanl.gov>
Date: Thu, 17 Oct 2024 08:38:16 -0600
Subject: [PATCH 03/13] Update Kokkos version to 4.4.1

---
 CHANGELOG.md    | 3 +--
 external/Kokkos | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 71d100d528aa7..ff9137b4dfc75 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,9 +11,8 @@
 - [[PR 1161]](https://github.com/parthenon-hpc-lab/parthenon/pull/1161) Make flux field Metadata accessible, add Metadata::CellMemAligned flag, small perfomance upgrades
 
 ### Changed (changing behavior/API/variables/...)
-- [[PR1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.2.01
+- [[PR 1191]](https://github.com/parthenon-hpc-lab/parthenon/pull/1191) Update Kokkos version to 4.4.1
 - [[PR 1187]](https://github.com/parthenon-hpc-lab/parthenon/pull/1187) Make DataCollection::Add safer and generalize MeshBlockData::Initialize
-- [[Issue 1165]](https://github.com/parthenon-hpc-lab/parthenon/issues/1165) Bump Kokkos submodule to 4.4.1
 - [[PR 1171]](https://github.com/parthenon-hpc-lab/parthenon/pull/1171) Add PARTHENON_USE_SYSTEM_PACKAGES build option
 - [[PR 1172]](https://github.com/parthenon-hpc-lab/parthenon/pull/1172) Make parthenon manager robust against external MPI init and finalize calls
 
diff --git a/external/Kokkos b/external/Kokkos
index e0dc0128e04f1..15dc143e5f399 160000
--- a/external/Kokkos
+++ b/external/Kokkos
@@ -1 +1 @@
-Subproject commit e0dc0128e04f18c2bbbaefceef3616e7ddcfa3c4
+Subproject commit 15dc143e5f39949eece972a798e175c4b463d4b8

From b4ab05f5f3c059e6a9fa208a3101c6e47bd0d3aa Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Fri, 18 Oct 2024 15:34:01 +0200
Subject: [PATCH 04/13] Fix View of View dealloc

---
 src/bvals/comms/bnd_info.hpp    | 2 +-
 src/bvals/comms/bvals_utils.hpp | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp
index e6214ceba3226..8800f6fd867f3 100644
--- a/src/bvals/comms/bnd_info.hpp
+++ b/src/bvals/comms/bnd_info.hpp
@@ -127,7 +127,7 @@ struct ProResInfo {
 int GetBufferSize(MeshBlock *pmb, const NeighborBlock &nb,
                   std::shared_ptr<Variable<Real>> v);
 
-using BndInfoArr_t = ParArray1D<BndInfo>;
+using BndInfoArr_t = Kokkos::View<BndInfo *, LayoutWrapper, DevMemSpace>;
 using BndInfoArrHost_t = typename BndInfoArr_t::HostMirror;
 
 using ProResInfoArr_t = ParArray1D<ProResInfo>;
diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp
index f185c1207747f..616d5cf21ee05 100644
--- a/src/bvals/comms/bvals_utils.hpp
+++ b/src/bvals/comms/bvals_utils.hpp
@@ -215,7 +215,8 @@ inline void RebuildBufferCache(std::shared_ptr<MeshData<Real>> md, int nbound,
   using namespace loops;
   using namespace loops::shorthands;
   BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER);
-  cache.bnd_info = BndInfoArr_t("bnd_info", nbound);
+  cache.bnd_info =
+      BndInfoArr_t(Kokkos::view_alloc("bnd_info", Kokkos::SequentialHostInit), nbound);
   cache.bnd_info_h = Kokkos::create_mirror_view(cache.bnd_info);
 
   // prolongation/restriction sub-sets

From e333a7b6c33a4ede8a2c0f9a435bbac85227b59b Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Mon, 21 Oct 2024 11:18:17 +0200
Subject: [PATCH 05/13] Move view_alloc to host

---
 src/bvals/comms/bvals_utils.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp
index 616d5cf21ee05..8a47c716aa22c 100644
--- a/src/bvals/comms/bvals_utils.hpp
+++ b/src/bvals/comms/bvals_utils.hpp
@@ -215,9 +215,9 @@ inline void RebuildBufferCache(std::shared_ptr<MeshData<Real>> md, int nbound,
   using namespace loops;
   using namespace loops::shorthands;
   BvarsSubCache_t &cache = md->GetBvarsCache().GetSubCache(BOUND_TYPE, SENDER);
-  cache.bnd_info =
-      BndInfoArr_t(Kokkos::view_alloc("bnd_info", Kokkos::SequentialHostInit), nbound);
-  cache.bnd_info_h = Kokkos::create_mirror_view(cache.bnd_info);
+  cache.bnd_info = BndInfoArr_t("bnd_info", nbound);
+  cache.bnd_info_h = Kokkos::create_mirror_view(
+      Kokkos::view_alloc(Kokkos::SequentialHostInit), cache.bnd_info);
 
   // prolongation/restriction sub-sets
   // TODO(JMM): Right now I exclude fluxcorrection boundaries but if

From 094880e12701ba8ee80e37e8a5b0ca52b5f74326 Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Mon, 21 Oct 2024 11:34:18 +0200
Subject: [PATCH 06/13] Temp disable rocthrust on amd ci container

---
 cmake/machinecfg/GitHubActions.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/machinecfg/GitHubActions.cmake b/cmake/machinecfg/GitHubActions.cmake
index 663dcb38d6821..1adba870bdb20 100644
--- a/cmake/machinecfg/GitHubActions.cmake
+++ b/cmake/machinecfg/GitHubActions.cmake
@@ -19,6 +19,7 @@ message(STATUS "Loading machine configuration for GitHub Actions CI. ")
 
 # common options
 set(NUM_MPI_PROC_TESTING "2" CACHE STRING "CI runs tests with 2 MPI ranks")
+set(Kokkos_ENABLE_ROCTHRUST OFF CACHE BOOL "Temporarily disabled as the container needs to be updated to the `-complete` base image.")
 
 set(MACHINE_CXX_FLAGS "")
 if (${MACHINE_VARIANT} MATCHES "cuda")

From 0e0ef22cfc1540bc3156437fed5a4a903becc071 Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Mon, 21 Oct 2024 11:42:56 +0200
Subject: [PATCH 07/13] Fix doc creation. Rollback to ubuntu22-04

---
 .github/workflows/docs.yml | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 4b95511d82f58..7fb6e0165ff0e 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -12,7 +12,10 @@ on:
 jobs:
     docs:
       name: build and deploy docs
-      runs-on: ubuntu-latest
+      # not using latest due to issues with pip user packages, see
+      # https://github.com/actions/runner-images/issues/10781 and
+      # https://github.com/actions/runner-images/issues/10636
+      runs-on: ubuntu-22.04
 
       steps:      
         - name: Checkout code
@@ -23,9 +26,9 @@ jobs:
           run: export DEBIAN_FRONTEND=noninteractive
         - name: install dependencies
           run: |
-            pip install --break-system-packages sphinx
-            pip install --break-system-packages sphinx-rtd-theme
-            pip install --break-system-packages sphinx-multiversion
+            pip install sphinx
+            pip install sphinx-rtd-theme
+            pip install sphinx-multiversion
         - name: build docs
           run: |
             echo "Repo = ${GITHUB_REPOSITORY}"

From 343b0092e838425514e357685066305ab208b9d1 Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Tue, 22 Oct 2024 12:02:45 +0200
Subject: [PATCH 08/13] Fix more view of views

---
 src/bvals/comms/bnd_info.cpp       |  3 ++-
 src/interface/mesh_data.hpp        |  3 ++-
 src/interface/sparse_pack_base.cpp |  6 ++++--
 src/interface/swarm_comms.cpp      |  1 +
 src/interface/swarm_pack_base.hpp  |  6 ++++--
 src/interface/variable_pack.hpp    | 23 ++++++++++++++---------
 src/parthenon_array_generic.hpp    |  4 ++++
 tst/unit/test_pararrays.cpp        |  6 ++++--
 8 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp
index 736992260913f..54a6ae2b50fd3 100644
--- a/src/bvals/comms/bnd_info.cpp
+++ b/src/bvals/comms/bnd_info.cpp
@@ -41,7 +41,8 @@ namespace parthenon {
 
 void ProResCache_t::Initialize(int n_regions, StateDescriptor *pkg) {
   prores_info = ParArray1D<ProResInfo>("prores_info", n_regions);
-  prores_info_h = Kokkos::create_mirror_view(prores_info);
+  prores_info_h = Kokkos::create_mirror_view(
+      Kokkos::view_alloc(Kokkos::SequentialHostInit), prores_info);
   int nref_funcs = pkg->NumRefinementFuncs();
   // Note that assignment of Kokkos views resets them, but
   // buffer_subset_sizes is a std::vector. It must be cleared, then
diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp
index 14ae3959c32ad..7d7d1cabcbe5a 100644
--- a/src/interface/mesh_data.hpp
+++ b/src/interface/mesh_data.hpp
@@ -150,7 +150,8 @@ const MeshBlockPack<P> &PackOnMesh(M &map, BlockDataList_t<Real> &block_data_,
 
   if (make_new_pack) {
     ParArray1D<P> packs("MeshData::PackVariables::packs", nblocks);
-    auto packs_host = Kokkos::create_mirror_view(packs);
+    auto packs_host =
+        Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), packs);
 
     for (size_t i = 0; i < nblocks; i++) {
       const auto &pack = packing_function(block_data_[i], this_map, this_key);
diff --git a/src/interface/sparse_pack_base.cpp b/src/interface/sparse_pack_base.cpp
index 2a7a5b70c41c2..4ea5a558c3f52 100644
--- a/src/interface/sparse_pack_base.cpp
+++ b/src/interface/sparse_pack_base.cpp
@@ -152,7 +152,8 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
     leading_dim += 2;
   }
   pack.pack_ = pack_t("data_ptr", leading_dim, pack.nblocks_, max_size);
-  pack.pack_h_ = Kokkos::create_mirror_view(pack.pack_);
+  pack.pack_h_ = Kokkos::create_mirror_view(
+      Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_);
 
   // For non-flat packs, shape of pack is type x block x var x k x j x i
   // where type here might be a flux.
@@ -168,7 +169,8 @@ SparsePackBase SparsePackBase::Build(T *pmd, const PackDescriptor &desc,
   pack.block_props_h_ = Kokkos::create_mirror_view(pack.block_props_);
 
   pack.coords_ = coords_t("coords", desc.flat ? max_size : nblocks);
-  auto coords_h = Kokkos::create_mirror_view(pack.coords_);
+  auto coords_h = Kokkos::create_mirror_view(
+      Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.coords_);
 
   // Fill the views
   int idx = 0;
diff --git a/src/interface/swarm_comms.cpp b/src/interface/swarm_comms.cpp
index 3ee4e2af75129..bb5755e2cbe87 100644
--- a/src/interface/swarm_comms.cpp
+++ b/src/interface/swarm_comms.cpp
@@ -157,6 +157,7 @@ void Swarm::SetupPersistentMPI() {
 }
 
 void Swarm::CountParticlesToSend_() {
+  // TODO(PG->BRR) What's going on here? Why is the mask being copied?
   auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_);
   auto swarm_d = GetDeviceContext();
   auto pmb = GetBlockPointer();
diff --git a/src/interface/swarm_pack_base.hpp b/src/interface/swarm_pack_base.hpp
index 0733aa51f329e..52a2c3c47fc7b 100644
--- a/src/interface/swarm_pack_base.hpp
+++ b/src/interface/swarm_pack_base.hpp
@@ -109,7 +109,8 @@ class SwarmPackBase {
     // Allocate the views
     int leading_dim = 1;
     pack.pack_ = pack_t("data_ptr", leading_dim, nblocks, max_size);
-    auto pack_h = Kokkos::create_mirror_view(pack.pack_);
+    auto pack_h = Kokkos::create_mirror_view(
+        Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.pack_);
 
     pack.bounds_ = bounds_t("bounds", 2, nblocks, nvar);
     auto bounds_h = Kokkos::create_mirror_view(pack.bounds_);
@@ -154,7 +155,8 @@ class SwarmPackBase {
     Kokkos::deep_copy(pack.bounds_, bounds_h);
 
     pack.contexts_ = contexts_t("contexts", nblocks);
-    pack.contexts_h_ = Kokkos::create_mirror_view(pack.contexts_);
+    pack.contexts_h_ = Kokkos::create_mirror_view(
+        Kokkos::view_alloc(Kokkos::SequentialHostInit), pack.contexts_);
     pack.max_active_indices_ = max_active_indices_t("max_active_indices", nblocks);
     pack.flat_index_map_ = max_active_indices_t("flat_index_map", nblocks + 1);
     BuildSupplemental(pmd, desc, pack);
diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp
index 037731093ce13..07e56091ddfe6 100644
--- a/src/interface/variable_pack.hpp
+++ b/src/interface/variable_pack.hpp
@@ -570,10 +570,11 @@ void FillVarView(const VariableVector<T> &vars, int vsize, bool coarse,
   assert(vsize == sparse_id_out.size());
   assert(vsize == vector_component_out.size());
 
-  auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out);
-  auto host_sp = Kokkos::create_mirror_view(Kokkos::HostSpace(), sparse_id_out);
-  auto host_vc = Kokkos::create_mirror_view(Kokkos::HostSpace(), vector_component_out);
-  auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), allocated_out);
+  auto host_cv =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), cv_out);
+  auto host_sp = Kokkos::create_mirror_view(sparse_id_out);
+  auto host_vc = Kokkos::create_mirror_view(vector_component_out);
+  auto host_al = Kokkos::create_mirror_view(allocated_out);
 
   int vindex = 0;
   for (const auto &v : vars) {
@@ -634,7 +635,8 @@ void FillSwarmVarView(const vpack_types::SwarmVarList<T> &vars,
                       ViewOfParArrays1D<T> &cv_out, PackIndexMap *pvmap) {
   using vpack_types::IndexPair;
 
-  auto host_cv = Kokkos::create_mirror_view(Kokkos::HostSpace(), cv_out);
+  auto host_cv =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), cv_out);
 
   int vindex = 0;
   for (const auto v : vars) {
@@ -675,10 +677,13 @@ void FillFluxViews(const VariableVector<T> &vars, const int ndim,
                    PackIndexMap *pvmap) {
   using vpack_types::IndexPair;
 
-  auto host_f1 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f1_out);
-  auto host_f2 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f2_out);
-  auto host_f3 = Kokkos::create_mirror_view(Kokkos::HostSpace(), f3_out);
-  auto host_al = Kokkos::create_mirror_view(Kokkos::HostSpace(), flux_allocated_out);
+  auto host_f1 =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f1_out);
+  auto host_f2 =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f2_out);
+  auto host_f3 =
+      Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), f3_out);
+  auto host_al = Kokkos::create_mirror_view(flux_allocated_out);
 
   int vindex = 0;
   for (const auto &v : vars) {
diff --git a/src/parthenon_array_generic.hpp b/src/parthenon_array_generic.hpp
index d527707f9070a..ac38b6cb5e5f3 100644
--- a/src/parthenon_array_generic.hpp
+++ b/src/parthenon_array_generic.hpp
@@ -221,6 +221,8 @@ class ParArrayGeneric : public State {
     // return GetDim(1) * GetDim(2) * GetDim(3) * GetDim(4) * GetDim(5) * GetDim(6);
   }
 
+  // TODO(PG?) Can we use concepts here to add a
+  // Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView?
   template <typename MemSpace>
   auto GetMirror(MemSpace const &memspace) {
     auto mirror = Kokkos::create_mirror_view(memspace, data_);
@@ -333,6 +335,8 @@ inline auto subview(std::index_sequence<I...>,
   return parthenon::ParArrayGeneric<decltype(v), SU>(v, arr);
 }
 
+// TODO(PG?) Can we use concepts here to add a
+// Kokkos::view_alloc(Kokkos::SequentialHostInit) when the original is a ViewOfView?
 template <class Space, class U, class SU>
 inline auto create_mirror_view_and_copy(Space const &space,
                                         const parthenon::ParArrayGeneric<U, SU> &arr) {
diff --git a/tst/unit/test_pararrays.cpp b/tst/unit/test_pararrays.cpp
index e79927c13b20f..d281ce9e0c8d0 100644
--- a/tst/unit/test_pararrays.cpp
+++ b/tst/unit/test_pararrays.cpp
@@ -452,7 +452,8 @@ TEST_CASE("ParArray state", "[ParArrayND]") {
 
   GIVEN("An array of ParArrays filled with the values contained in their state") {
     parthenon::ParArray1D<arr3d_t> pack("test pack", NS);
-    auto pack_h = Kokkos::create_mirror_view(pack);
+    auto pack_h =
+        Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), pack);
 
     for (int b = 0; b < NS; ++b) {
       state_t state(static_cast<double>(b));
@@ -544,7 +545,8 @@ TEST_CASE("Check registry pressure", "[ParArrayND][performance]") {
     new (&views[n])
         view_3d_t(Kokkos::view_alloc(label, Kokkos::WithoutInitializing), N, N, N);
     auto a_h = arrays(n).GetHostMirror();
-    auto v_h = Kokkos::create_mirror_view(views(n));
+    auto v_h = Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit),
+                                          views(n));
     for (int k = 0; k < N; k++) {
       for (int j = 0; j < N; j++) {
         for (int i = 0; i < N; i++) {

From 4999a80408910c191c7c0e7fdee5aaca73afe50d Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Tue, 22 Oct 2024 19:40:46 +0200
Subject: [PATCH 09/13] Chasing more ViewOfView on host

---
 src/interface/variable_pack.hpp | 19 +++++++++++--------
 src/kokkos_abstraction.hpp      | 13 +++++++++++++
 tst/unit/test_pararrays.cpp     |  2 +-
 3 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/src/interface/variable_pack.hpp b/src/interface/variable_pack.hpp
index 07e56091ddfe6..76fce5f6b8616 100644
--- a/src/interface/variable_pack.hpp
+++ b/src/interface/variable_pack.hpp
@@ -244,10 +244,11 @@ class PackIndexMap {
 };
 
 template <typename T>
-using ViewOfParArrays = ParArray1D<ParArray3D<T, VariableState>>;
+using ViewOfParArrays =
+    Kokkos::View<ParArray3D<T, VariableState> *, LayoutWrapper, DevMemSpace>;
 
 template <typename T>
-using ViewOfParArrays1D = ParArray1D<ParArray1D<T>>;
+using ViewOfParArrays1D = Kokkos::View<ParArray1D<T> *, LayoutWrapper, DevMemSpace>;
 
 // forward declaration
 template <typename T>
@@ -760,10 +761,11 @@ VariableFluxPack<T> MakeFluxPack(const VarListWithKeys<T> &var_list,
   }
 
   // make the outer view
-  ViewOfParArrays<T> cv("MakeFluxPack::cv", vsize * (extra_components ? 3 : 1));
-  ViewOfParArrays<T> f1("MakeFluxPack::f1", fsize);
-  ViewOfParArrays<T> f2("MakeFluxPack::f2", fsize);
-  ViewOfParArrays<T> f3("MakeFluxPack::f3", fsize);
+  ViewOfParArrays<T> cv(ViewOfViewAlloc("MakeFluxPack::cv"),
+                        vsize * (extra_components ? 3 : 1));
+  ViewOfParArrays<T> f1(ViewOfViewAlloc("MakeFluxPack::f1"), fsize);
+  ViewOfParArrays<T> f2(ViewOfViewAlloc("MakeFluxPack::f2"), fsize);
+  ViewOfParArrays<T> f3(ViewOfViewAlloc("MakeFluxPack::f3"), fsize);
   ParArray1D<bool> flux_allocated("MakePack::allocated", fsize);
   ParArray1D<int> sparse_id("MakeFluxPack::sparse_id", vsize);
   ParArray1D<int> vector_component("MakeFluxPack::vector_component", vsize);
@@ -814,7 +816,8 @@ VariablePack<T> MakePack(const VarListWithKeys<T> &var_list, bool coarse,
   }
 
   // make the outer view
-  ViewOfParArrays<T> cv("MakePack::cv", vsize * (extra_components ? 3 : 1));
+  ViewOfParArrays<T> cv(ViewOfViewAlloc("MakePack::cv"),
+                        vsize * (extra_components ? 3 : 1));
   ParArray1D<int> sparse_id("MakePack::sparse_id", vsize);
   ParArray1D<int> vector_component("MakePack::vector_component", vsize);
   ParArray1D<bool> allocated("MakePack::allocated", vsize);
@@ -847,7 +850,7 @@ SwarmVariablePack<T> MakeSwarmPack(const vpack_types::SwarmVarList<T> &vars,
   }
 
   // make the outer view
-  ViewOfParArrays1D<T> cv("MakePack::cv", vsize);
+  ViewOfParArrays1D<T> cv(ViewOfViewAlloc("MakePack::cv"), vsize);
 
   std::array<int, 2> cv_size{0, 0};
   if (vsize > 0) {
diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp
index 8fa89f82e95e8..b5dfc3fa75a0f 100644
--- a/src/kokkos_abstraction.hpp
+++ b/src/kokkos_abstraction.hpp
@@ -1035,6 +1035,19 @@ par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int il, cons
       reduction);
 }
 
+// For ViewOfView we need to call the destructor of the inner views on
+// the host and not on the device (which would happen by default).
+// Thus, we need to pass `SquentialHostInit` as allocator, but only if the ViewOfView is
+// on the host. If the ViewOfViews in on the device, then `SequentialHostInit` should be
+// passed when calling `create_mirror_view`.
+auto ViewOfViewAlloc(const std::string &label) {
+  if constexpr (std::is_same_v<DevMemSpace, HostMemSpace>) {
+    return Kokkos::view_alloc(Kokkos::SequentialHostInit, label);
+  } else {
+    return Kokkos::view_alloc(label);
+  }
+}
+
 // reused from kokoks/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
 // commit a0d011fb30022362c61b3bb000ae3de6906cb6a7
 template <class ExecSpace>
diff --git a/tst/unit/test_pararrays.cpp b/tst/unit/test_pararrays.cpp
index d281ce9e0c8d0..9e1816b0e6690 100644
--- a/tst/unit/test_pararrays.cpp
+++ b/tst/unit/test_pararrays.cpp
@@ -451,7 +451,7 @@ TEST_CASE("ParArray state", "[ParArrayND]") {
   }
 
   GIVEN("An array of ParArrays filled with the values contained in their state") {
-    parthenon::ParArray1D<arr3d_t> pack("test pack", NS);
+    Kokkos::View<arr3d_t *> pack(parthenon::ViewOfViewAlloc("test pack"), NS);
     auto pack_h =
         Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), pack);
 

From c799d026bd226e3c175e04ff435e5c39744361ee Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Wed, 6 Nov 2024 16:32:37 +0100
Subject: [PATCH 10/13] Fix ViewAlloc func and add doc

---
 README.md                      |  2 +-
 doc/sphinx/src/development.rst | 28 ++++++++++++++++++++++++++++
 src/kokkos_abstraction.hpp     |  3 ++-
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ed6a1bb05a169..f049c44ecd7d5 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Parthenon -- a performance portable block-structured adaptive mesh refinement fr
 
 * CMake 3.16 or greater
 * C++17 compatible compiler
-* Kokkos 4.0.1 or greater
+* Kokkos 4.1.1 or greater
 
 ## Optional (enabling features)
 
diff --git a/doc/sphinx/src/development.rst b/doc/sphinx/src/development.rst
index dbab91d8d5ce6..98ac9cef90a85 100644
--- a/doc/sphinx/src/development.rst
+++ b/doc/sphinx/src/development.rst
@@ -62,6 +62,34 @@ parallelism interface that is needed for managing memory cached in
 tightly nested loops. The wrappers are documented
 :ref:`here <nested par for>`.
 
+View of Views
+-------------
+
+Special care needs to be taken when working with a ``View`` of ``View``.
+
+To repeat the Kokkos documenation: `Don't use them <https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/View.html#can-i-make-a-view-of-views>`__
+
+But if you have to (which is the case in some places inside Parthenon)
+then follow this pattern:
+
+.. code:: c++
+
+   Kokkos::View<ParArray1D<Real> *> view_of_pararrays(parthenon::ViewOfViewAlloc("myname"), 10);
+
+The ``ViewOfViewAlloc`` ensures that the ``Kokkos::SequentialHostInit`` property is added,
+which results in the (inner ``View`` ) deallocators being called on the host (rather than on
+the device by default).
+
+Similarly, when you create a host mirror of said ``View`` of ``View`` add the additional
+property for the same reason.
+
+.. code:: c++
+
+   auto view_of_pararrays_h =
+        Kokkos::create_mirror_view(Kokkos::view_alloc(Kokkos::SequentialHostInit), view_of_pararrays);
+
+Note that the ``SequentialHostInit`` was only added in Kokkos 4.4.1 (which is now the default in Parthenon).
+
 The need for reductions within function handling ``MeshBlock`` data
 -------------------------------------------------------------------
 
diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp
index b5dfc3fa75a0f..37262dd0356cc 100644
--- a/src/kokkos_abstraction.hpp
+++ b/src/kokkos_abstraction.hpp
@@ -1040,8 +1040,9 @@ par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int il, cons
 // Thus, we need to pass `SquentialHostInit` as allocator, but only if the ViewOfView is
 // on the host. If the ViewOfViews in on the device, then `SequentialHostInit` should be
 // passed when calling `create_mirror_view`.
+template <typename T = DevMemSpace>
 auto ViewOfViewAlloc(const std::string &label) {
-  if constexpr (std::is_same_v<DevMemSpace, HostMemSpace>) {
+  if constexpr (std::is_same_v<T, HostMemSpace>) {
     return Kokkos::view_alloc(Kokkos::SequentialHostInit, label);
   } else {
     return Kokkos::view_alloc(label);

From ebbcc1704ed2c68c99808cf099eef41396b2d5ce Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Thu, 7 Nov 2024 13:41:44 +0100
Subject: [PATCH 11/13] Disable Ascent in testing

---
 .github/workflows/ci-extended.yml | 48 +++++++++++++++----------------
 .github/workflows/ci-short.yml    | 12 ++++----
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml
index 8ca646cfc2ebd..0c083091cf0e2 100644
--- a/.github/workflows/ci-extended.yml
+++ b/.github/workflows/ci-extended.yml
@@ -32,9 +32,9 @@ jobs:
         parallel: ['serial', 'mpi']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
@@ -91,27 +91,27 @@ jobs:
           ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600
 
       # Test Ascent integration (only most complex setup with MPI and on device)
-      - name: Ascent tests
-        if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }}
-        run: |
-          cmake -B build-ascent \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \
-            -DPARTHENON_ENABLE_ASCENT=ON \
-            -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent
-          cmake --build build-ascent
-          cd example/advection/
-          # Pick GPU with most available memory
-          export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
-          mpirun -np 2 ../../build-ascent/example/advection/advection-example \
-            -i parthinput.advection \
-            parthenon/output5/dt=0.05 \
-            parthenon/time/tlim=0.1
-          # check if file exists
-          if [ ! -f "ascent_render_57.png" ]; then
-            echo "'ascent_render_57.png' does not exist."
-            exit 1
-          fi
+      # - name: Ascent tests
+      #   if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }}
+      #   run: |
+      #     cmake -B build-ascent \
+      #       -DCMAKE_BUILD_TYPE=Release \
+      #       -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \
+      #       -DPARTHENON_ENABLE_ASCENT=ON \
+      #       -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent
+      #     cmake --build build-ascent
+      #     cd example/advection/
+      #     # Pick GPU with most available memory
+      #     export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
+      #     mpirun -np 2 ../../build-ascent/example/advection/advection-example \
+      #       -i parthinput.advection \
+      #       parthenon/output5/dt=0.05 \
+      #       parthenon/time/tlim=0.1
+      #     # check if file exists
+      #     if [ ! -f "ascent_render_57.png" ]; then
+      #       echo "'ascent_render_57.png' does not exist."
+      #       exit 1
+      #     fi
 
       - uses: actions/upload-artifact@v3
         with:
@@ -120,7 +120,7 @@ jobs:
             build/CMakeFiles/CMakeOutput.log
             build/tst/regression/outputs/advection_convergence*/advection-errors.dat
             build/tst/regression/outputs/advection_convergence*/advection-errors.png
-            example/advection/ascent_render_57.png
+          # example/advection/ascent_render_57.png
           retention-days: 3
 
   perf-and-regression-amdgpu:
diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml
index ecb4052411eea..ab42a7450c836 100644
--- a/.github/workflows/ci-short.yml
+++ b/.github/workflows/ci-short.yml
@@ -20,9 +20,9 @@ jobs:
   style:
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
@@ -45,9 +45,9 @@ jobs:
         device: ['cuda', 'host']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
@@ -77,9 +77,9 @@ jobs:
         device: ['cuda', 'host']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:

From 11e1e8614d07c0d85c0a04d914fde33e6115dbac Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Thu, 7 Nov 2024 14:27:39 +0100
Subject: [PATCH 12/13] Revert "Disable Ascent in testing"

This reverts commit ebbcc1704ed2c68c99808cf099eef41396b2d5ce.
---
 .github/workflows/ci-extended.yml | 48 +++++++++++++++----------------
 .github/workflows/ci-short.yml    | 12 ++++----
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml
index 0c083091cf0e2..8ca646cfc2ebd 100644
--- a/.github/workflows/ci-extended.yml
+++ b/.github/workflows/ci-extended.yml
@@ -32,9 +32,9 @@ jobs:
         parallel: ['serial', 'mpi']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
+      options: --user 1001
     steps:
       - uses: actions/checkout@v3
         with:
@@ -91,27 +91,27 @@ jobs:
           ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600
 
       # Test Ascent integration (only most complex setup with MPI and on device)
-      # - name: Ascent tests
-      #   if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }}
-      #   run: |
-      #     cmake -B build-ascent \
-      #       -DCMAKE_BUILD_TYPE=Release \
-      #       -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \
-      #       -DPARTHENON_ENABLE_ASCENT=ON \
-      #       -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent
-      #     cmake --build build-ascent
-      #     cd example/advection/
-      #     # Pick GPU with most available memory
-      #     export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
-      #     mpirun -np 2 ../../build-ascent/example/advection/advection-example \
-      #       -i parthinput.advection \
-      #       parthenon/output5/dt=0.05 \
-      #       parthenon/time/tlim=0.1
-      #     # check if file exists
-      #     if [ ! -f "ascent_render_57.png" ]; then
-      #       echo "'ascent_render_57.png' does not exist."
-      #       exit 1
-      #     fi
+      - name: Ascent tests
+        if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }}
+        run: |
+          cmake -B build-ascent \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \
+            -DPARTHENON_ENABLE_ASCENT=ON \
+            -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent
+          cmake --build build-ascent
+          cd example/advection/
+          # Pick GPU with most available memory
+          export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
+          mpirun -np 2 ../../build-ascent/example/advection/advection-example \
+            -i parthinput.advection \
+            parthenon/output5/dt=0.05 \
+            parthenon/time/tlim=0.1
+          # check if file exists
+          if [ ! -f "ascent_render_57.png" ]; then
+            echo "'ascent_render_57.png' does not exist."
+            exit 1
+          fi
 
       - uses: actions/upload-artifact@v3
         with:
@@ -120,7 +120,7 @@ jobs:
             build/CMakeFiles/CMakeOutput.log
             build/tst/regression/outputs/advection_convergence*/advection-errors.dat
             build/tst/regression/outputs/advection_convergence*/advection-errors.png
-          # example/advection/ascent_render_57.png
+            example/advection/ascent_render_57.png
           retention-days: 3
 
   perf-and-regression-amdgpu:
diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml
index ab42a7450c836..ecb4052411eea 100644
--- a/.github/workflows/ci-short.yml
+++ b/.github/workflows/ci-short.yml
@@ -20,9 +20,9 @@ jobs:
   style:
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
+      options: --user 1001
     steps:
       - uses: actions/checkout@v3
         with:
@@ -45,9 +45,9 @@ jobs:
         device: ['cuda', 'host']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
+      options: --user 1001
     steps:
       - uses: actions/checkout@v3
         with:
@@ -77,9 +77,9 @@ jobs:
         device: ['cuda', 'host']
     runs-on: [self-hosted, A100]
     container:
-      image: ghcr.io/parthenon-hpc-lab/cuda11.6-noascent
+      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
+      options: --user 1001
     steps:
       - uses: actions/checkout@v3
         with:

From e632f6b7cbab5535f8db91eb67cb81fa49146513 Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Thu, 7 Nov 2024 14:31:04 +0100
Subject: [PATCH 13/13] Disable CUDA IPC

---
 .github/workflows/ci-extended.yml | 4 +++-
 .github/workflows/ci-short.yml    | 8 +++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml
index 8ca646cfc2ebd..1dd39c782477c 100644
--- a/.github/workflows/ci-extended.yml
+++ b/.github/workflows/ci-extended.yml
@@ -21,6 +21,8 @@ env:
   CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
   MACHINE_CFG: cmake/machinecfg/CI.cmake
   OMPI_MCA_mpi_common_cuda_event_max: 1000
+  # CUDA IPC within docker repeated seem to cause issue on the CI machine
+  OMPI_MCA_btl_smcuda_use_cuda_ipc: 0
   # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
   OMPI_MCA_btl_vader_single_copy_mechanism: none
 
@@ -34,7 +36,7 @@ jobs:
     container:
       image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml
index ecb4052411eea..7e0fd8bf759ac 100644
--- a/.github/workflows/ci-short.yml
+++ b/.github/workflows/ci-short.yml
@@ -13,6 +13,8 @@ env:
   CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
   MACHINE_CFG: cmake/machinecfg/CI.cmake
   OMPI_MCA_mpi_common_cuda_event_max: 1000
+  # CUDA IPC within docker repeated seem to cause issue on the CI machine
+  OMPI_MCA_btl_smcuda_use_cuda_ipc: 0
   # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
   OMPI_MCA_btl_vader_single_copy_mechanism: none
 
@@ -22,7 +24,7 @@ jobs:
     container:
       image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
@@ -47,7 +49,7 @@ jobs:
     container:
       image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with:
@@ -79,7 +81,7 @@ jobs:
     container:
       image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
       # map to local user id on CI  machine to allow writing to build cache
-      options: --user 1001
+      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
     steps:
       - uses: actions/checkout@v3
         with: