parthenon-hpc-lab · pgrete · Aug 26, 2024 · Jul 31, 2024 · Aug 2, 2024 · Aug 2, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## Current develop
 
 ### Added (new features/APIs/variables/...)
+- [[PR 1147]](https://github.com/parthenon-hpc-lab/parthenon/pull/1147) Add `par_reduce_inner` functions
 - [[PR 1148]](https://github.com/parthenon-hpc-lab/parthenon/pull/1148) Add `GetPackDimension` to `StateDescriptor` for calculating pack sizes before `Mesh` initialization
 - [[PR 1143]](https://github.com/parthenon-hpc-lab/parthenon/pull/1143) Add tensor indices to VariableState, add radiation constant to constants, add TypeLists, allow for arbitrary containers for solvers
 - [[PR 1140]](https://github.com/parthenon-hpc-lab/parthenon/pull/1140) Allow for relative convergence tolerance in BiCGSTAB solver.

diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp
@@ -976,6 +976,65 @@ KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(team_mbr_t team_member, Args &&..
   par_for_inner(DEFAULT_INNER_LOOP_PATTERN, team_member, std::forward<Args>(args)...);
 }
 
+// Inner reduction loops
+// Team-level reduction over the inclusive 3D index range
+// [kl, ku] x [jl, ju] x [il, iu].
+//
+// The three loops are collapsed into one flat index that is iterated through
+// Kokkos::TeamThreadRange; within the flat index `i` varies fastest, then `j`,
+// then `k`.
+//   function:  called as function(k, j, i, lreduce), where lreduce is the
+//              partial result (of type T::value_type) to accumulate into.
+//   reduction: handed unchanged to Kokkos::parallel_reduce as its last argument;
+//              T has to provide a nested value_type.
+template <typename Function, typename T>
+KOKKOS_FORCEINLINE_FUNCTION void
+par_reduce_inner(team_mbr_t team_member, const int kl, const int ku, const int jl,
+                 const int ju, const int il, const int iu, const Function &function,
+                 T reduction) {
+  // Extents of the inclusive ranges (both bounds are part of the range).
+  const int extent_k = ku - kl + 1;
+  const int extent_j = ju - jl + 1;
+  const int extent_i = iu - il + 1;
+  const int total = extent_k * extent_j * extent_i;
+  const int plane = extent_j * extent_i; // number of (j, i) pairs per k
+  Kokkos::parallel_reduce(
+      Kokkos::TeamThreadRange(team_member, total),
+      [&](const int &flat, typename T::value_type &lreduce) {
+        // Split the flat index into zero-based offsets along k, j and i.
+        const int off_k = flat / plane;
+        const int in_plane = flat - off_k * plane;
+        const int off_j = in_plane / extent_i;
+        const int off_i = in_plane - off_j * extent_i;
+        // Shift the offsets to the caller's lower bounds.
+        int k = kl + off_k;
+        int j = jl + off_j;
+        int i = il + off_i;
+        function(k, j, i, lreduce);
+      },
+      reduction);
+}
+
+// Team-level reduction over the inclusive 2D index range [jl, ju] x [il, iu].
+//
+// Both loops are collapsed into one flat index that is iterated through
+// Kokkos::TeamThreadRange; within the flat index `i` varies fastest.
+//   function:  called as function(j, i, lreduce), where lreduce is the partial
+//              result (of type T::value_type) to accumulate into.
+//   reduction: handed unchanged to Kokkos::parallel_reduce as its last argument;
+//              T has to provide a nested value_type.
+template <typename Function, typename T>
+KOKKOS_FORCEINLINE_FUNCTION void
+par_reduce_inner(team_mbr_t team_member, const int jl, const int ju, const int il,
+                 const int iu, const Function &function, T reduction) {
+  // Extents of the inclusive ranges (both bounds are part of the range).
+  const int extent_j = ju - jl + 1;
+  const int extent_i = iu - il + 1;
+  const int total = extent_j * extent_i;
+  Kokkos::parallel_reduce(
+      Kokkos::TeamThreadRange(team_member, total),
+      [&](const int &flat, typename T::value_type &lreduce) {
+        // Split the flat index into zero-based row and column offsets.
+        const int off_j = flat / extent_i;
+        const int off_i = flat - off_j * extent_i;
+        // Shift the offsets to the caller's lower bounds.
+        int j = jl + off_j;
+        int i = il + off_i;
+        function(j, i, lreduce);
+      },
+      reduction);
+}
+
+// Team-level reduction over the inclusive 1D index range [il, iu], iterated
+// through Kokkos::TeamThreadRange.
+//   function:  called as function(i, lreduce), where lreduce is the partial
+//              result (of type T::value_type) to accumulate into.
+//   reduction: handed unchanged to Kokkos::parallel_reduce as its last argument;
+//              T has to provide a nested value_type.
+template <typename Function, typename T>
+KOKKOS_FORCEINLINE_FUNCTION void par_reduce_inner(team_mbr_t team_member, const int il,
+                                                  const int iu, const Function &function,
+                                                  T reduction) {
+  // Number of indices in the inclusive range.
+  const int extent_i = iu - il + 1;
+  Kokkos::parallel_reduce(
+      Kokkos::TeamThreadRange(team_member, extent_i),
+      [&](const int &offset, typename T::value_type &lreduce) {
+        // Shift the zero-based loop offset to the caller's lower bound.
+        int i = il + offset;
+        function(i, lreduce);
+      },
+      reduction);
+}
+
 // reused from kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
 // commit a0d011fb30022362c61b3bb000ae3de6906cb6a7
 template <class ExecSpace>