From a0e49639abd1e2d579d242c4ed115f7761442059 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <weiqunzhang@lbl.gov>
Date: Fri, 2 Feb 2024 11:10:41 -0800
Subject: [PATCH] GpuLaunchFuntsG done

---
 Src/Base/AMReX_Box.H              |   6 +-
 Src/Base/AMReX_GpuLaunchFunctsG.H | 410 ++++++++++--------------------
 Src/Base/AMReX_Math.H             |  26 +-
 3 files changed, 152 insertions(+), 290 deletions(-)
diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H
index f89e1baa5c..2a6eb6d6e7 100644
--- a/Src/Base/AMReX_Box.H
+++ b/Src/Base/AMReX_Box.H
@@ -1849,7 +1849,7 @@ struct BoxIndexer
           lo  (box.smallEnd())
         {}
 
-    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     Dim3 operator() (std::uint64_t icell) const
     {
         std::uint64_t x, y, z, rem;
@@ -1868,7 +1868,7 @@ struct BoxIndexer
           lo  (box.smallEnd())
         {}
 
-    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     Dim3 operator() (std::uint64_t icell) const
     {
         std::uint64_t x, y;
@@ -1884,7 +1884,7 @@ struct BoxIndexer
         : lo(box.smallEnd(0))
         {}
 
-    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     Dim3 operator() (std::uint64_t icell) const
     {
         return {int(icell)+lo, 0, 0};
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index ec9765cb74..0c87fabd51 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -207,7 +207,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (Long i = item.get_global_id(0), stride = item.get_global_range(0);
-                         i < n; i += stride) {
+                         i < Long(n); i += stride) {
                         int n_active_threads = amrex::min(Long(n)-i+(Long)item.get_local_id(0),
                                                           (Long)item.get_local_range(0));
                         detail::call_f(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
@@ -224,7 +224,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (Long i = item.get_global_id(0), stride = item.get_global_range(0);
-                         i < n; i += stride) {
+                         i < Long(n); i += stride) {
                         detail::call_f(f, T(i), Gpu::Handler{&item});
                     }
                 });
@@ -239,7 +239,7 @@ template <int MT, typename L>
 void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = box.numPts();
+    const auto ncells = std::uint64_t(box.numPts());
     BoxIndexer indexer(box);
     const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
     int nthreads_per_block = ec.numThreads.x;
@@ -259,10 +259,10 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                          icell < ncells; icell += stride) {
                         auto [i, j, k] = indexer(icell);
-                        auto n_active_threads = int(amrex::min(ncells-icell+item.get_local_id(0),
-                                                               item.get_local_range(0)));
+                        int n_active_threads = amrex::min(ncells-icell+std::uint64_t(item.get_local_id(0)),
+                                                          std::uint64_t(item.get_local_range(0)));
                         detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
-                                               n_active_threads});
+                                                                n_active_threads});
                     }
                 });
             });
@@ -291,11 +291,8 @@ template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_in
 void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = box.numPts();
-    const auto lo  = amrex::lbound(box);
-    const auto len = amrex::length(box);
-    const auto lenxy = Long(len.x)*Long(len.y);
-    const auto lenx = Long(len.x);
+    const auto ncells = std::uint64_t(box.numPts());
+    BoxIndexer indexer(box);
     const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
     int nthreads_per_block = ec.numThreads.x;
     int nthreads_total = nthreads_per_block * ec.numBlocks.x;
@@ -311,14 +308,12 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n
                 [[sycl::reqd_work_group_size(1,1,MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
-                    for (Long icell = item.get_global_id(0), stride = item.get_global_range(0);
+                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                          icell < ncells; icell += stride) {
-                        Long k =  icell /   lenxy;
-                        Long j = (icell - k*lenxy) /   lenx;
-                        Long i = (icell - k*lenxy) - j*lenx;
-                        int n_active_threads = amrex::min(ncells-icell+(Long)item.get_local_id(0),
-                                                          (Long)item.get_local_range(0));
-                        detail::call_f(f, int(i)+lo.x, int(j)+lo.y, int(k)+lo.z, ncomp,
+                        auto [i, j, k] = indexer(icell);
+                        int n_active_threads = amrex::min(ncells-icell+std::uint64_t(item.get_local_id(0)),
+                                                          std::uint64_t(item.get_local_range(0)));
+                        detail::call_f(f, i, j, k, ncomp,
                                        Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
                                                     n_active_threads});
                     }
@@ -332,12 +327,10 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n
                 [[sycl::reqd_work_group_size(1,1,MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
-                    for (Long icell = item.get_global_id(0), stride = item.get_global_range(0);
+                    for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                          icell < ncells; icell += stride) {
-                        Long k =  icell /   lenxy;
-                        Long j = (icell - k*lenxy) /   lenx;
-                        Long i = (icell - k*lenxy) - j*lenx;
-                        detail::call_f(f,int(i)+lo.x,int(j)+lo.y,int(k)+lo.z,ncomp,Gpu::Handler{&item});
+                        auto [i, j, k] = indexer(icell);
+                        detail::call_f(f,i,j,k,ncomp,Gpu::Handler{&item});
                     }
                 });
             });
@@ -368,7 +361,7 @@ void ParallelForRNG (T n, L&& f) noexcept
                 int tid = item.get_global_id(0);
                 auto engine = engine_acc.load(tid);
                 RandomEngine rand_eng{&engine};
-                for (Long i = tid, stride = item.get_global_range(0); i < n; i += stride) {
+                for (Long i = tid, stride = item.get_global_range(0); i < Long(n); i += stride) {
                     f(T(i),rand_eng);
                 }
                 engine_acc.store(engine, tid);
@@ -384,11 +377,8 @@ template <typename L>
 void ParallelForRNG (Box const& box, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = box.numPts();
-    const auto lo  = amrex::lbound(box);
-    const auto len = amrex::length(box);
-    const auto lenxy = Long(len.x)*Long(len.y);
-    const auto lenx = Long(len.x);
+    const auto ncells = std::uint64_t(box.numPts());
+    BoxIndexer indexer(box);
     const auto ec = Gpu::ExecutionConfig(ncells);
     int nthreads_per_block = ec.numThreads.x;
     int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
@@ -406,12 +396,10 @@ void ParallelForRNG (Box const& box, L&& f) noexcept
                 int tid = item.get_global_id(0);
                 auto engine = engine_acc.load(tid);
                 RandomEngine rand_eng{&engine};
-                for (Long icell = tid, stride = item.get_global_range(0);
+                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                      icell < ncells; icell += stride) {
-                    Long k =  icell /   lenxy;
-                    Long j = (icell - k*lenxy) /   lenx;
-                    Long i = (icell - k*lenxy) - j*lenx;
-                    f(int(i)+lo.x,int(j)+lo.y,int(k)+lo.z,rand_eng);
+                    auto [i, j, k] = indexer(icell);
+                    f(i,j,k,rand_eng);
                 }
                 engine_acc.store(engine, tid);
             });
@@ -426,11 +414,8 @@ template <typename T, typename L, typename M=std::enable_if_t<std::is_integral<T
 void ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = box.numPts();
-    const auto lo  = amrex::lbound(box);
-    const auto len = amrex::length(box);
-    const auto lenxy = Long(len.x)*Long(len.y);
-    const auto lenx = Long(len.x);
+    const auto ncells = std::uint64_t(box.numPts());
+    BoxIndexer indexer(box);
     const auto ec = Gpu::ExecutionConfig(ncells);
     int nthreads_per_block = ec.numThreads.x;
     int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch());
@@ -445,16 +430,14 @@ void ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept
             [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
-                auto tid = item.get_global_id(0);
+                int tid = item.get_global_id(0);
                 auto engine = engine_acc.load(tid);
                 RandomEngine rand_eng{&engine};
-                for (Long icell = tid, stride = item.get_global_range(0);
+                for (std::uint64_t icell = tid, stride = item.get_global_range(0);
                      icell < ncells; icell += stride) {
-                    Long k =  icell /   lenxy;
-                    Long j = (icell - k*lenxy) /   lenx;
-                    Long i = (icell - k*lenxy) - j*lenx;
+                    auto [i, j, k] = indexer(icell);
                     for (T n = 0; n < ncomp; ++n) {
-                        f(int(i)+lo.x,int(j)+lo.y,int(k)+lo.z,n,rand_eng);
+                        f(i,j,k,n,rand_eng);
                     }
                 }
                 engine_acc.store(engine, tid);
@@ -470,17 +453,11 @@ template <int MT, typename L1, typename L2>
 void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
-    const auto ncells1 = box1.numPts();
-    const auto ncells2 = box2.numPts();
+    const auto ncells1 = std::uint64_t(box1.numPts());
+    const auto ncells2 = std::uint64_t(box2.numPts());
     const auto ncells = amrex::max(ncells1, ncells2);
-    const auto lo1  = amrex::lbound(box1);
-    const auto lo2  = amrex::lbound(box2);
-    const auto len1 = amrex::length(box1);
-    const auto len2 = amrex::length(box2);
-    const auto len1xy = Long(len1.x)*Long(len1.y);
-    const auto len2xy = Long(len2.x)*Long(len2.y);
-    const auto len1x = Long(len1.x);
-    const auto len2x = Long(len2.x);
+    BoxIndexer indexer1(box1);
+    BoxIndexer indexer2(box2);
     const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
     int nthreads_per_block = ec.numThreads.x;
     int nthreads_total = nthreads_per_block * ec.numBlocks.x;
@@ -493,19 +470,15 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b
             [[sycl::reqd_work_group_size(1,1,MT)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
-                for (Long icell = item.get_global_id(0), stride = item.get_global_range(0);
+                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                      icell < ncells; icell += stride) {
                     if (icell < ncells1) {
-                        Long k =  icell /   len1xy;
-                        Long j = (icell - k*len1xy) /   len1x;
-                        Long i = (icell - k*len1xy) - j*len1x;
-                        f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z);
+                        auto [i, j, k] = indexer1(icell);
+                        f1(i,j,k);
                     }
                     if (icell < ncells2) {
-                        Long k =  icell /   len2xy;
-                        Long j = (icell - k*len2xy) /   len2x;
-                        Long i = (icell - k*len2xy) - j*len2x;
-                        f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z);
+                        auto [i, j, k] = indexer2(icell);
+                        f2(i,j,k);
                     }
                 }
             });
@@ -521,22 +494,13 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                   L1&& f1, L2&& f2, L3&& f3) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
-    const auto ncells1 = box1.numPts();
-    const auto ncells2 = box2.numPts();
-    const auto ncells3 = box3.numPts();
+    const auto ncells1 = std::uint64_t(box1.numPts());
+    const auto ncells2 = std::uint64_t(box2.numPts());
+    const auto ncells3 = std::uint64_t(box3.numPts());
     const auto ncells = amrex::max(ncells1, ncells2, ncells3);
-    const auto lo1  = amrex::lbound(box1);
-    const auto lo2  = amrex::lbound(box2);
-    const auto lo3  = amrex::lbound(box3);
-    const auto len1 = amrex::length(box1);
-    const auto len2 = amrex::length(box2);
-    const auto len3 = amrex::length(box3);
-    const auto len1xy = Long(len1.x)*Long(len1.y);
-    const auto len2xy = Long(len2.x)*Long(len2.y);
-    const auto len3xy = Long(len3.x)*Long(len3.y);
-    const auto len1x = Long(len1.x);
-    const auto len2x = Long(len2.x);
-    const auto len3x = Long(len3.x);
+    BoxIndexer indexer1(box1);
+    BoxIndexer indexer2(box2);
+    BoxIndexer indexer3(box3);
     const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
     int nthreads_per_block = ec.numThreads.x;
     int nthreads_total = nthreads_per_block * ec.numBlocks.x;
@@ -549,25 +513,19 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
             [[sycl::reqd_work_group_size(1,1,MT)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
-                for (Long icell = item.get_global_id(0), stride = item.get_global_range(0);
+                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                      icell < ncells; icell += stride) {
                     if (icell < ncells1) {
-                        Long k =  icell /   len1xy;
-                        Long j = (icell - k*len1xy) /   len1x;
-                        Long i = (icell - k*len1xy) - j*len1x;
-                        f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z);
+                        auto [i, j, k] = indexer1(icell);
+                        f1(i,j,k);
                     }
                     if (icell < ncells2) {
-                        Long k =  icell /   len2xy;
-                        Long j = (icell - k*len2xy) /   len2x;
-                        Long i = (icell - k*len2xy) - j*len2x;
-                        f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z);
+                        auto [i, j, k] = indexer2(icell);
+                        f2(i,j,k);
                     }
                     if (icell < ncells3) {
-                        Long k =  icell /   len3xy;
-                        Long j = (icell - k*len3xy) /   len3x;
-                        Long i = (icell - k*len3xy) - j*len3x;
-                        f3(int(i)+lo3.x,int(j)+lo3.y,int(k)+lo3.z);
+                        auto [i, j, k] = indexer3(icell);
+                        f3(i,j,k);
                     }
                 }
             });
@@ -585,17 +543,11 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                   Box const& box2, T2 ncomp2, L2&& f2) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
-    const auto ncells1 = box1.numPts();
-    const auto ncells2 = box2.numPts();
+    const auto ncells1 = std::uint64_t(box1.numPts());
+    const auto ncells2 = std::uint64_t(box2.numPts());
     const auto ncells = amrex::max(ncells1, ncells2);
-    const auto lo1  = amrex::lbound(box1);
-    const auto lo2  = amrex::lbound(box2);
-    const auto len1 = amrex::length(box1);
-    const auto len2 = amrex::length(box2);
-    const auto len1xy = Long(len1.x)*Long(len1.y);
-    const auto len2xy = Long(len2.x)*Long(len2.y);
-    const auto len1x = Long(len1.x);
-    const auto len2x = Long(len2.x);
+    BoxIndexer indexer1(box1);
+    BoxIndexer indexer2(box2);
     const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
     int nthreads_per_block = ec.numThreads.x;
     int nthreads_total = nthreads_per_block * ec.numBlocks.x;
@@ -608,22 +560,18 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
             [[sycl::reqd_work_group_size(1,1,MT)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
-                for (Long icell = item.get_global_id(0), stride = item.get_global_range(0);
+                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                      icell < ncells; icell += stride) {
                     if (icell < ncells1) {
-                        Long k =  icell /   len1xy;
-                        Long j = (icell - k*len1xy) /   len1x;
-                        Long i = (icell - k*len1xy) - j*len1x;
+                        auto [i, j, k] = indexer1(icell);
                         for (T1 n = 0; n < ncomp1; ++n) {
-                            f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z,n);
+                            f1(i,j,k,n);
                         }
                     }
                     if (icell < ncells2) {
-                        Long k =  icell /   len2xy;
-                        Long j = (icell - k*len2xy) /   len2x;
-                        Long i = (icell - k*len2xy) - j*len2x;
+                        auto [i, j, k] = indexer2(icell);
                         for (T2 n = 0; n < ncomp2; ++n) {
-                            f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z,n);
+                            f2(i,j,k,n);
                         }
                     }
                 }
@@ -644,22 +592,13 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
                   Box const& box3, T3 ncomp3, L3&& f3) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
-    const auto ncells1 = box1.numPts();
-    const auto ncells2 = box2.numPts();
-    const auto ncells3 = box3.numPts();
+    const auto ncells1 = std::uint64_t(box1.numPts());
+    const auto ncells2 = std::uint64_t(box2.numPts());
+    const auto ncells3 = std::uint64_t(box3.numPts());
     const auto ncells = amrex::max(ncells1, ncells2, ncells3);
-    const auto lo1  = amrex::lbound(box1);
-    const auto lo2  = amrex::lbound(box2);
-    const auto lo3  = amrex::lbound(box3);
-    const auto len1 = amrex::length(box1);
-    const auto len2 = amrex::length(box2);
-    const auto len3 = amrex::length(box3);
-    const auto len1xy = Long(len1.x)*Long(len1.y);
-    const auto len2xy = Long(len2.x)*Long(len2.y);
-    const auto len3xy = Long(len3.x)*Long(len3.y);
-    const auto len1x = Long(len1.x);
-    const auto len2x = Long(len2.x);
-    const auto len3x = Long(len3.x);
+    BoxIndexer indexer1(box1);
+    BoxIndexer indexer2(box2);
+    BoxIndexer indexer3(box3);
     const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
     int nthreads_per_block = ec.numThreads.x;
     int nthreads_total = nthreads_per_block * ec.numBlocks.x;
@@ -672,30 +611,24 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
             [[sycl::reqd_work_group_size(1,1,MT)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
-                for (Long icell = item.get_global_id(0), stride = item.get_global_range(0);
+                for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
                      icell < ncells; icell += stride) {
                     if (icell < ncells1) {
-                        Long k =  icell /   len1xy;
-                        Long j = (icell - k*len1xy) /   len1x;
-                        Long i = (icell - k*len1xy) - j*len1x;
+                        auto [i, j, k] = indexer1(icell);
                         for (T1 n = 0; n < ncomp1; ++n) {
-                            f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z,n);
+                            f1(i,j,k,n);
                         }
                     }
                     if (icell < ncells2) {
-                        Long k =  icell /   len2xy;
-                        Long j = (icell - k*len2xy) /   len2x;
-                        Long i = (icell - k*len2xy) - j*len2x;
+                        auto [i, j, k] = indexer2(icell);
                         for (T2 n = 0; n < ncomp2; ++n) {
-                            f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z,n);
+                            f2(i,j,k,n);
                         }
                     }
                     if (icell < ncells3) {
-                        Long k =  icell /   len3xy;
-                        Long j = (icell - k*len3xy) /   len3x;
-                        Long i = (icell - k*len3xy) - j*len3x;
+                        auto [i, j, k] = indexer3(icell);
                         for (T3 n = 0; n < ncomp3; ++n) {
-                            f3(int(i)+lo3.x,int(j)+lo3.y,int(k)+lo3.z,n);
+                            f3(i,j,k,n);
                         }
                     }
                 }
@@ -818,14 +751,13 @@ template <int MT, typename T, typename L, typename M=std::enable_if_t<std::is_in
 std::enable_if_t<MaybeDeviceRunnable<L>::value>
 ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept
 {
-    using IT = std::conditional_t<(sizeof(T) < sizeof(Long)), Long, T>;
     if (amrex::isEmpty(n)) { return; }
     const auto ec = Gpu::makeExecutionConfig<MT>(n);
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        for (IT i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
-             i < n; i += stride) {
-            detail::call_f(f, T(i), (n-i+(IT)threadIdx.x));
+        for (Long i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+             i < Long(n); i += stride) {
+            detail::call_f(f, T(i), (Long(n)-i+(Long)threadIdx.x));
         }
     });
     AMREX_GPU_ERROR_CHECK();
@@ -845,7 +777,7 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept
              icell < ncells; icell += stride)
         {
             auto [i, j, k] = indexer(icell);
-            detail::call_f(f, i, j, k, (ncells-icell+(Long)threadIdx.x));
+            detail::call_f(f, i, j, k, (ncells-icell+(std::uint64_t)threadIdx.x));
         }
     });
     AMREX_GPU_ERROR_CHECK();
@@ -856,20 +788,15 @@ std::enable_if_t<MaybeDeviceRunnable<L>::value>
 ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
-    const auto ncells = box.numPts();
-    const auto lo  = amrex::lbound(box);
-    const auto len = amrex::length(box);
-    const auto lenxy = Long(len.x)*Long(len.y);
-    const auto lenx = Long(len.x);
+    const auto ncells = std::uint64_t(box.numPts());
+    BoxIndexer indexer(box);
     const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        for (Long icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+        for (std::uint64_t icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
              icell < ncells; icell += stride) {
-            Long k =  icell /   lenxy;
-            Long j = (icell - k*lenxy) /   lenx;
-            Long i = (icell - k*lenxy) - j*lenx;
-            detail::call_f(f, int(i)+lo.x, int(j)+lo.y, int(k)+lo.z, ncomp, (ncells-icell+(Long)threadIdx.x));
+            auto [i, j, k] = indexer(icell);
+            detail::call_f(f, i, j, k, ncomp, (ncells-icell+(std::uint64_t)threadIdx.x));
         }
     });
     AMREX_GPU_ERROR_CHECK();
@@ -886,9 +813,9 @@ ParallelForRNG (T n, L&& f) noexcept
                         amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        auto tid = blockDim.x*blockIdx.x+threadIdx.x;
+        int tid = blockDim.x*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (Long i = tid, stride = blockDim.x*gridDim.x; i < n; i += stride) {
+        for (Long i = tid, stride = blockDim.x*gridDim.x; i < Long(n); i += stride) {
             f(T(i),engine);
         }
     });
@@ -902,11 +829,8 @@ ParallelForRNG (Box const& box, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
     randState_t* rand_state = getRandState();
-    const auto ncells = box.numPts();
-    const auto lo  = amrex::lbound(box);
-    const auto len = amrex::length(box);
-    const auto lenxy = Long(len.x)*Long(len.y);
-    const auto lenx = Long(len.x);
+    const auto ncells = std::uint64_t(box.numPts());
+    BoxIndexer indexer(box);
     const auto ec = Gpu::ExecutionConfig(ncells);
     AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                         amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
@@ -914,11 +838,9 @@ ParallelForRNG (Box const& box, L&& f) noexcept
     [=] AMREX_GPU_DEVICE () noexcept {
         auto tid = blockDim.x*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (Long icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) {
-            Long k =  icell /   lenxy;
-            Long j = (icell - k*lenxy) /   lenx;
-            Long i = (icell - k*lenxy) - j*lenx;
-            f(int(i)+lo.x,int(j)+lo.y,int(k)+lo.z,engine);
+        for (std::uint64_t icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) {
+            auto [i, j, k] = indexer(icell);
+            f(i,j,k,engine);
         }
     });
     Gpu::streamSynchronize(); // To avoid multiple streams using RNG
@@ -931,11 +853,8 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept
 {
     if (amrex::isEmpty(box)) { return; }
     randState_t* rand_state = getRandState();
-    const auto ncells = box.numPts();
-    const auto lo  = amrex::lbound(box);
-    const auto len = amrex::length(box);
-    const auto lenxy = Long(len.x)*Long(len.y);
-    const auto lenx = Long(len.x);
+    const auto ncells = std::uint64_t(box.numPts());
+    BoxIndexer indexer(box);
     const auto ec = Gpu::ExecutionConfig(ncells);
     AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS,
                         amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
@@ -943,15 +862,10 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept
     [=] AMREX_GPU_DEVICE () noexcept {
         auto tid = blockDim.x*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (Long icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) {
-            Long k =  icell /   lenxy;
-            Long j = (icell - k*lenxy) /   lenx;
-            Long i = (icell - k*lenxy) - j*lenx;
-            i += lo.x;
-            j += lo.y;
-            k += lo.z;
+        for (std::uint64_t icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) {
+            auto [i, j, k] = indexer(icell);
             for (T n = 0; n < ncomp; ++n) {
-                f(int(i)+lo.x,int(j)+lo.y,int(k)+lo.z,n,engine);
+                f(i,j,k,n,engine);
             }
         }
     });
@@ -965,33 +879,23 @@ ParallelFor (Gpu::KernelInfo const&,
              Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
-    const auto ncells1 = box1.numPts();
-    const auto ncells2 = box2.numPts();
+    const auto ncells1 = std::uint64_t(box1.numPts());
+    const auto ncells2 = std::uint64_t(box2.numPts());
     const auto ncells = amrex::max(ncells1, ncells2);
-    const auto lo1  = amrex::lbound(box1);
-    const auto lo2  = amrex::lbound(box2);
-    const auto len1 = amrex::length(box1);
-    const auto len2 = amrex::length(box2);
-    const auto len1xy = Long(len1.x)*Long(len1.y);
-    const auto len2xy = Long(len2.x)*Long(len2.y);
-    const auto len1x = Long(len1.x);
-    const auto len2x = Long(len2.x);
+    BoxIndexer indexer1(box1);
+    BoxIndexer indexer2(box2);
     const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        for (Long icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+        for (std::uint64_t icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < ncells1) {
-                Long k =  icell /   len1xy;
-                Long j = (icell - k*len1xy) /   len1x;
-                Long i = (icell - k*len1xy) - j*len1x;
-                f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z);
+                auto [i, j, k] = indexer1(icell);
+                f1(i,j,k);
             }
             if (icell < ncells2) {
-                Long k =  icell /   len2xy;
-                Long j = (icell - k*len2xy) /   len2x;
-                Long i = (icell - k*len2xy) - j*len2x;
-                f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z);
+                auto [i, j, k] = indexer2(icell);
+                f2(i,j,k);
             }
         }
     });
@@ -1005,44 +909,29 @@ ParallelFor (Gpu::KernelInfo const&,
              L1&& f1, L2&& f2, L3&& f3) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
-    const auto ncells1 = box1.numPts();
-    const auto ncells2 = box2.numPts();
-    const auto ncells3 = box3.numPts();
+    const auto ncells1 = std::uint64_t(box1.numPts());
+    const auto ncells2 = std::uint64_t(box2.numPts());
+    const auto ncells3 = std::uint64_t(box3.numPts());
     const auto ncells = amrex::max(ncells1, ncells2, ncells3);
-    const auto lo1  = amrex::lbound(box1);
-    const auto lo2  = amrex::lbound(box2);
-    const auto lo3  = amrex::lbound(box3);
-    const auto len1 = amrex::length(box1);
-    const auto len2 = amrex::length(box2);
-    const auto len3 = amrex::length(box3);
-    const auto len1xy = Long(len1.x)*Long(len1.y);
-    const auto len2xy = Long(len2.x)*Long(len2.y);
-    const auto len3xy = Long(len3.x)*Long(len3.y);
-    const auto len1x = Long(len1.x);
-    const auto len2x = Long(len2.x);
-    const auto len3x = Long(len3.x);
+    BoxIndexer indexer1(box1);
+    BoxIndexer indexer2(box2);
+    BoxIndexer indexer3(box3);
     const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        for (Long icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+        for (std::uint64_t icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < ncells1) {
-                Long k =  icell /   len1xy;
-                Long j = (icell - k*len1xy) /   len1x;
-                Long i = (icell - k*len1xy) - j*len1x;
-                f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z);
+                auto [i, j, k] = indexer1(icell);
+                f1(i,j,k);
             }
             if (icell < ncells2) {
-                Long k =  icell /   len2xy;
-                Long j = (icell - k*len2xy) /   len2x;
-                Long i = (icell - k*len2xy) - j*len2x;
-                f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z);
+                auto [i, j, k] = indexer2(icell);
+                f2(i,j,k);
             }
             if (icell < ncells3) {
-                Long k =  icell /   len3xy;
-                Long j = (icell - k*len3xy) /   len3x;
-                Long i = (icell - k*len3xy) - j*len3x;
-                f3(int(i)+lo3.x,int(j)+lo3.y,int(k)+lo3.z);
+                auto [i, j, k] = indexer3(icell);
+                f3(i,j,k);
             }
         }
     });
@@ -1058,36 +947,26 @@ ParallelFor (Gpu::KernelInfo const&,
              Box const& box2, T2 ncomp2, L2&& f2) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; }
-    const auto ncells1 = box1.numPts();
-    const auto ncells2 = box2.numPts();
+    const auto ncells1 = std::uint64_t(box1.numPts());
+    const auto ncells2 = std::uint64_t(box2.numPts());
     const auto ncells = amrex::max(ncells1, ncells2);
-    const auto lo1  = amrex::lbound(box1);
-    const auto lo2  = amrex::lbound(box2);
-    const auto len1 = amrex::length(box1);
-    const auto len2 = amrex::length(box2);
-    const auto len1xy = Long(len1.x)*Long(len1.y);
-    const auto len2xy = Long(len2.x)*Long(len2.y);
-    const auto len1x = Long(len1.x);
-    const auto len2x = Long(len2.x);
+    BoxIndexer indexer1(box1);
+    BoxIndexer indexer2(box2);
     const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        for (Long icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+        for (std::uint64_t icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < ncells1) {
-                Long k =  icell /   len1xy;
-                Long j = (icell - k*len1xy) /   len1x;
-                Long i = (icell - k*len1xy) - j*len1x;
+                auto [i, j, k] = indexer1(icell);
                 for (T1 n = 0; n < ncomp1; ++n) {
-                    f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z,n);
+                    f1(i,j,k,n);
                 }
             }
             if (icell < ncells2) {
-                Long k =  icell /   len2xy;
-                Long j = (icell - k*len2xy) /   len2x;
-                Long i = (icell - k*len2xy) - j*len2x;
+                auto [i, j, k] = indexer2(icell);
                 for (T2 n = 0; n < ncomp2; ++n) {
-                    f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z,n);
+                    f2(i,j,k,n);
                 }
             }
         }
@@ -1106,49 +985,34 @@ ParallelFor (Gpu::KernelInfo const&,
              Box const& box3, T3 ncomp3, L3&& f3) noexcept
 {
     if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; }
-    const auto ncells1 = box1.numPts();
-    const auto ncells2 = box2.numPts();
-    const auto ncells3 = box3.numPts();
+    const auto ncells1 = std::uint64_t(box1.numPts());
+    const auto ncells2 = std::uint64_t(box2.numPts());
+    const auto ncells3 = std::uint64_t(box3.numPts());
     const auto ncells = amrex::max(ncells1, ncells2, ncells3);
-    const auto lo1  = amrex::lbound(box1);
-    const auto lo2  = amrex::lbound(box2);
-    const auto lo3  = amrex::lbound(box3);
-    const auto len1 = amrex::length(box1);
-    const auto len2 = amrex::length(box2);
-    const auto len3 = amrex::length(box3);
-    const auto len1xy = Long(len1.x)*Long(len1.y);
-    const auto len2xy = Long(len2.x)*Long(len2.y);
-    const auto len3xy = Long(len3.x)*Long(len3.y);
-    const auto len1x = Long(len1.x);
-    const auto len2x = Long(len2.x);
-    const auto len3x = Long(len3.x);
+    BoxIndexer indexer1(box1);
+    BoxIndexer indexer2(box2);
+    BoxIndexer indexer3(box3);
     const auto ec = Gpu::makeExecutionConfig<MT>(ncells);
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        for (Long icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+        for (std::uint64_t icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < ncells1) {
-                Long k =  icell /   len1xy;
-                Long j = (icell - k*len1xy) /   len1x;
-                Long i = (icell - k*len1xy) - j*len1x;
+                auto [i, j, k] = indexer1(icell);
                 for (T1 n = 0; n < ncomp1; ++n) {
-                    f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z,n);
+                    f1(i,j,k,n);
                 }
             }
             if (icell < ncells2) {
-                Long k =  icell /   len2xy;
-                Long j = (icell - k*len2xy) /   len2x;
-                Long i = (icell - k*len2xy) - j*len2x;
+                auto [i, j, k] = indexer2(icell);
                 for (T2 n = 0; n < ncomp2; ++n) {
-                    f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z,n);
+                    f2(i,j,k,n);
                 }
             }
             if (icell < ncells3) {
-                Long k =  icell /   len3xy;
-                Long j = (icell - k*len3xy) /   len3x;
-                Long i = (icell - k*len3xy) - j*len3x;
+                auto [i, j, k] = indexer3(icell);
                 for (T3 n = 0; n < ncomp3; ++n) {
-                    f3(int(i)+lo3.x,int(j)+lo3.y,int(k)+lo3.z,n);
+                    f3(i,j,k,n);
                 }
             }
         }
diff --git a/Src/Base/AMReX_Math.H b/Src/Base/AMReX_Math.H
index 0bf783af5e..f4b1681b01 100644
--- a/Src/Base/AMReX_Math.H
+++ b/Src/Base/AMReX_Math.H
@@ -210,16 +210,17 @@ constexpr T powi (T x) noexcept
 }
 
 #if defined(AMREX_INT128_SUPPORTED)
-AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 std::uint64_t umulhi (std::uint64_t a, std::uint64_t b)
 {
-#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
-    return __umul64hi(a, b);
-#elif defined(AMREX_USE_SYCL)
+#if defined(AMREX_USE_SYCL)
     return sycl::mul_hi(a,b);
 #else
-    auto tmp = amrex::UInt128_t(a) * amrex::UInt128_t(b);
-    return std::uint64_t(tmp >> 64);
+    AMREX_IF_ON_DEVICE(( return __umul64hi(a, b); ))
+    AMREX_IF_ON_HOST((
+        auto tmp = amrex::UInt128_t(a) * amrex::UInt128_t(b);
+        return std::uint64_t(tmp >> 64);
+    ))
 #endif
 }
 #endif
@@ -328,14 +329,11 @@ struct FastDivmodU64
     std::uint64_t divide (std::uint64_t dividend) const
     {
 #if defined(AMREX_INT128_SUPPORTED)
-        AMREX_IF_ON_DEVICE((
-            auto x = dividend;
-            if (multiplier) {
-                x = amrex::Math::umulhi(dividend + round_up, multiplier);
-            }
-            return (x >> shift_right);
-        ))
-        AMREX_IF_ON_HOST(( return dividend / divisor; ))
+        auto x = dividend;
+        if (multiplier) {
+            x = amrex::Math::umulhi(dividend + round_up, multiplier);
+        }
+        return (x >> shift_right);
 #else
         return dividend / divisor;
 #endif