From a0e49639abd1e2d579d242c4ed115f7761442059 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Fri, 2 Feb 2024 11:10:41 -0800 Subject: [PATCH] GpuLaunchFuntsG done --- Src/Base/AMReX_Box.H | 6 +- Src/Base/AMReX_GpuLaunchFunctsG.H | 410 ++++++++++-------------------- Src/Base/AMReX_Math.H | 26 +- 3 files changed, 152 insertions(+), 290 deletions(-) diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H index f89e1baa5c..2a6eb6d6e7 100644 --- a/Src/Base/AMReX_Box.H +++ b/Src/Base/AMReX_Box.H @@ -1849,7 +1849,7 @@ struct BoxIndexer lo (box.smallEnd()) {} - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 operator() (std::uint64_t icell) const { std::uint64_t x, y, z, rem; @@ -1868,7 +1868,7 @@ struct BoxIndexer lo (box.smallEnd()) {} - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 operator() (std::uint64_t icell) const { std::uint64_t x, y; @@ -1884,7 +1884,7 @@ struct BoxIndexer : lo(box.smallEnd(0)) {} - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 operator() (std::uint64_t icell) const { return {int(icell)+lo, 0, 0}; diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index ec9765cb74..0c87fabd51 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -207,7 +207,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { for (Long i = item.get_global_id(0), stride = item.get_global_range(0); - i < n; i += stride) { + i < Long(n); i += stride) { int n_active_threads = amrex::min(Long(n)-i+(Long)item.get_local_id(0), (Long)item.get_local_range(0)); detail::call_f(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr().get(), @@ -224,7 +224,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { for (Long i = item.get_global_id(0), stride = item.get_global_range(0); - i < n; i += stride) { + i < Long(n); i += stride) { detail::call_f(f, T(i), Gpu::Handler{&item}); } }); @@ -239,7 +239,7 @@ template void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } - const auto ncells = box.numPts(); + const auto ncells = std::uint64_t(box.numPts()); BoxIndexer indexer(box); const auto ec = Gpu::makeExecutionConfig(ncells); int nthreads_per_block = ec.numThreads.x; @@ -259,10 +259,10 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { auto [i, j, k] = indexer(icell); - auto n_active_threads = int(amrex::min(ncells-icell+item.get_local_id(0), - item.get_local_range(0))); + int n_active_threads = amrex::min(ncells-icell+std::uint64_t(item.get_local_id(0)), + std::uint64_t(item.get_local_range(0))); detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), - n_active_threads}); + n_active_threads}); } }); }); @@ -291,11 +291,8 @@ template (ncells); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; @@ -311,14 +308,12 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (Long icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - Long k = icell / lenxy; - Long j = (icell - k*lenxy) / lenx; - Long i = (icell - k*lenxy) - j*lenx; - int n_active_threads = amrex::min(ncells-icell+(Long)item.get_local_id(0), - (Long)item.get_local_range(0)); - detail::call_f(f, int(i)+lo.x, int(j)+lo.y, int(k)+lo.z, ncomp, + auto [i, j, k] = indexer(icell); + int n_active_threads = amrex::min(ncells-icell+std::uint64_t(item.get_local_id(0)), + std::uint64_t(item.get_local_range(0))); + detail::call_f(f, i, j, k, ncomp, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } @@ -332,12 +327,10 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (Long icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - Long k = icell / lenxy; - Long j = (icell - k*lenxy) / lenx; - Long i = (icell - k*lenxy) - j*lenx; - detail::call_f(f,int(i)+lo.x,int(j)+lo.y,int(k)+lo.z,ncomp,Gpu::Handler{&item}); + auto [i, j, k] = indexer(icell); + detail::call_f(f,i,j,k,ncomp,Gpu::Handler{&item}); } }); }); @@ -368,7 +361,7 @@ void ParallelForRNG (T n, L&& f) noexcept int tid = item.get_global_id(0); auto engine = engine_acc.load(tid); RandomEngine rand_eng{&engine}; - for (Long i = tid, stride = item.get_global_range(0); i < n; i += stride) { + for (Long i = tid, stride = item.get_global_range(0); i < Long(n); i += stride) { f(T(i),rand_eng); } engine_acc.store(engine, tid); @@ -384,11 +377,8 @@ template void ParallelForRNG (Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } - const auto ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = Long(len.x)*Long(len.y); - const auto lenx = Long(len.x); + const auto ncells = std::uint64_t(box.numPts()); + BoxIndexer indexer(box); const auto ec = Gpu::ExecutionConfig(ncells); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); @@ -406,12 +396,10 @@ void ParallelForRNG (Box const& box, L&& f) noexcept int tid = item.get_global_id(0); auto engine = engine_acc.load(tid); RandomEngine rand_eng{&engine}; - for (Long icell = tid, stride = item.get_global_range(0); + for (std::uint64_t icell = tid, stride = item.get_global_range(0); icell < ncells; icell += stride) { - Long k = icell / lenxy; - Long j = (icell - k*lenxy) / lenx; - Long i = (icell - k*lenxy) - j*lenx; - f(int(i)+lo.x,int(j)+lo.y,int(k)+lo.z,rand_eng); + auto [i, j, k] = indexer(icell); + f(i,j,k,rand_eng); } engine_acc.store(engine, tid); }); @@ -426,11 +414,8 @@ template void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - const auto ncells1 = box1.numPts(); - const auto ncells2 = box2.numPts(); + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); const auto ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = Long(len1.x)*Long(len1.y); - const auto len2xy = Long(len2.x)*Long(len2.y); - const auto len1x = Long(len1.x); - const auto len2x = Long(len2.x); + BoxIndexer indexer1(box1); + BoxIndexer indexer2(box2); const auto ec = Gpu::makeExecutionConfig(ncells); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; @@ -493,19 +470,15 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (Long icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < ncells1) { - Long k = icell / len1xy; - Long j = (icell - k*len1xy) / len1x; - Long i = (icell - k*len1xy) - j*len1x; - f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z); + auto [i, j, k] = indexer1(icell); + f1(i,j,k); } if (icell < ncells2) { - Long k = icell / len2xy; - Long j = (icell - k*len2xy) / len2x; - Long i = (icell - k*len2xy) - j*len2x; - f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z); + auto [i, j, k] = indexer2(icell); + f2(i,j,k); } } }); @@ -521,22 +494,13 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - const auto ncells1 = box1.numPts(); - const auto ncells2 = box2.numPts(); - const auto ncells3 = box3.numPts(); + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells3 = std::uint64_t(box3.numPts()); const auto ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = Long(len1.x)*Long(len1.y); - const auto len2xy = Long(len2.x)*Long(len2.y); - const auto len3xy = Long(len3.x)*Long(len3.y); - const auto len1x = Long(len1.x); - const auto len2x = Long(len2.x); - const auto len3x = Long(len3.x); + BoxIndexer indexer1(box1); + BoxIndexer indexer2(box2); + BoxIndexer indexer3(box3); const auto ec = Gpu::makeExecutionConfig(ncells); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; @@ -549,25 +513,19 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (Long icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < ncells1) { - Long k = icell / len1xy; - Long j = (icell - k*len1xy) / len1x; - Long i = (icell - k*len1xy) - j*len1x; - f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z); + auto [i, j, k] = indexer1(icell); + f1(i,j,k); } if (icell < ncells2) { - Long k = icell / len2xy; - Long j = (icell - k*len2xy) / len2x; - Long i = (icell - k*len2xy) - j*len2x; - f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z); + auto [i, j, k] = indexer2(icell); + f2(i,j,k); } if (icell < ncells3) { - Long k = icell / len3xy; - Long j = (icell - k*len3xy) / len3x; - Long i = (icell - k*len3xy) - j*len3x; - f3(int(i)+lo3.x,int(j)+lo3.y,int(k)+lo3.z); + auto [i, j, k] = indexer3(icell); + f3(i,j,k); } } }); @@ -585,17 +543,11 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - const auto ncells1 = box1.numPts(); - const auto ncells2 = box2.numPts(); + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); const auto ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = Long(len1.x)*Long(len1.y); - const auto len2xy = Long(len2.x)*Long(len2.y); - const auto len1x = Long(len1.x); - const auto len2x = Long(len2.x); + BoxIndexer indexer1(box1); + BoxIndexer indexer2(box2); const auto ec = Gpu::makeExecutionConfig(ncells); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; @@ -608,22 +560,18 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (Long icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < ncells1) { - Long k = icell / len1xy; - Long j = (icell - k*len1xy) / len1x; - Long i = (icell - k*len1xy) - j*len1x; + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { - f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z,n); + f1(i,j,k,n); } } if (icell < ncells2) { - Long k = icell / len2xy; - Long j = (icell - k*len2xy) / len2x; - Long i = (icell - k*len2xy) - j*len2x; + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { - f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z,n); + f2(i,j,k,n); } } } @@ -644,22 +592,13 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - const auto ncells1 = box1.numPts(); - const auto ncells2 = box2.numPts(); - const auto ncells3 = box3.numPts(); + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells3 = std::uint64_t(box3.numPts()); const auto ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = Long(len1.x)*Long(len1.y); - const auto len2xy = Long(len2.x)*Long(len2.y); - const auto len3xy = Long(len3.x)*Long(len3.y); - const auto len1x = Long(len1.x); - const auto len2x = Long(len2.x); - const auto len3x = Long(len3.x); + BoxIndexer indexer1(box1); + BoxIndexer indexer2(box2); + BoxIndexer indexer3(box3); const auto ec = Gpu::makeExecutionConfig(ncells); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; @@ -672,30 +611,24 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (Long icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < ncells1) { - Long k = icell / len1xy; - Long j = (icell - k*len1xy) / len1x; - Long i = (icell - k*len1xy) - j*len1x; + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { - f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z,n); + f1(i,j,k,n); } } if (icell < ncells2) { - Long k = icell / len2xy; - Long j = (icell - k*len2xy) / len2x; - Long i = (icell - k*len2xy) - j*len2x; + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { - f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z,n); + f2(i,j,k,n); } } if (icell < ncells3) { - Long k = icell / len3xy; - Long j = (icell - k*len3xy) / len3x; - Long i = (icell - k*len3xy) - j*len3x; + auto [i, j, k] = indexer3(icell); for (T3 n = 0; n < ncomp3; ++n) { - f3(int(i)+lo3.x,int(j)+lo3.y,int(k)+lo3.z,n); + f3(i,j,k,n); } } } @@ -818,14 +751,13 @@ template ::value> ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { - using IT = std::conditional_t<(sizeof(T) < sizeof(Long)), Long, T>; if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (IT i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - i < n; i += stride) { - detail::call_f(f, T(i), (n-i+(IT)threadIdx.x)); + for (Long i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + i < Long(n); i += stride) { + detail::call_f(f, T(i), (Long(n)-i+(Long)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -845,7 +777,7 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept icell < ncells; icell += stride) { auto [i, j, k] = indexer(icell); - detail::call_f(f, i, j, k, (ncells-icell+(Long)threadIdx.x)); + detail::call_f(f, i, j, k, (ncells-icell+(std::uint64_t)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -856,20 +788,15 @@ std::enable_if_t::value> ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } - const auto ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = Long(len.x)*Long(len.y); - const auto lenx = Long(len.x); + const auto ncells = std::uint64_t(box.numPts()); + BoxIndexer indexer(box); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (Long icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { - Long k = icell / lenxy; - Long j = (icell - k*lenxy) / lenx; - Long i = (icell - k*lenxy) - j*lenx; - detail::call_f(f, int(i)+lo.x, int(j)+lo.y, int(k)+lo.z, ncomp, (ncells-icell+(Long)threadIdx.x)); + auto [i, j, k] = indexer(icell); + detail::call_f(f, i, j, k, ncomp, (ncells-icell+(std::uint64_t)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -886,9 +813,9 @@ ParallelForRNG (T n, L&& f) noexcept amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - auto tid = blockDim.x*blockIdx.x+threadIdx.x; + int tid = blockDim.x*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (Long i = tid, stride = blockDim.x*gridDim.x; i < n; i += stride) { + for (Long i = tid, stride = blockDim.x*gridDim.x; i < Long(n); i += stride) { f(T(i),engine); } }); @@ -902,11 +829,8 @@ ParallelForRNG (Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } randState_t* rand_state = getRandState(); - const auto ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = Long(len.x)*Long(len.y); - const auto lenx = Long(len.x); + const auto ncells = std::uint64_t(box.numPts()); + BoxIndexer indexer(box); const auto ec = Gpu::ExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), @@ -914,11 +838,9 @@ ParallelForRNG (Box const& box, L&& f) noexcept [=] AMREX_GPU_DEVICE () noexcept { auto tid = blockDim.x*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (Long icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { - Long k = icell / lenxy; - Long j = (icell - k*lenxy) / lenx; - Long i = (icell - k*lenxy) - j*lenx; - f(int(i)+lo.x,int(j)+lo.y,int(k)+lo.z,engine); + for (std::uint64_t icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { + auto [i, j, k] = indexer(icell); + f(i,j,k,engine); } }); Gpu::streamSynchronize(); // To avoid multiple streams using RNG @@ -931,11 +853,8 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } randState_t* rand_state = getRandState(); - const auto ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = Long(len.x)*Long(len.y); - const auto lenx = Long(len.x); + const auto ncells = std::uint64_t(box.numPts()); + BoxIndexer indexer(box); const auto ec = Gpu::ExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), @@ -943,15 +862,10 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept [=] AMREX_GPU_DEVICE () noexcept { auto tid = blockDim.x*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (Long icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { - Long k = icell / lenxy; - Long j = (icell - k*lenxy) / lenx; - Long i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { + auto [i, j, k] = indexer(icell); for (T n = 0; n < ncomp; ++n) { - f(int(i)+lo.x,int(j)+lo.y,int(k)+lo.z,n,engine); + f(i,j,k,n,engine); } } }); @@ -965,33 +879,23 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - const auto ncells1 = box1.numPts(); - const auto ncells2 = box2.numPts(); + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); const auto ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = Long(len1.x)*Long(len1.y); - const auto len2xy = Long(len2.x)*Long(len2.y); - const auto len1x = Long(len1.x); - const auto len2x = Long(len2.x); + BoxIndexer indexer1(box1); + BoxIndexer indexer2(box2); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (Long icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { if (icell < ncells1) { - Long k = icell / len1xy; - Long j = (icell - k*len1xy) / len1x; - Long i = (icell - k*len1xy) - j*len1x; - f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z); + auto [i, j, k] = indexer1(icell); + f1(i,j,k); } if (icell < ncells2) { - Long k = icell / len2xy; - Long j = (icell - k*len2xy) / len2x; - Long i = (icell - k*len2xy) - j*len2x; - f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z); + auto [i, j, k] = indexer2(icell); + f2(i,j,k); } } }); @@ -1005,44 +909,29 @@ ParallelFor (Gpu::KernelInfo const&, L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - const auto ncells1 = box1.numPts(); - const auto ncells2 = box2.numPts(); - const auto ncells3 = box3.numPts(); + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells3 = std::uint64_t(box3.numPts()); const auto ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = Long(len1.x)*Long(len1.y); - const auto len2xy = Long(len2.x)*Long(len2.y); - const auto len3xy = Long(len3.x)*Long(len3.y); - const auto len1x = Long(len1.x); - const auto len2x = Long(len2.x); - const auto len3x = Long(len3.x); + BoxIndexer indexer1(box1); + BoxIndexer indexer2(box2); + BoxIndexer indexer3(box3); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (Long icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { if (icell < ncells1) { - Long k = icell / len1xy; - Long j = (icell - k*len1xy) / len1x; - Long i = (icell - k*len1xy) - j*len1x; - f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z); + auto [i, j, k] = indexer1(icell); + f1(i,j,k); } if (icell < ncells2) { - Long k = icell / len2xy; - Long j = (icell - k*len2xy) / len2x; - Long i = (icell - k*len2xy) - j*len2x; - f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z); + auto [i, j, k] = indexer2(icell); + f2(i,j,k); } if (icell < ncells3) { - Long k = icell / len3xy; - Long j = (icell - k*len3xy) / len3x; - Long i = (icell - k*len3xy) - j*len3x; - f3(int(i)+lo3.x,int(j)+lo3.y,int(k)+lo3.z); + auto [i, j, k] = indexer3(icell); + f3(i,j,k); } } }); @@ -1058,36 +947,26 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - const auto ncells1 = box1.numPts(); - const auto ncells2 = box2.numPts(); + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); const auto ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = Long(len1.x)*Long(len1.y); - const auto len2xy = Long(len2.x)*Long(len2.y); - const auto len1x = Long(len1.x); - const auto len2x = Long(len2.x); + BoxIndexer indexer1(box1); + BoxIndexer indexer2(box2); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (Long icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { if (icell < ncells1) { - Long k = icell / len1xy; - Long j = (icell - k*len1xy) / len1x; - Long i = (icell - k*len1xy) - j*len1x; + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { - f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z,n); + f1(i,j,k,n); } } if (icell < ncells2) { - Long k = icell / len2xy; - Long j = (icell - k*len2xy) / len2x; - Long i = (icell - k*len2xy) - j*len2x; + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { - f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z,n); + f2(i,j,k,n); } } } @@ -1106,49 +985,34 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - const auto ncells1 = box1.numPts(); - const auto ncells2 = box2.numPts(); - const auto ncells3 = box3.numPts(); + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells3 = std::uint64_t(box3.numPts()); const auto ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = Long(len1.x)*Long(len1.y); - const auto len2xy = Long(len2.x)*Long(len2.y); - const auto len3xy = Long(len3.x)*Long(len3.y); - const auto len1x = Long(len1.x); - const auto len2x = Long(len2.x); - const auto len3x = Long(len3.x); + BoxIndexer indexer1(box1); + BoxIndexer indexer2(box2); + BoxIndexer indexer3(box3); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (Long icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { if (icell < ncells1) { - Long k = icell / len1xy; - Long j = (icell - k*len1xy) / len1x; - Long i = (icell - k*len1xy) - j*len1x; + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { - f1(int(i)+lo1.x,int(j)+lo1.y,int(k)+lo1.z,n); + f1(i,j,k,n); } } if (icell < ncells2) { - Long k = icell / len2xy; - Long j = (icell - k*len2xy) / len2x; - Long i = (icell - k*len2xy) - j*len2x; + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { - f2(int(i)+lo2.x,int(j)+lo2.y,int(k)+lo2.z,n); + f2(i,j,k,n); } } if (icell < ncells3) { - Long k = icell / len3xy; - Long j = (icell - k*len3xy) / len3x; - Long i = (icell - k*len3xy) - j*len3x; + auto [i, j, k] = indexer3(icell); for (T3 n = 0; n < ncomp3; ++n) { - f3(int(i)+lo3.x,int(j)+lo3.y,int(k)+lo3.z,n); + f3(i,j,k,n); } } } diff --git a/Src/Base/AMReX_Math.H b/Src/Base/AMReX_Math.H index 0bf783af5e..f4b1681b01 100644 --- a/Src/Base/AMReX_Math.H +++ b/Src/Base/AMReX_Math.H @@ -210,16 +210,17 @@ constexpr T powi (T x) noexcept } #if defined(AMREX_INT128_SUPPORTED) -AMREX_GPU_DEVICE AMREX_FORCE_INLINE +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::uint64_t umulhi (std::uint64_t a, std::uint64_t b) { -#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) - return __umul64hi(a, b); -#elif defined(AMREX_USE_SYCL) +#if defined(AMREX_USE_SYCL) return sycl::mul_hi(a,b); #else - auto tmp = amrex::UInt128_t(a) * amrex::UInt128_t(b); - return std::uint64_t(tmp >> 64); + AMREX_IF_ON_DEVICE(( return __umul64hi(a, b); )) + AMREX_IF_ON_HOST(( + auto tmp = amrex::UInt128_t(a) * amrex::UInt128_t(b); + return std::uint64_t(tmp >> 64); + )) #endif } #endif @@ -328,14 +329,11 @@ struct FastDivmodU64 std::uint64_t divide (std::uint64_t dividend) const { #if defined(AMREX_INT128_SUPPORTED) - AMREX_IF_ON_DEVICE(( - auto x = dividend; - if (multiplier) { - x = amrex::Math::umulhi(dividend + round_up, multiplier); - } - return (x >> shift_right); - )) - AMREX_IF_ON_HOST(( return dividend / divisor; )) + auto x = dividend; + if (multiplier) { + x = amrex::Math::umulhi(dividend + round_up, multiplier); + } + return (x >> shift_right); #else return dividend / divisor; #endif