From 56d57d5f860b09289c4ea63bcfa64da9001060eb Mon Sep 17 00:00:00 2001
From: Weiqun Zhang
Date: Tue, 14 Jul 2020 12:08:34 -0400
Subject: [PATCH 1/3] More on Gpu kernel fusing

* Add a Gpu::KernelInfo argument to ParallelFor to allow the user to indicate whether the kernel is a candidate for fusing.
* For MFIter, if the local size is less than or equal to 3, the fuse region is turned on and small kernels marked fusable will be fused.
* Add launch macros for fusing.
* Add fusing to a number of functions used by linear solvers. Note that many more amrex functions still need to be updated for fusing.
* Optimize reduction for the bottom solve.
* Consolidate memcpy in communication functions.
* Add an option to use device memory in communication kernels for packing and unpacking buffers. It is currently turned off because it did not improve performance in testing; in fact, it was worse than using pinned memory. This might change in the future, so the option is kept.

---
 Src/Base/AMReX.cpp | 2 +-
 Src/Base/AMReX_FBI.H | 192 ++++++----
 Src/Base/AMReX_FabArray.H | 24 +-
 Src/Base/AMReX_FabArrayBase.cpp | 2 +-
 Src/Base/AMReX_FabArrayCommI.H | 10 +-
 Src/Base/AMReX_FabArrayUtility.H | 18 +-
 Src/Base/AMReX_Gpu.H | 1 +
 Src/Base/AMReX_GpuFuse.H | 69 +++-
 Src/Base/AMReX_GpuFuse.cpp | 54 ++-
 Src/Base/AMReX_GpuKernelInfo.H | 18 +
 Src/Base/AMReX_GpuLaunch.H | 32 ++
 Src/Base/AMReX_GpuLaunchFunctsC.H | 257 +++++++++++--
 Src/Base/AMReX_GpuLaunchFunctsG.H | 352 +++++++++++++++---
 Src/Base/AMReX_GpuLaunchMacrosG.H | 165 ++++++++
 Src/Base/AMReX_MFIter.H | 4 +
 Src/Base/AMReX_MFIter.cpp | 10 +-
 Src/Base/AMReX_MultiFab.cpp | 46 ++-
 Src/Base/AMReX_MultiFabUtil.cpp | 34 +-
 Src/Base/AMReX_Reduce.H | 17 +
 Src/Base/CMakeLists.txt | 1 +
 Src/Base/Make.package | 1 +
 .../MLMG/AMReX_MLABecLaplacian.cpp | 22 +-
 Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp | 11 +-
 Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp | 14 +-
 24 files changed, 1115 insertions(+), 241 deletions(-)
 create mode 100644 Src/Base/AMReX_GpuKernelInfo.H

diff --git a/Src/Base/AMReX.cpp index ffc0c57e372..6ba27f84b1b 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -522,7 +522,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, BL_PROFILE_INITPARAMS(); #endif machine::Initialize(); -#ifdef AMREX_USE_CUDA +#ifdef AMREX_USE_GPU Gpu::Fuser::Initialize(); #endif diff --git a/Src/Base/AMReX_FBI.H index 4a33ee31f2a..6748a591613 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -54,19 +54,26 @@ ParallelFor (Vector<TagType> const& tags, int ncomp, F && f) } nwarps.push_back(ntotwarps); - std::size_t nbytes = ntags*sizeof(TagType); - auto d_tags = static_cast<TagType*>(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, tags.data(), nbytes); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t total_buf_size = offset_nwarps + sizeof_nwarps; - nbytes = (ntags+1)*sizeof(int); - auto d_nwarps = static_cast<int*>(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); - constexpr int nthreads = 128; + std::memcpy(h_buffer, tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); + + auto d_tags = reinterpret_cast<TagType*>(d_buffer); + auto 
d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); + + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) { @@ -112,7 +119,7 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) } }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -159,8 +166,8 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) #endif Gpu::synchronize(); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } #endif @@ -221,19 +228,27 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } nwarps.push_back(ntotwarps); - std::size_t nbytes = N_locs*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, copy_tags.data(), nbytes); + const int ntags = copy_tags.size(); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t total_buf_size = offset_nwarps + sizeof_nwarps; + + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); + + std::memcpy(h_buffer, copy_tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); - nbytes = (N_locs+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); - constexpr int nthreads = 128; + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) { @@ -280,7 +295,7 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -328,8 +343,8 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in #endif Gpu::synchronize(); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } template @@ -353,23 +368,31 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } nwarps.push_back(ntotwarps); - std::size_t nbytes = N_locs*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, copy_tags.data(), nbytes); + const int ntags = copy_tags.size(); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t 
sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t offset_masks = Arena::align(offset_nwarps+sizeof_nwarps); + std::size_t sizeof_masks = masks.size()*sizeof(Array4); + std::size_t total_buf_size = offset_masks + sizeof_masks; - nbytes = (N_locs+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); - nbytes = masks.size()*sizeof(Array4); - auto d_masks = static_cast*>(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_masks, masks.data(), nbytes); + std::memcpy(h_buffer, copy_tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + std::memcpy(h_buffer+offset_masks, masks.data(), sizeof_masks); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); - constexpr int nthreads = 128; + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); + auto d_masks = reinterpret_cast*>(d_buffer+offset_masks); + + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept { int g_tid = item.get_global_id(0); @@ -437,7 +460,7 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in if (m) *m = 0; }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -515,9 +538,8 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in #endif Gpu::synchronize(); - The_Device_Arena()->free(d_masks); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } template ::value,int> = 0> @@ -661,16 +683,18 @@ FabArray::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp) } if (maskfabs.size() > 0) { + Gpu::FuseSafeGuard fsg(maskfabs.size() >= Gpu::getFuseNumKernelsThreshold()); for (Gpu::StreamIter sit(maskfabs.size()); sit.isValid(); ++sit) { BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(bx, + amrex::ParallelFor(Gpu::KernelInfo{}.setFusable(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; }); } + Gpu::LaunchFusedKernels(); } if (is_thread_safe) { @@ -923,7 +947,7 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n // Is the conditional ever expected false? 
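// (Annotation, not part of the patch: the guards below change from a
// null-pointer test to a size test. With the AMReX_FabArrayCommI.H hunks
// later in this patch, send_data[i] and recv_data[i] are assigned for every
// index, even for empty messages, so a non-null pointer no longer implies a
// non-empty message; send_size[j] > 0 / recv_size[k] > 0 is now the check.)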
int launches = 0; for (int send = 0; send < N_snds; ++send) { - if (send_data[send] != nullptr) { + if (send_size[send] > 0) { launches += send_cctc[send]->size(); } } @@ -940,8 +964,7 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n std::size_t(sizeof(CopyMemory)*launches) ); const int j = sit(); - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) @@ -971,9 +994,9 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n for (int send = 0; send < N_snds; ++send) { const int j = send; - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + char* dptr = send_data[j]; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1009,9 +1032,9 @@ FabArray::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int LayoutData > recv_copy_tags(boxArray(),DistributionMap()); for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1082,7 +1105,7 @@ FabArray::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int template void FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc) { @@ -1091,16 +1114,26 @@ FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int nc const int N_snds = send_data.size(); if (N_snds == 0) return; + char* pbuffer = send_data[0]; + std::size_t szbuffer = 0; +#if 0 + // For linear solver test on summit, this is slower than writing to + // pinned memory directly on device. + if (not ParallelDescriptor::UseGpuAwareMpi()) { + // Memory in send_data is pinned. + szbuffer = (send_data[N_snds-1]-send_data[0]) + send_size[N_snds-1]; + pbuffer = (char*)The_Arena()->alloc(szbuffer); + } +#endif + typedef Array4CopyTag TagType; Vector snd_copy_tags; - // FIX HIP HERE -- Dim3 - Dim3 zero; - zero.x = 0; zero.y = 0; zero.z = 0; for (int j = 0; j < N_snds; ++j) { - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + std::size_t offset = send_data[j]-send_data[0]; + char* dptr = pbuffer + offset; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1108,16 +1141,24 @@ FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int nc amrex::makeArray4((value_type*)(dptr), tag.sbox, ncomp), src.array(tag.srcIndex), tag.sbox, - zero + Dim3{0,0,0} }); dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type)); } - BL_ASSERT(dptr <= send_data[j] + send_size[j]); + BL_ASSERT(dptr <= pbuffer + offset + send_size[j]); } } detail::fab_to_fab(snd_copy_tags, scomp, 0, ncomp, detail::CellStore()); + + // There is Gpu::synchronize in fab_to_fab. + + if (pbuffer != send_data[0]) { + Gpu::copyAsync(Gpu::deviceToHost,pbuffer,pbuffer+szbuffer,send_data[0]); + Gpu::synchronize(); + The_Arena()->free(pbuffer); + } } template @@ -1133,6 +1174,20 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, const int N_rcvs = recv_cctc.size(); if (N_rcvs == 0) return; + char* pbuffer = recv_data[0]; + std::size_t szbuffer = 0; +#if 0 + // For linear solver test on summit, this is slower than writing to + // pinned memory directly on device. + if (not ParallelDescriptor::UseGpuAwareMpi()) { + // Memory in recv_data is pinned. 
+ szbuffer = (recv_data[N_rcvs-1]-recv_data[0]) + recv_size[N_rcvs-1]; + pbuffer = (char*)The_Arena()->alloc(szbuffer); + Gpu::copyAsync(Gpu::hostToDevice,recv_data[0],recv_data[0]+szbuffer,pbuffer); + Gpu::synchronize(); + } +#endif + typedef Array4CopyTag TagType; Vector recv_copy_tags; @@ -1149,9 +1204,10 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + std::size_t offset = recv_data[k]-recv_data[0]; + const char* dptr = pbuffer + offset; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1171,21 +1227,23 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, masks.push_back(maskfabs[li].array()); } } - BL_ASSERT(dptr <= recv_data[k] + recv_size[k]); + BL_ASSERT(dptr <= pbuffer + offset + recv_size[k]); } } if (maskfabs.size() > 0) { + Gpu::FuseSafeGuard fsg(maskfabs.size() >= Gpu::getFuseNumKernelsThreshold()); for (Gpu::StreamIter sit(maskfabs.size()); sit.isValid(); ++sit) { BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(bx, + amrex::ParallelFor(Gpu::KernelInfo().setFusable(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; }); } + Gpu::LaunchFusedKernels(); } if (op == FabArrayBase::COPY) @@ -1206,6 +1264,12 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, detail::fab_to_fab_atomic_add(recv_copy_tags, 0, dcomp, ncomp, masks); } } + + // There is Gpu::synchronize in fab_to_fab. + + if (pbuffer != recv_data[0]) { + The_Arena()->free(pbuffer); + } } #endif /* AMREX_USE_GPU */ @@ -1213,7 +1277,7 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, template void FabArray::pack_send_buffer_cpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc) { @@ -1227,9 +1291,9 @@ FabArray::pack_send_buffer_cpu (FabArray const& src, int scomp, int nc #endif for (int j = 0; j < N_snds; ++j) { - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + char* dptr = send_data[j]; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1268,9 +1332,9 @@ FabArray::unpack_recv_buffer_cpu (FabArray& dst, int dcomp, int ncomp, #endif for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1296,9 +1360,9 @@ FabArray::unpack_recv_buffer_cpu (FabArray& dst, int dcomp, int ncomp, recv_copy_tags.define(dst.boxArray(),dst.DistributionMap()); for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 71e60674686..8ae7b66bd97 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -718,7 +718,7 @@ public: #endif static void pack_send_buffer_gpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc); @@ -731,7 +731,7 @@ public: #endif static void pack_send_buffer_cpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& 
send_size, Vector const& send_cctc); @@ -1582,7 +1582,7 @@ FabArray::setVal (value_type val, { const Box& bx = fai.growntilebox(nghost); auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1625,7 +1625,7 @@ FabArray::setVal (value_type val, if (b.ok()) { auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( b, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( b, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1655,7 +1655,7 @@ FabArray::abs (int comp, int ncomp, const IntVect& nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = amrex::Math::abs(fab(i,j,k,n+comp)); }); @@ -1674,7 +1674,7 @@ FabArray::plus (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; }); @@ -1694,7 +1694,7 @@ FabArray::plus (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; }); @@ -1714,7 +1714,7 @@ FabArray::mult (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1734,7 +1734,7 @@ FabArray::mult (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1754,7 +1754,7 @@ FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1774,7 +1774,7 @@ FabArray::invert (value_type numerator, const Box& region, int comp, int nu const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1970,7 +1970,7 @@ FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, Box const& fbx = mfi.growntilebox(); Box const& gbx = fbx & domain; Box const& vbx = mfi.validbox(); - AMREX_HOST_DEVICE_FOR_4D(fbx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_FOR_4D_FUSABLE(fbx, ncomp, i, j, k, n, { IntVect iv(AMREX_D_DECL(i,j,k)); if (vbx.contains(iv)) { diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp index 77429049df3..50e00ad2c9c 100644 --- 
a/Src/Base/AMReX_FabArrayBase.cpp +++ b/Src/Base/AMReX_FabArrayBase.cpp @@ -118,7 +118,7 @@ FabArrayBase::Initialize () #ifdef AMREX_USE_GPU if (ParallelDescriptor::UseGpuAwareMpi()) { - the_fa_arena = The_Device_Arena(); + the_fa_arena = The_Arena(); } else { the_fa_arena = The_Pinned_Arena(); } diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H index 46ecef31170..a0f05f93d24 100644 --- a/Src/Base/AMReX_FabArrayCommI.H +++ b/Src/Base/AMReX_FabArrayCommI.H @@ -143,9 +143,7 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, { the_send_data = static_cast(amrex::The_FA_Arena()->alloc(total_volume)); for (int i = 0, N = send_size.size(); i < N; ++i) { - if (send_size[i] > 0) { - send_data[i] = the_send_data + offset[i]; - } + send_data[i] = the_send_data + offset[i]; } } else { the_send_data = nullptr; @@ -495,9 +493,7 @@ FabArray::ParallelCopy (const FabArray& src, { the_send_data = static_cast(amrex::The_FA_Arena()->alloc(total_volume)); for (int i = 0, N = send_size.size(); i < N; ++i) { - if (send_size[i] > 0) { - send_data[i] = the_send_data + offset[i]; - } + send_data[i] = the_send_data + offset[i]; } } @@ -749,9 +745,9 @@ FabArray::PostRcvs (const MapOfCopyComTagContainers& m_RcvTags, for (int i = 0; i < nrecv; ++i) { + recv_data[i] = the_recv_data + offset[i]; if (recv_size[i] > 0) { - recv_data[i] = the_recv_data + offset[i]; const int rank = ParallelContext::global_to_local_rank(recv_from[i]); const int comm_data_type = ParallelDescriptor::select_comm_data_type(recv_size[i]); if (comm_data_type == 1) { diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H index c31fd19b0c1..c7075d48520 100644 --- a/Src/Base/AMReX_FabArrayUtility.H +++ b/Src/Base/AMReX_FabArrayUtility.H @@ -153,11 +153,13 @@ ReduceSum_device (FabArray const& fa1, FabArray const& fa2, using value_type = typename FAB1::value_type; value_type sm = 0; + BL_PROFILE("ReduceSum_device"); + { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); using ReduceTuple = typename decltype(reduce_data)::Type; - + Gpu::FuseReductionSafeGuard rsg(true); for (MFIter mfi(fa1); mfi.isValid(); ++mfi) { const Box& bx = amrex::grow(mfi.validbox(),nghost); @@ -1467,7 +1469,7 @@ Add (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, int { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) += srcFab(i,j,k,n+srccomp); }); @@ -1499,7 +1501,7 @@ Copy (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, in { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,dstcomp+n) = srcFab(i,j,k,srccomp+n); }); @@ -1531,7 +1533,7 @@ Subtract (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) -= srcFab(i,j,k,n+srccomp); }); @@ -1563,7 +1565,7 @@ Multiply (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, 
numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) *= srcFab(i,j,k,n+srccomp); }); @@ -1595,7 +1597,7 @@ Divide (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) /= srcFab(i,j,k,n+srccomp); }); @@ -1625,7 +1627,7 @@ Abs (FabArray& fa, int icomp, int numcomp, const IntVect& nghost) if (bx.ok()) { auto const& fab = fa.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { fab(i,j,k,n+icomp) = amrex::Math::abs(fab(i,j,k,n+icomp)); }); @@ -1682,7 +1684,7 @@ OverrideSync (FabArray & fa, FabArray const& msk, const Periodicity& const Box& bx = mfi.tilebox(); auto fab = fa.array(mfi); auto const ifab = msk.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, { if (!ifab(i,j,k)) fab(i,j,k,n) = 0; }); diff --git a/Src/Base/AMReX_Gpu.H b/Src/Base/AMReX_Gpu.H index a0fd747a17a..5c59d89305b 100644 --- a/Src/Base/AMReX_Gpu.H +++ b/Src/Base/AMReX_Gpu.H @@ -9,6 +9,7 @@ namespace amrex { namespace Cuda {} } #endif #include +#include #include #include #include diff --git a/Src/Base/AMReX_GpuFuse.H b/Src/Base/AMReX_GpuFuse.H index dc3555cb39d..99467f9a956 100644 --- a/Src/Base/AMReX_GpuFuse.H +++ b/Src/Base/AMReX_GpuFuse.H @@ -13,6 +13,8 @@ namespace amrex { namespace Gpu { +#ifdef AMREX_USE_GPU + #ifdef AMREX_USE_CUDA typedef void (*Lambda1DLauncher)(char*,int); @@ -229,23 +231,6 @@ private: } }; -Long getFuseSizeThreshold (); -Long setFuseSizeThreshold (Long new_threshold); -int getFuseNumKernelsThreshold (); -int setFuseNumKernelsThreshold (int new_threshold); -bool inFuseRegion (); -bool setFuseRegion (bool flag); - -struct FuseSafeGuard -{ - explicit FuseSafeGuard (bool flag) noexcept - : m_old(setFuseRegion(flag)) {} - ~FuseSafeGuard () { setFuseRegion(m_old); } -private: - bool m_old; -}; - - template void Register (Box const& bx, F&& f) @@ -273,6 +258,56 @@ LaunchFusedKernels () Fuser::getInstance().Launch(); } +#else + +class Fuser +{ +public: + static Fuser& getInstance (); + static void Initialize (); + static void Finalize (); +private: + static std::unique_ptr m_instance; +}; + +inline void LaunchFusedKernels () {} + +#endif + +Long getFuseSizeThreshold (); +Long setFuseSizeThreshold (Long new_threshold); +int getFuseNumKernelsThreshold (); +int setFuseNumKernelsThreshold (int new_threshold); +bool inFuseRegion (); +bool setFuseRegion (bool flag); +bool inFuseReductionRegion (); +bool setFuseReductionRegion (bool flag); + +struct FuseSafeGuard +{ + explicit FuseSafeGuard (bool flag) noexcept + : m_old(setFuseRegion(flag)) {} + ~FuseSafeGuard () { setFuseRegion(m_old); } +private: + bool m_old; +}; + +struct FuseReductionSafeGuard +{ + explicit FuseReductionSafeGuard (bool flag) noexcept + : m_old(setFuseReductionRegion(flag)) {} + ~FuseReductionSafeGuard () { setFuseReductionRegion(m_old); } +private: + bool m_old; +}; + +#else + +struct FuseSafeGuard +{ + explicit FuseSafeGuard (bool) {} +}; + #endif }} diff --git a/Src/Base/AMReX_GpuFuse.cpp b/Src/Base/AMReX_GpuFuse.cpp index 26f942aa963..e3a4fe812b1 100644 --- a/Src/Base/AMReX_GpuFuse.cpp +++ b/Src/Base/AMReX_GpuFuse.cpp @@ -6,10 +6,11 @@ namespace amrex { namespace Gpu { -#ifdef AMREX_USE_CUDA +#ifdef AMREX_USE_GPU namespace { 
bool s_in_fuse_region = false; + bool s_in_fuse_reduction_region = false; // Fusing kernels with elements greater than this are not recommended based on tests on v100 Long s_fuse_size_threshold = 257*257; // If the number of kernels is less than this, fusing is not recommended based on tests on v100 @@ -18,6 +19,8 @@ namespace { std::unique_ptr<Fuser> Fuser::m_instance = nullptr; +#ifdef AMREX_USE_CUDA + Fuser::Fuser () { AMREX_ASSERT(!OpenMP::in_parallel()); @@ -86,12 +89,32 @@ void Fuser::Launch () constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; + bool is_reduction = s_in_fuse_reduction_region; + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; int g_wid = g_tid / Gpu::Device::warp_size; - if (g_wid >= ntotwarps) return; + if (g_wid >= ntotwarps) { + if (is_reduction) { + // for reduction, the assumption is that all lambdas have the same function signature + FuseHelper& helper = d_lambda_helper[0]; + char* lambda_object = d_lambda_object + helper.m_offset; + if (helper.m_bx.isEmpty()) { + helper.m_fp.L1D(lambda_object,-1); + } else { + if (helper.m_N == 0) { + helper.m_fp.L3D(lambda_object, INT_MIN, INT_MIN, INT_MIN); + } else { + for (int n = 0; n < helper.m_N; ++n) { + helper.m_fp.L4D(lambda_object, INT_MIN, INT_MIN, INT_MIN, -1); + } + } + } + } + return; + } int ilambda; { @@ -120,6 +143,8 @@ void Fuser::Launch () if (bx.isEmpty()) { if (icell < helper.m_N) { helper.m_fp.L1D(lambda_object,icell); + } else if (is_reduction) { + helper.m_fp.L1D(lambda_object,-1); } } else { int ncells = bx.numPts(); @@ -139,6 +164,14 @@ void Fuser::Launch () helper.m_fp.L4D(lambda_object,i,j,k,n); } } + } else if (is_reduction) { + if (helper.m_N == 0) { + helper.m_fp.L3D(lambda_object, INT_MIN, INT_MIN, INT_MIN); + } else { + for (int n = 0; n < helper.m_N; ++n) { + helper.m_fp.L4D(lambda_object, INT_MIN, INT_MIN, INT_MIN, -1); + } + } } } }); @@ -181,6 +214,8 @@ Fuser::resize_helper_buf () m_helper_buf = p; } +#endif + Fuser& Fuser::getInstance () { @@ -218,8 +253,7 @@ setFuseSizeThreshold (Long new_threshold) int getFuseNumKernelsThreshold () { return s_fuse_numkernels_threshold; } -int -setFuseNumKernelsThreshold (int new_threshold) +int setFuseNumKernelsThreshold (int new_threshold) { int old = s_fuse_numkernels_threshold; s_fuse_numkernels_threshold = new_threshold; @@ -228,14 +262,22 @@ setFuseNumKernelsThreshold (int new_threshold) bool inFuseRegion () { return s_in_fuse_region; } -bool -setFuseRegion (bool flag) +bool setFuseRegion (bool flag) { bool old = s_in_fuse_region; s_in_fuse_region = flag; return old; } +bool inFuseReductionRegion () { return s_in_fuse_reduction_region; } + +bool setFuseReductionRegion (bool flag) +{ + bool old = s_in_fuse_reduction_region; + s_in_fuse_reduction_region = flag; + return old; +} + #endif }} diff --git a/Src/Base/AMReX_GpuKernelInfo.H new file mode 100644 index 00000000000..2eac4054e63 --- /dev/null +++ b/Src/Base/AMReX_GpuKernelInfo.H @@ -0,0 +1,18 @@ +#ifndef AMREX_GPU_KERNEL_INFO_H_ +#define AMREX_GPU_KERNEL_INFO_H_ + +namespace amrex { +namespace Gpu { + +class KernelInfo +{ +public: + KernelInfo& setFusable (bool flag) { fusable = flag; return *this; } + bool isFusable () const { return fusable; } +private: + bool fusable = false; +}; + +}} + +#endif diff --git a/Src/Base/AMReX_GpuLaunch.H index f79901a2403..5305139f636 100--- 
a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -2,6 +2,7 @@ #define AMREX_GPU_LAUNCH_H_ #include +#include #include #include #include @@ -9,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -187,6 +189,20 @@ namespace Gpu { AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) +#ifdef AMREX_USE_CUDA +#define AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_3, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_2, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) +#else +#define AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA(__VA_ARGS__) +#endif #if (AMREX_SPACEDIM == 1) #define AMREX_LAUNCH_DEVICE_LAMBDA_DIM(a1,a2,a3,b1,b2,b3,c1,c2,c3) AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE (a1,a2,a2) @@ -215,6 +231,22 @@ namespace Gpu { #define AMREX_HOST_DEVICE_PARALLEL_FOR_3D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__) #define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__) +#ifdef AMREX_USE_CUDA +#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(__VA_ARGS__) +#else +#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSABLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_1D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_4D_FUSABLE(...) 
AMREX_GPU_HOST_DEVICE_FOR_4D(__VA_ARGS__) +#endif + #ifdef AMREX_USE_GPU #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 3b26b799841..584ad71d810 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -10,7 +10,7 @@ void launch (T const& n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } template ::value> > -void For (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (T n, L&& f) noexcept { for (T i = 0; i < n; ++i) { f(i); @@ -18,7 +18,13 @@ void For (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } template ::value> > -void ParallelFor (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + For(n, std::forward(f)); +} + +template ::value> > +void ParallelFor (T n, L&& f) noexcept { AMREX_PRAGMA_SIMD for (T i = 0; i < n; ++i) { @@ -26,8 +32,14 @@ void ParallelFor (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } } +template ::value> > +void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + ParallelFor(n, std::forward(f)); +} + template -void For (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -39,7 +51,13 @@ void For (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } template -void ParallelFor (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + For(box, std::forward(f)); +} + +template +void ParallelFor (Box const& box, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -51,8 +69,14 @@ void ParallelFor (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noe }}} } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + ParallelFor(box, std::forward(f)); +} + template ::value> > -void For (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box, T ncomp, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -66,7 +90,13 @@ void For (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes*/=0) no } template ::value> > -void ParallelFor (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + For(box, ncomp, std::forward(f)); +} + +template ::value> > +void ParallelFor (Box const& box, T ncomp, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -80,90 +110,161 @@ void ParallelFor (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes } } +template ::value> > +void ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(box, ncomp, std::forward(f)); +} + template -void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { For(box1, std::forward(f1)); For(box2, std::forward(f2)); } +template +void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + For (box1, box2, std::forward(f1), std::forward(f2)); +} + template -void For (Box const& box1, Box const& box2, 
Box const& box3, L1&& f1, L2&& f2, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { For(box1, std::forward(f1)); For(box2, std::forward(f2)); For(box3, std::forward(f3)); } +template +void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +{ + For(box1, box2, box3, std::forward(f1), std::forward(f2), std::forward(f3)); +} + template ::value>, typename M2=amrex::EnableIf_t::value> > void For (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box2, T2 ncomp2, L2&& f2) noexcept { For(box1, ncomp1, std::forward(f1)); For(box2, ncomp2, std::forward(f2)); } +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > void For (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box3, T3 ncomp3, L3&& f3) noexcept { For(box1, ncomp1, std::forward(f1)); For(box2, ncomp2, std::forward(f2)); For(box3, ncomp3, std::forward(f3)); } +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + For(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template -void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept +void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(box1, std::forward(f1)); ParallelFor(box2, std::forward(f2)); } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(box1,box2,f1,f2); +} + template -void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept +void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(box1, std::forward(f1)); ParallelFor(box2, std::forward(f2)); ParallelFor(box3, std::forward(f3)); } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + template ::value>, typename M2=amrex::EnableIf_t::value> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(box1, ncomp1, std::forward(f1)); ParallelFor(box2, ncomp2, std::forward(f2)); } +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void ParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2)); +} + template 
::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(box1, ncomp1, std::forward(f1)); ParallelFor(box2, ncomp2, std::forward(f2)); ParallelFor(box3, ncomp3, std::forward(f3)); } +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void ParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(box1, ncomp1, std::forward(f1), + box2, ncomp2, std::forward(f2), + box3, ncomp3, std::forward(f3)); +} + template ::value> > void FabReduce (Box const& box, N ncomp, T const& init_val, @@ -315,6 +416,118 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } +template ::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + ParallelFor(n,std::forward(f)); +} + +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + ParallelFor(box,std::forward(f)); +} + +template ::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(box,ncomp,std::forward(f)); +} + +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value> > +void HostDeviceFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + For(n,std::forward(f)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + For(box,std::forward(f)); +} + +template ::value> > +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + For(box,ncomp,std::forward(f)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + For(box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + For(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box 
const& box2, T2 ncomp2, L2&& f2) noexcept +{ + For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + For(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + } #endif diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 920570be12f..d4a1841ce73 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -94,7 +94,7 @@ void launch (T const& n, L&& f) noexcept } template ::value> > -void ParallelFor (T n, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); @@ -122,7 +122,7 @@ void ParallelFor (T n, L&& f) noexcept } template -void ParallelFor (Box const& box, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -159,7 +159,7 @@ void ParallelFor (Box const& box, L&& f) noexcept } template ::value> > -void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -198,7 +198,7 @@ void ParallelFor (Box const& box, T ncomp, L&& f) noexcept } template -void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size ParallelFor(box1, std::forward(f1)); @@ -255,7 +255,9 @@ void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept } template -void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +void ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size ParallelFor(box1, std::forward(f1)); @@ -327,7 +329,8 @@ void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2 template ::value>, typename M2=amrex::EnableIf_t::value> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +void ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size @@ -392,7 +395,8 @@ template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +void ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { @@ -628,9 +632,15 @@ void launch (T const& n, L&& f) noexcept template ::value> > amrex::EnableIf_t::value> -ParallelFor (T n, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && n <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(n, f); + } else +#endif + { const auto ec = 
Gpu::ExecutionConfig(n); AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { @@ -640,14 +650,21 @@ ParallelFor (T n, L&& f) noexcept } }); AMREX_GPU_ERROR_CHECK(); + } } template amrex::EnableIf_t::value> -ParallelFor (Box const& box, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box, f); + } else +#endif + { const auto lo = amrex::lbound(box); const auto len = amrex::length(box); const auto ec = Gpu::ExecutionConfig(ncells); @@ -665,14 +682,21 @@ ParallelFor (Box const& box, L&& f) noexcept } }); AMREX_GPU_ERROR_CHECK(); + } } template ::value> > amrex::EnableIf_t::value> -ParallelFor (Box const& box, T ncomp, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box, ncomp, f); + } else +#endif + { const auto lo = amrex::lbound(box); const auto len = amrex::length(box); const auto ec = Gpu::ExecutionConfig(ncells); @@ -692,16 +716,25 @@ ParallelFor (Box const& box, T ncomp, L&& f) noexcept } }); AMREX_GPU_ERROR_CHECK(); + } } template amrex::EnableIf_t::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) and amrex::isEmpty(box2)) return; int ncells1 = box1.numPts(); int ncells2 = box2.numPts(); int ncells = amrex::max(ncells1, ncells2); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, f1); + Gpu::Register(box2, f2); + } else +#endif + { const auto lo1 = amrex::lbound(box1); const auto lo2 = amrex::lbound(box2); const auto len1 = amrex::length(box1); @@ -732,17 +765,28 @@ ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept } }); AMREX_GPU_ERROR_CHECK(); + } } template amrex::EnableIf_t::value and MaybeDeviceRunnable::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) and amrex::isEmpty(box2) and amrex::isEmpty(box3)) return; int ncells1 = box1.numPts(); int ncells2 = box2.numPts(); int ncells3 = box3.numPts(); int ncells = amrex::max(ncells1, ncells2, ncells3); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, f1); + Gpu::Register(box2, f2); + Gpu::Register(box3, f3); + } else +#endif + { const auto lo1 = amrex::lbound(box1); const auto lo2 = amrex::lbound(box2); const auto lo3 = amrex::lbound(box3); @@ -784,19 +828,28 @@ ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2 } }); AMREX_GPU_ERROR_CHECK(); + } } template ::value>, typename M2=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +ParallelFor (Gpu::KernelInfo const& info, + Box 
const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) and amrex::isEmpty(box2)) return; int ncells1 = box1.numPts(); int ncells2 = box2.numPts(); int ncells = amrex::max(ncells1, ncells2); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, ncomp1, f1); + Gpu::Register(box2, ncomp2, f2); + } else +#endif + { const auto lo1 = amrex::lbound(box1); const auto lo2 = amrex::lbound(box2); const auto len1 = amrex::length(box1); @@ -831,6 +884,7 @@ ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } }); AMREX_GPU_ERROR_CHECK(); + } } template ::value>, typename M3=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeDeviceRunnable::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { @@ -847,6 +902,14 @@ ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, int ncells2 = box2.numPts(); int ncells3 = box3.numPts(); int ncells = amrex::max(ncells1, ncells2, ncells3); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, ncomp1, f1); + Gpu::Register(box2, ncomp2, f2); + Gpu::Register(box3, ncomp3, f3); + } else +#endif + { const auto lo1 = amrex::lbound(box1); const auto lo2 = amrex::lbound(box2); const auto lo3 = amrex::lbound(box3); @@ -894,6 +957,7 @@ ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } }); AMREX_GPU_ERROR_CHECK(); + } } template @@ -983,34 +1047,147 @@ void single_task (L&& f) noexcept single_task(Gpu::gpuStream(), std::forward(f)); } +template ::value> > +void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept +{ + ParallelFor(info, n,std::forward(f)); +} + +template +void For (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +{ + ParallelFor(info, box,std::forward(f)); +} + +template ::value> > +void For (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(info,box,ncomp,std::forward(f)); +} + +template +void For (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void For (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(info,box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(info, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value> > +void ParallelFor (T n, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); +} + +template +void ParallelFor (Box const& box, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, box, std::forward(f)); +} + 
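// A usage sketch (not part of this patch): the wrappers in this hunk forward
// the original, info-less signatures to the new Gpu::KernelInfo overloads, so
// existing call sites compile unchanged. A call site opts into fusing the same
// way FB_local_copy_gpu does above; `mf` here is a hypothetical MultiFab.
{
    // Inside the guard, small kernels marked fusable are registered rather
    // than launched, then run as a single fused kernel.
    Gpu::FuseSafeGuard fsg(mf.local_size() >= Gpu::getFuseNumKernelsThreshold());
    for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
        Box const& bx = mfi.validbox();
        auto const& a = mf.array(mfi);
        amrex::ParallelFor(Gpu::KernelInfo{}.setFusable(true), bx,
        [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
        {
            a(i,j,k) = 0.0;
        });
    }
    Gpu::LaunchFusedKernels(); // no-op if nothing was registered
}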
+template ::value> > +void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); +} + +template +void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void ParallelFor (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template ::value> > void For (T n, L&& f) noexcept { - ParallelFor(n,std::forward(f)); + ParallelFor(Gpu::KernelInfo{}, n,std::forward(f)); } template void For (Box const& box, L&& f) noexcept { - ParallelFor(box,std::forward(f)); + ParallelFor(Gpu::KernelInfo{}, box,std::forward(f)); } template ::value> > void For (Box const& box, T ncomp, L&& f) noexcept { - ParallelFor(box,ncomp,std::forward(f)); + ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } template void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); + ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } template -void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +void For (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept { - ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } template (f1),box2,ncomp2,std::forward(f2)); + ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template (f1), + ParallelFor(Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } template ::value> > amrex::EnableIf_t::value> -HostDeviceParallelFor (T n, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(n,std::forward(f)); + ParallelFor(info,n,std::forward(f)); } else { AMREX_PRAGMA_SIMD for (T i = 0; i < n; ++i) f(i); } } +template ::value> > +amrex::EnableIf_t::value> +HostDeviceParallelFor (T n, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); +} + template amrex::EnableIf_t::value> -HostDeviceParallelFor (Box const& box, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box,std::forward(f)); + ParallelFor(info, box,std::forward(f)); } else { LoopConcurrentOnCpu(box,std::forward(f)); } @@ -1060,10 +1245,10 @@ HostDeviceParallelFor (Box const& box, L&& f) noexcept template ::value> > amrex::EnableIf_t::value> -HostDeviceParallelFor (Box 
const& box, T ncomp, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box,ncomp,std::forward(f)); + ParallelFor(info, box,ncomp,std::forward(f)); } else { LoopConcurrentOnCpu(box,ncomp,std::forward(f)); } @@ -1071,10 +1256,11 @@ HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept template amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); + ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } else { LoopConcurrentOnCpu(box1,std::forward(f1)); LoopConcurrentOnCpu(box2,std::forward(f2)); @@ -1083,11 +1269,13 @@ HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexc template amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + ParallelFor(info,box1,box2,box3, + std::forward(f1),std::forward(f2),std::forward(f3)); } else { LoopConcurrentOnCpu(box1,std::forward(f1)); LoopConcurrentOnCpu(box2,std::forward(f2)); @@ -1099,11 +1287,12 @@ template ::value>, typename M2=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); + ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } else { LoopConcurrentOnCpu(box1,ncomp1,std::forward(f1)); LoopConcurrentOnCpu(box2,ncomp2,std::forward(f2)); @@ -1115,12 +1304,14 @@ template ::value>, typename M3=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,ncomp1,std::forward(f1), + ParallelFor(info, + box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } else { @@ -1131,54 +1322,115 @@ HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value> > -void HostDeviceFor (T n, L&& f) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { - HostDeviceParallelFor(n,std::forward(f)); + HostDeviceParallelFor(info,n,std::forward(f)); } template -void HostDeviceFor (Box const& box, L&& f) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { - HostDeviceParallelFor(box,std::forward(f)); + HostDeviceParallelFor(info,box,std::forward(f)); } template ::value> > -void HostDeviceFor (Box const& box, T ncomp, L&& f) noexcept +void 
HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { - HostDeviceParallelFor(box,ncomp,std::forward(f)); + HostDeviceParallelFor(info,box,ncomp,std::forward(f)); } template -void HostDeviceFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - HostDeviceParallelFor(box1,box2,std::forward(f1),std::forward(f2)); + HostDeviceParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } template -void HostDeviceFor (Box const& box1, Box const& box2, Box const& box3, +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { - HostDeviceParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + HostDeviceParallelFor(info, box1,box2,box3, + std::forward(f1),std::forward(f2),std::forward(f3)); } template ::value>, typename M2=amrex::EnableIf_t::value> > -void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { - HostDeviceParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); + HostDeviceParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > -void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { - HostDeviceParallelFor(box1,ncomp1,std::forward(f1), + HostDeviceParallelFor(info, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value> > +void HostDeviceParallelFor (T n, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},n,std::forward(f)); +} + +template +void HostDeviceParallelFor (Box const& box, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box,std::forward(f)); +} + +template ::value> > +void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); +} + +template +void HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, box1,box2,box3, + std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } diff --git a/Src/Base/AMReX_GpuLaunchMacrosG.H 
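
The HostDevice variants keep their run-time dispatch while gaining the KernelInfo parameter: in a launch region they forward it to ParallelFor, so the kernel may be fused, and otherwise they fall back to a SIMD loop or LoopConcurrentOnCpu. A hedged sketch of a direct call, assuming extended host-device lambdas are enabled (AMReX's usual nvcc configuration); saxpy_fused and its arguments are illustrative names, not part of the patch:

    #include <AMReX.H>
    #include <AMReX_MultiFab.H>
    #include <AMReX_Gpu.H>

    // Hedged sketch: one call site serving both execution modes.
    void saxpy_fused (amrex::MultiFab& dst, amrex::Real a,
                      amrex::MultiFab const& src)
    {
        using namespace amrex;
        for (MFIter mfi(dst,TilingIfNotGPU()); mfi.isValid(); ++mfi) {
            Box const& bx = mfi.tilebox();
            Array4<Real> const& d = dst.array(mfi);
            Array4<Real const> const& s = src.const_array(mfi);
            HostDeviceParallelFor(Gpu::KernelInfo{}.setFusable(true),
                bx, dst.nComp(),
                [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k, int n) noexcept
                {
                    d(i,j,k,n) += a * s(i,j,k,n);
                });
        }
    }
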
b/Src/Base/AMReX_GpuLaunchMacrosG.H index 83306b4b560..bb69202a9d9 100644 --- a/Src/Base/AMReX_GpuLaunchMacrosG.H +++ b/Src/Base/AMReX_GpuLaunchMacrosG.H @@ -53,6 +53,34 @@ block \ } \ }}} +#ifdef AMREX_USE_CUDA +#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE(TN,TI,block) \ + { auto const& amrex_i_tn = TN; \ + if (!amrex::isEmpty(amrex_i_tn)) { \ + if (amrex::Gpu::inLaunchRegion()) \ + { \ + const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \ + if (amrex::Gpu::inFuseRegion() && amrex_i_ec.numBlocks.x*amrex_i_ec.numThreads.x <= amrex::Gpu::getFuseSizeThreshold()) { \ + amrex::Gpu::Register(amrex_i_tn, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block \ + }); \ + } else { \ + AMREX_LAUNCH_KERNEL(amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \ + [=] AMREX_GPU_DEVICE () noexcept { \ + for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \ + block \ + } \ + }); \ + AMREX_GPU_ERROR_CHECK(); \ + } \ + } \ + else { \ + for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \ + block \ + } \ + }}} +#endif #endif // two fused launches @@ -134,6 +162,51 @@ block2 \ } \ }}} +#ifdef AMREX_USE_CUDA +#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \ + { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \ + if (!amrex::isEmpty(amrex_i_tn1) or !amrex::isEmpty(amrex_i_tn2)) { \ + if (amrex::Gpu::inLaunchRegion()) \ + { \ + const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \ + const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \ + dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \ + amrex_i_ec2.numBlocks.x); \ + amrex_i_nblocks.y = 2; \ + if (amrex::Gpu::inFuseRegion() && amrex_i_nblocks.x*amrex_i_nblocks.y*amrex_i_ec1.numThreads.x <= amrex::Gpu::getFuseSizeThreshold()) { \ + amrex::Gpu::Register(amrex_i_tn1, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI1(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block1 \ + }); \ + amrex::Gpu::Register(amrex_i_tn2, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI2(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block2 \ + }); \ + } else { \ + AMREX_LAUNCH_KERNEL(amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ + [=] AMREX_GPU_DEVICE () noexcept { \ + switch (blockIdx.y) { \ + case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + break; \ + case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + } \ + }); \ + AMREX_GPU_ERROR_CHECK(); \ + } \ + } \ + else { \ + for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + }}} +#endif #endif // three fused launches @@ -233,6 +306,64 @@ block3 \ } \ }}} +#ifdef AMREX_USE_CUDA +#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \ + { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \ + if (!amrex::isEmpty(amrex_i_tn1) or !amrex::isEmpty(amrex_i_tn2) or !amrex::isEmpty(amrex_i_tn3)) { \ + if (amrex::Gpu::inLaunchRegion()) \ + { \ + const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \ + const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \ + const 
auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \ + dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \ + amrex_i_ec2.numBlocks.x), \ + amrex_i_ec3.numBlocks.x); \ + amrex_i_nblocks.y = 3; \ + if (amrex::Gpu::inFuseRegion() && amrex_i_nblocks.x*amrex_i_nblocks.y*amrex_i_ec1.numThreads.x <= amrex::Gpu::getFuseSizeThreshold()) { \ + amrex::Gpu::Register(amrex_i_tn1, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI1(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block1 \ + }); \ + amrex::Gpu::Register(amrex_i_tn2, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI2(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block2 \ + }); \ + amrex::Gpu::Register(amrex_i_tn3, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI3(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block3 \ + }); \ + } else { \ + AMREX_LAUNCH_KERNEL(amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ + [=] AMREX_GPU_DEVICE () noexcept { \ + switch (blockIdx.y) { \ + case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + break; \ + case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + break; \ + case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \ + block3 \ + } \ + } \ + }); \ + AMREX_GPU_ERROR_CHECK(); \ + } \ + } \ + else { \ + for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \ + block3 \ + } \ + }}} +#endif #endif #ifdef AMREX_USE_DPCPP @@ -452,6 +583,19 @@ } \ } +#define AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(n,i,block) \ +{ \ + auto const& amrex_i_n = n; \ + using amrex_i_inttype = typename std::remove_const::type; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusable(true),amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \ + } else { \ + auto amrex_i_lambda = [=] (amrex_i_inttype i) noexcept block; \ + AMREX_PRAGMA_SIMD \ + for (amrex_i_inttype i = 0; i < amrex_i_n; ++i) amrex_i_lambda(i); \ + } \ +} + #define AMREX_GPU_DEVICE_FOR_1D(n,i,block) \ { \ using amrex_i_inttype = typename std::remove_const::type; \ @@ -470,6 +614,16 @@ } \ } +#define AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(box,i,j,k,block) \ +{ \ + auto const& amrex_i_box = box; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusable(true),amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \ + } else { \ + amrex::LoopConcurrentOnCpu(amrex_i_box,[=] (int i, int j, int k) noexcept block); \ + } \ +} + #define AMREX_GPU_DEVICE_FOR_3D(box,i,j,k,block) \ { \ amrex::ParallelFor(box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \ @@ -488,6 +642,17 @@ } \ } +#define AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(box,ncomp,i,j,k,n,block) \ +{ \ + auto const& amrex_i_box = box; \ + auto const& amrex_i_ncomp = ncomp; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusable(true),amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \ + } else { \ + amrex::LoopConcurrentOnCpu(amrex_i_box,amrex_i_ncomp,[=] (int i, int j, int k, int n) noexcept block); \ + } \ +} + #define AMREX_GPU_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \ { \ 
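
These macros are normally reached through the dispatchers added in AMReX_GpuLaunch.H (AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA and the *_FOR_*_FUSABLE family), which degrade to the existing non-fusing macros when CUDA is not in use. A hedged usage sketch in the style of the call sites changed later in this patch; work_on and zero_fab are illustrative names:

    #include <AMReX.H>
    #include <AMReX_FArrayBox.H>
    #include <AMReX_Gpu.H>

    // Illustrative host/device fab function: zero a subbox.
    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    void work_on (amrex::Box const& tbx, amrex::Array4<amrex::Real> const& a)
    {
        const auto lo = amrex::lbound(tbx);
        const auto hi = amrex::ubound(tbx);
        for (int k = lo.z; k <= hi.z; ++k) {
        for (int j = lo.y; j <= hi.y; ++j) {
        for (int i = lo.x; i <= hi.x; ++i) {
            a(i,j,k) = 0.0;
        }}}
    }

    void zero_fab (amrex::FArrayBox& fab)
    {
        amrex::Box const& bx = fab.box();
        amrex::Array4<amrex::Real> const& arr = fab.array();
        // May be registered for fusing instead of launched, depending on
        // the fuse region and size threshold checked inside the macro.
        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(bx, tbx,
        {
            work_on(tbx, arr);
        });
    }
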
amrex::ParallelFor(box,ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \ diff --git a/Src/Base/AMReX_MFIter.H b/Src/Base/AMReX_MFIter.H index 3df4cc2f884..8cae1464acc 100644 --- a/Src/Base/AMReX_MFIter.H +++ b/Src/Base/AMReX_MFIter.H @@ -222,6 +222,10 @@ protected: mutable Vector > real_device_reduce_list; #endif +#ifdef AMREX_USE_GPU + std::unique_ptr gpu_fsg; +#endif + static int nextDynamicIndex; void Initialize (); diff --git a/Src/Base/AMReX_MFIter.cpp b/Src/Base/AMReX_MFIter.cpp index 3c07e07fe17..375fd3e578c 100644 --- a/Src/Base/AMReX_MFIter.cpp +++ b/Src/Base/AMReX_MFIter.cpp @@ -225,6 +225,9 @@ MFIter::~MFIter () AMREX_GPU_ERROR_CHECK(); Gpu::Device::resetStreamIndex(); Gpu::resetNumCallbacks(); + if (!OpenMP::in_parallel() && Gpu::inFuseRegion()) { + Gpu::LaunchFusedKernels(); + } #endif if (m_fa) { @@ -236,7 +239,7 @@ MFIter::~MFIter () } } -void +void MFIter::Initialize () { if (flags & SkipInit) { @@ -327,6 +330,11 @@ MFIter::Initialize () #ifdef AMREX_USE_GPU Gpu::Device::setStreamIndex((streams > 0) ? currentIndex%streams : -1); Gpu::resetNumCallbacks(); + if (!OpenMP::in_parallel()) { + if (index_map->size() >= Gpu::getFuseNumKernelsThreshold()) { + gpu_fsg.reset(new Gpu::FuseSafeGuard(true)); + } + } #endif typ = fabArray.boxArray().ixType(); diff --git a/Src/Base/AMReX_MultiFab.cpp b/Src/Base/AMReX_MultiFab.cpp index 08d945cdca2..a748bcd2eb8 100644 --- a/Src/Base/AMReX_MultiFab.cpp +++ b/Src/Base/AMReX_MultiFab.cpp @@ -43,16 +43,36 @@ MultiFab::Dot (const MultiFab& x, int xcomp, BL_PROFILE("MultiFab::Dot()"); - Real sm = amrex::ReduceSum(x, y, nghost, - [=] AMREX_GPU_HOST_DEVICE (Box const& bx, Array4 const& xfab, Array4 const& yfab) -> Real - { - Real t = 0.0; - AMREX_LOOP_4D(bx, numcomp, i, j, k, n, + Real sm = 0.0; +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + sm = amrex::ReduceSum(x, y, nghost, + [=] AMREX_GPU_DEVICE (Box const& bx, Array4 const& xfab, Array4 const& yfab) -> Real { - t += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + Real t = 0.0; + AMREX_LOOP_4D(bx, numcomp, i, j, k, n, + { + t += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + }); + return t; }); - return t; - }); + } else +#endif + { +#ifdef _OPENMP +#pragma omp parallel reduction(+:sm) +#endif + for (MFIter mfi(x,true); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.growntilebox(nghost); + Array4 const& xfab = x.const_array(mfi); + Array4 const& yfab = y.const_array(mfi); + AMREX_LOOP_4D(bx, numcomp, i, j, k, n, + { + sm += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + }); + } + } if (!local) ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub()); @@ -198,7 +218,7 @@ MultiFab::Swap (MultiFab& dst, MultiFab& src, if (bx.ok()) { auto sfab = src.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { const amrex::Real tmp = dfab(i,j,k,n+dstcomp); dfab(i,j,k,n+dstcomp) = sfab(i,j,k,n+srccomp); @@ -298,7 +318,7 @@ MultiFab::Saxpy (MultiFab& dst, Real a, const MultiFab& src, if (bx.ok()) { auto const sfab = src.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,dstcomp+n) += a * sfab(i,j,k,srccomp+n); }); @@ -332,7 +352,7 @@ MultiFab::Xpay (MultiFab& dst, Real a, const MultiFab& src, if (bx.ok()) { auto const sfab = src.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, 
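
With the MFIter changes, fusing is largely transparent at call sites: Initialize installs a Gpu::FuseSafeGuard(true) when it is not inside an OpenMP parallel region and the local box count meets Gpu::getFuseNumKernelsThreshold(), and the destructor flushes the queued kernels with Gpu::LaunchFusedKernels(). A hedged sketch of the resulting lifecycle; plus_one is an illustrative name:

    #include <AMReX.H>
    #include <AMReX_MultiFab.H>
    #include <AMReX_Gpu.H>

    void plus_one (amrex::MultiFab& mf)
    {
        using namespace amrex;
        // MFIter::Initialize may turn the fuse region on for this loop.
        for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
            Box const& bx = mfi.validbox();
            Array4<Real> const& a = mf.array(mfi);
            // Inside the fuse region this small kernel is queued rather
            // than launched, because it is marked as a candidate.
            ParallelFor(Gpu::KernelInfo{}.setFusable(true), bx,
            [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
            {
                a(i,j,k) += Real(1.0);
            });
        }
        // ~MFIter has run Gpu::LaunchFusedKernels() to flush the queue.
    }
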
j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,n+dstcomp) = sfab(i,j,k,n+srccomp) + a * dfab(i,j,k,n+dstcomp); }); @@ -374,7 +394,7 @@ MultiFab::LinComb (MultiFab& dst, auto const xfab = x.array(mfi); auto const yfab = y.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,dstcomp+n) = a*xfab(i,j,k,xcomp+n) + b*yfab(i,j,k,ycomp+n); }); @@ -415,7 +435,7 @@ MultiFab::AddProduct (MultiFab& dst, auto const s1fab = src1.array(mfi); auto const s2fab = src2.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,n+dstcomp) += s1fab(i,j,k,n+comp1) * s2fab(i,j,k,n+comp2); }); diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp index d312cb676cf..3acbeeabe5f 100644 --- a/Src/Base/AMReX_MultiFabUtil.cpp +++ b/Src/Base/AMReX_MultiFabUtil.cpp @@ -67,7 +67,7 @@ namespace amrex Array4 const& ccarr = cc.array(mfi); Array4 const& ndarr = nd.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avg_nd_to_cc(tbx, ccarr, ndarr, dcomp, scomp, ncomp); }); @@ -91,7 +91,7 @@ namespace amrex Array4 const& eyarr = edge[1]->const_array(mfi);, Array4 const& ezarr = edge[2]->const_array(mfi);); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avg_eg_to_cc(tbx, ccarr, AMREX_D_DECL(exarr,eyarr,ezarr), dcomp); }); @@ -133,12 +133,12 @@ namespace amrex Array4 const& fzarr = fc[2]->const_array(mfi);); #if (AMREX_SPACEDIM == 1) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, fxarr, dcomp, GeometryData()); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), dcomp); }); @@ -168,12 +168,12 @@ namespace amrex Array4 const& fzarr = fc[2]->const_array(mfi);); #if (AMREX_SPACEDIM == 1) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, fxarr, 0, gd); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), 0); }); @@ -218,12 +218,12 @@ namespace amrex Array4 const& ccarr = cc.const_array(mfi); #if (AMREX_SPACEDIM == 1) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (index_bounds, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (index_bounds, tbx, { amrex_avg_cc_to_fc(tbx, xbx, fxarr, ccarr, gd); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (index_bounds, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (index_bounds, tbx, { amrex_avg_cc_to_fc(tbx, AMREX_D_DECL(xbx,ybx,zbx), AMREX_D_DECL(fxarr,fyarr,fzarr), ccarr); @@ -288,7 +288,7 @@ namespace amrex Array4 const& finearr = S_fine.const_array(mfi); Array4 const& finevolarr = fvolume.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown_with_vol(tbx,crsearr,finearr,finevolarr, 0,scomp,ncomp,ratio); @@ -337,7 +337,7 @@ namespace amrex Array4 const& crsearr = crse_S_fine.array(mfi); Array4 const& finearr = S_fine.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, 
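
The MultiFab::Dot change above splits the reduction explicitly: a device-side amrex::ReduceSum when GPU launches are enabled, and an OpenMP-reduced MFIter loop otherwise, instead of one shared host-device lambda. This lets each side use its natural reduction mechanism. A hedged single-component sketch of the same pattern; dot0 is an illustrative name and the MPI reduction is left to the caller:

    #include <AMReX.H>
    #include <AMReX_MultiFab.H>
    #include <AMReX_FabArrayUtility.H>

    amrex::Real dot0 (amrex::MultiFab const& x, amrex::MultiFab const& y)
    {
        using namespace amrex;
        Real sm = 0.0;
    #ifdef AMREX_USE_GPU
        if (Gpu::inLaunchRegion()) {
            sm = ReduceSum(x, y, 0,
                [=] AMREX_GPU_DEVICE (Box const& bx,
                                      Array4<Real const> const& xf,
                                      Array4<Real const> const& yf) -> Real
                {
                    Real t = 0.0;
                    AMREX_LOOP_3D(bx, i, j, k,
                    {
                        t += xf(i,j,k) * yf(i,j,k);
                    });
                    return t;
                });
        } else
    #endif
        {
    #ifdef _OPENMP
    #pragma omp parallel reduction(+:sm)
    #endif
            for (MFIter mfi(x,true); mfi.isValid(); ++mfi) {
                Box const& bx = mfi.tilebox();
                Array4<Real const> const& xf = x.const_array(mfi);
                Array4<Real const> const& yf = y.const_array(mfi);
                AMREX_LOOP_3D(bx, i, j, k,
                {
                    sm += xf(i,j,k) * yf(i,j,k);
                });
            }
        }
        return sm; // caller is responsible for the MPI reduction
    }
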
+ AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown(tbx,crsearr,finearr,0,scomp,ncomp,ratio); }); @@ -375,12 +375,12 @@ namespace amrex Array4 const& finearr = S_fine.const_array(mfi); if (is_cell_centered) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown(tbx,crsearr,finearr,scomp,scomp,ncomp,ratio); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown_nodes(tbx,crsearr,finearr,scomp,scomp,ncomp,ratio); }); @@ -406,12 +406,12 @@ namespace amrex // not part of the actual crse multifab which came in. if (is_cell_centered) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown(tbx,crsearr,finearr,0,scomp,ncomp,ratio); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown_nodes(tbx,crsearr,finearr,0,scomp,ncomp,ratio); }); @@ -486,7 +486,7 @@ namespace amrex Array4 const& crsearr = crse.array(mfi); Array4 const& finearr = fine.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown_faces(tbx, crsearr, finearr, 0, 0, ncomp, ratio, dir); }); @@ -568,7 +568,7 @@ namespace amrex Array4 const& crsearr = crse.array(mfi); Array4 const& finearr = fine.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { amrex_avgdown_edges(tbx, crsearr, finearr, 0, 0, ncomp, ratio, dir); }); @@ -640,7 +640,7 @@ namespace amrex if (interpolate) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tile_box, thread_box, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( tile_box, thread_box, { amrex_fill_slice_interp(thread_box, slice_arr, full_arr, 0, start_comp, ncomp, diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index beaeac8a218..aaef79e9253 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -309,6 +309,22 @@ public: Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(*dp,r, gh); }); #else +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && Gpu::inFuseReductionRegion() + && ec.numBlocks.x*ec.numThreads.x <= Gpu::getFuseSizeThreshold()) + { + Gpu::Register(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + ReduceTuple r = *(dp+1); + if (box.contains(IntVect(AMREX_D_DECL(i,j,k)))) { + auto pr = Reduce::detail::call_f(f,i,j,k,ixtype); + Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr); + } + Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(*dp,r); + }); + } else +#endif + { amrex::launch(ec.numBlocks.x, ec.numThreads.x, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { ReduceTuple r = *(dp+1); @@ -325,6 +341,7 @@ public: } Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(*dp,r); }); + } #endif } diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 17d043fa097..081ecf696ed 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -175,6 +175,7 @@ target_sources( amrex # GPU -------------------------------------------------------------------- AMReX_Gpu.H AMReX_GpuQualifiers.H + AMReX_GpuKernelInfo.H AMReX_GpuPrint.H AMReX_GpuAssert.H AMReX_GpuTypes.H diff --git a/Src/Base/Make.package b/Src/Base/Make.package index c97b264374d..edea7af44e0 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -66,6 +66,7 @@ C$(AMREX_BASE)_headers += AMReX_parstream.H C$(AMREX_BASE)_sources += 
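
The Reduce.H change registers the per-cell reduction body with Gpu::Register when both Gpu::inFuseRegion() and Gpu::inFuseReductionRegion() hold and the launch is small enough; caller-side ReduceOps usage is unchanged. A hedged sketch of that caller-side pattern; sum_box is an illustrative name:

    #include <AMReX.H>
    #include <AMReX_FArrayBox.H>
    #include <AMReX_Reduce.H>

    amrex::Real sum_box (amrex::Box const& bx,
                         amrex::Array4<amrex::Real const> const& a)
    {
        using namespace amrex;
        ReduceOps<ReduceOpSum> reduce_op;
        ReduceData<Real> reduce_data(reduce_op);
        using ReduceTuple = typename decltype(reduce_data)::Type;
        // Inside a fuse-reduction region this eval may be queued with
        // Gpu::Register instead of launching its own kernel.
        reduce_op.eval(bx, reduce_data,
        [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
        {
            return a(i,j,k);
        });
        return amrex::get<0>(reduce_data.value());
    }
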
AMReX_parstream.cpp C$(AMREX_BASE)_headers += AMReX_Gpu.H AMReX_GpuQualifiers.H AMReX_GpuPrint.H AMReX_GpuAssert.H AMReX_GpuTypes.H AMReX_GpuError.H +C$(AMREX_BASE)_headers += AMReX_GpuKernelInfo.H C$(AMREX_BASE)_headers += AMReX_GpuLaunchMacrosG.H AMReX_GpuLaunchFunctsG.H C$(AMREX_BASE)_headers += AMReX_GpuLaunchMacrosC.H AMReX_GpuLaunchFunctsC.H diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp index d1cbedfedea..e641313c7a5 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp @@ -410,13 +410,13 @@ MLABecLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& i const auto& bzfab = bzcoef.array(mfi);); if (m_overset_mask[amrlev][mglev]) { const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { mlabeclap_adotx_os(tbx, yfab, xfab, afab, AMREX_D_DECL(bxfab,byfab,bzfab), osm, dxinv, ascalar, bscalar, ncomp); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { mlabeclap_adotx(tbx, yfab, xfab, afab, AMREX_D_DECL(bxfab,byfab,bzfab), dxinv, ascalar, bscalar, ncomp); @@ -454,7 +454,7 @@ MLABecLaplacian::normalize (int amrlev, int mglev, MultiFab& mf) const const auto& byfab = bycoef.array(mfi);, const auto& bzfab = bzcoef.array(mfi);); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { mlabeclap_normalize(tbx, fab, afab, AMREX_D_DECL(bxfab,byfab,bzfab), dxinv, ascalar, bscalar, ncomp); @@ -609,7 +609,7 @@ MLABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& #else if (m_overset_mask[amrlev][mglev]) { const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, thread_box, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( tbx, thread_box, { abec_gsrb_os(thread_box, solnfab, rhsfab, alpha, afab, AMREX_D_DECL(dhx, dhy, dhz), @@ -621,7 +621,7 @@ MLABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& osm, vbx, redblack, nc); }); } else if (regular_coarsening) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, thread_box, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( tbx, thread_box, { abec_gsrb(thread_box, solnfab, rhsfab, alpha, afab, AMREX_D_DECL(dhx, dhy, dhz), @@ -688,7 +688,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, Real fac = bscalar*dxinv[0]; Box blo = amrex::bdryLo(box, 0); int blen = box.length(0); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( blo, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tbox, { mlabeclap_flux_xface(tbox, fxarr, solarr, bx, fac, blen, ncomp); }); @@ -696,7 +696,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, fac = bscalar*dxinv[1]; blo = amrex::bdryLo(box, 1); blen = box.length(1); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( blo, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tbox, { mlabeclap_flux_yface(tbox, fyarr, solarr, by, fac, blen, ncomp); }); @@ -705,7 +705,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, fac = bscalar*dxinv[2]; blo = amrex::bdryLo(box, 2); blen = box.length(2); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( blo, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tbox, { mlabeclap_flux_zface(tbox, fzarr, solarr, bz, fac, blen, ncomp); }); @@ -715,14 +715,14 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, { 
Real fac = bscalar*dxinv[0]; Box bflux = amrex::surroundingNodes(box, 0); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bflux, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bflux, tbox, { mlabeclap_flux_x(tbox, fxarr, solarr, bx, fac, ncomp); }); #if (AMREX_SPACEDIM >= 2) fac = bscalar*dxinv[1]; bflux = amrex::surroundingNodes(box, 1); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bflux, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bflux, tbox, { mlabeclap_flux_y(tbox, fyarr, solarr, by, fac, ncomp); }); @@ -730,7 +730,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, #if (AMREX_SPACEDIM == 3) fac = bscalar*dxinv[2]; bflux = amrex::surroundingNodes(box, 2); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bflux, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bflux, tbox, { mlabeclap_flux_z(tbox, fzarr, solarr, bz, fac, ncomp); }); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp index aadc9ee0106..c854898a693 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp @@ -314,7 +314,7 @@ MLCellLinOp::interpolation (int amrlev, int fmglev, MultiFab& fine, const MultiF const Box& bx = mfi.tilebox(); Array4 const& cfab = crse.const_array(mfi); Array4 const& ffab = fine.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, { int ic = amrex::coarsen(i,ratio3.x); int jc = amrex::coarsen(j,ratio3.y); @@ -505,7 +505,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State const Real bcllo = bdlv[icomp][olo]; const Real bclhi = bdlv[icomp][ohi]; if (idim == 0) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tboxlo, { mllinop_apply_bc_x(0, tboxlo, blen, iofab, mlo, bctlo, bcllo, bvlo, @@ -517,7 +517,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State imaxorder, dxi, flagbc, icomp); }); } else if (idim == 1) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tboxlo, { mllinop_apply_bc_y(0, tboxlo, blen, iofab, mlo, bctlo, bcllo, bvlo, @@ -529,7 +529,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State imaxorder, dyi, flagbc, icomp); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tboxlo, { mllinop_apply_bc_z(0, tboxlo, blen, iofab, mlo, bctlo, bcllo, bvlo, @@ -619,6 +619,7 @@ MLCellLinOp::reflux (int crse_amrlev, AMREX_D_TERM(Elixir elifx = flux[0].elixir();, Elixir elify = flux[1].elixir();, Elixir elifz = flux[2].elixir();); + Gpu::FuseSafeGuard fsg(false); // Turn off fusing in FFlux FFlux(crse_amrlev, mfi, pflux, crse_sol[mfi], Location::FaceCentroid); fluxreg.CrseAdd(mfi, cpflux, crse_dx, dt, RunOn::Gpu); } @@ -640,6 +641,7 @@ MLCellLinOp::reflux (int crse_amrlev, AMREX_D_TERM(Elixir elifx = flux[0].elixir();, Elixir elify = flux[1].elixir();, Elixir elifz = flux[2].elixir();); + Gpu::FuseSafeGuard fsg(false); // Turn off fusing in FFlux FFlux(fine_amrlev, mfi, pflux, fine_sol[mfi], Location::FaceCentroid, face_only); fluxreg.FineAdd(mfi, cpflux, fine_dx, dt, RunOn::Gpu); } @@ -678,6 +680,7 @@ MLCellLinOp::compFlux (int amrlev, const Array& fluxes AMREX_D_TERM(Elixir elifx = flux[0].elixir();, Elixir elify = flux[1].elixir();, Elixir elifz = flux[2].elixir();); + Gpu::FuseSafeGuard fsg(false); // Turn off fusing in FFlux FFlux(amrlev, mfi, pflux, sol[mfi], loc); for (int idim = 0; 
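
FuseSafeGuard appears here in its other role: constructed with false it forces fusing off for the current scope, restoring the previous state on destruction (the RAII behavior its scoped use in reflux and compFlux relies on). Suppressing fusing matters there because the FFlux results are consumed immediately afterwards by the flux-register update in the same iteration, so those kernels must not be deferred into the fuse queue. A minimal sketch of the scoped use:

    #include <AMReX_Gpu.H>

    void no_fuse_section ()
    {
        amrex::Gpu::FuseSafeGuard fsg(false);
        // Kernels issued in this scope launch immediately, even if an
        // enclosing MFIter loop has turned the fuse region on; the prior
        // state is restored when fsg goes out of scope.
    }
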
idim < AMREX_SPACEDIM; ++idim) { const Box& nbx = mfi.nodaltilebox(idim); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp index 5ad48f59131..aba34c49dd5 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp @@ -77,30 +77,30 @@ MLPoisson::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& in) con const auto& yfab = out.array(mfi); #if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k, { mlpoisson_adotx(i, j, k, yfab, xfab, dhx, dhy, dhz); }); #elif (AMREX_SPACEDIM == 2) if (m_has_metric_term) { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k, { mlpoisson_adotx_m(i, j, yfab, xfab, dhx, dhy, dx, probxlo); }); } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k, { mlpoisson_adotx(i, j, yfab, xfab, dhx, dhy); }); } #elif (AMREX_SPACEDIM == 1) if (m_has_metric_term) { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k, { mlpoisson_adotx_m(i, yfab, xfab, dhx, dx, probxlo); }); } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k, { mlpoisson_adotx(i, yfab, xfab, dhx); }); @@ -134,12 +134,12 @@ MLPoisson::normalize (int amrlev, int mglev, MultiFab& mf) const const auto& fab = mf.array(mfi); #if (AMREX_SPACEDIM == 2) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { mlpoisson_normalize(tbx, fab, dhx, dhy, dx, probxlo); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx, { mlpoisson_normalize(tbx, fab, dhx, dx, probxlo); }); From 2635ca043b7b8e7d700d24f7c32645d5706a6bf9 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Tue, 15 Sep 2020 09:20:52 -0700 Subject: [PATCH 2/3] fix warnings --- Src/Base/AMReX_GpuLaunchFunctsG.H | 395 +++++++++++++++--------------- 1 file changed, 201 insertions(+), 194 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index d4a1841ce73..a9d3ee2bd2d 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -94,7 +94,7 @@ void launch (T const& n, L&& f) noexcept } template ::value> > -void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); @@ -122,7 +122,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept } template -void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -159,7 +159,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept } template ::value> > -void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -198,7 +198,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n } template -void ParallelFor (Gpu::KernelInfo const& info, Box const& box1, Box const& box2, L1&& 
f1, L2&& f2) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size ParallelFor(box1, std::forward(f1)); @@ -255,7 +255,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box1, Box const& box2, } template -void ParallelFor (Gpu::KernelInfo const& info, +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { @@ -329,7 +329,7 @@ void ParallelFor (Gpu::KernelInfo const& info, template ::value>, typename M2=amrex::EnableIf_t::value> > -void ParallelFor (Gpu::KernelInfo const& info, +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { @@ -395,7 +395,7 @@ template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > -void ParallelFor (Gpu::KernelInfo const& info, +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept @@ -641,15 +641,16 @@ ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept } else #endif { - const auto ec = Gpu::ExecutionConfig(n); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - i < n; i += stride) { - f(i); - } - }); - AMREX_GPU_ERROR_CHECK(); + amrex::ignore_unused(info); + const auto ec = Gpu::ExecutionConfig(n); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + i < n; i += stride) { + f(i); + } + }); + AMREX_GPU_ERROR_CHECK(); } } @@ -665,23 +666,24 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept } else #endif { - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - i += lo.x; - j += lo.y; - k += lo.z; - f(i,j,k); - } - }); - AMREX_GPU_ERROR_CHECK(); + amrex::ignore_unused(info); + const auto lo = amrex::lbound(box); + const auto len = amrex::length(box); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + f(i,j,k); + } + }); + AMREX_GPU_ERROR_CHECK(); } } @@ -697,25 +699,26 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexce } else #endif { - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () 
noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - i += lo.x; - j += lo.y; - k += lo.z; - for (T n = 0; n < ncomp; ++n) { - f(i,j,k,n); + amrex::ignore_unused(info); + const auto lo = amrex::lbound(box); + const auto len = amrex::length(box); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + for (T n = 0; n < ncomp; ++n) { + f(i,j,k,n); + } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); } } @@ -735,36 +738,37 @@ ParallelFor (Gpu::KernelInfo const& info, } else #endif { - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - f1(i,j,k); - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - f2(i,j,k); + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + f1(i,j,k); + } + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + f2(i,j,k); + } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); } } @@ -787,47 +791,48 @@ ParallelFor (Gpu::KernelInfo const& info, } else #endif { - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) 
{ - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - f1(i,j,k); - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - f2(i,j,k); - } - if (icell < ncells3) { - int k = icell / (len3.x*len3.y); - int j = (icell - k*(len3.x*len3.y)) / len3.x; - int i = (icell - k*(len3.x*len3.y)) - j*len3.x; - i += lo3.x; - j += lo3.y; - k += lo3.z; - f3(i,j,k); + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto lo3 = amrex::lbound(box3); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto len3 = amrex::length(box3); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + f1(i,j,k); + } + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + f2(i,j,k); + } + if (icell < ncells3) { + int k = icell / (len3.x*len3.y); + int j = (icell - k*(len3.x*len3.y)) / len3.x; + int i = (icell - k*(len3.x*len3.y)) - j*len3.x; + i += lo3.x; + j += lo3.y; + k += lo3.z; + f3(i,j,k); + } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); } } @@ -850,40 +855,41 @@ ParallelFor (Gpu::KernelInfo const& info, } else #endif { - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - for (T1 n = 0; n < ncomp1; ++n) { - f1(i,j,k,n); + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + for (T1 n = 0; n < ncomp1; ++n) { + f1(i,j,k,n); + } } - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = 
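
Patch 2 quiets unused-parameter warnings in two ways: overloads whose bodies never consult the KernelInfo comment the parameter name out (/*info*/), while bodies that read it only in the CUDA-specific branch call amrex::ignore_unused(info) in the generic branch (the re-indentation of those branches accounts for most of this diff). A hedged sketch of the second idiom; g and fusible_hint are illustrative names:

    #include <AMReX.H>

    void g (bool fusible_hint)
    {
    #ifdef AMREX_USE_CUDA
        if (fusible_hint) {
            // CUDA-only fuse path would go here
        } else
    #endif
        {
            // The generic path does not read the parameter, so mark it
            // used to keep -Wunused-parameter quiet in non-CUDA builds.
            amrex::ignore_unused(fusible_hint);
        }
    }
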
(icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - for (T2 n = 0; n < ncomp2; ++n) { - f2(i,j,k,n); + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + for (T2 n = 0; n < ncomp2; ++n) { + f2(i,j,k,n); + } } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); } } @@ -910,53 +916,54 @@ ParallelFor (Gpu::KernelInfo const& info, } else #endif { - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - for (T1 n = 0; n < ncomp1; ++n) { - f1(i,j,k,n); + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto lo3 = amrex::lbound(box3); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto len3 = amrex::length(box3); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + for (T1 n = 0; n < ncomp1; ++n) { + f1(i,j,k,n); + } } - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - for (T2 n = 0; n < ncomp2; ++n) { - f2(i,j,k,n); + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + for (T2 n = 0; n < ncomp2; ++n) { + f2(i,j,k,n); + } } - } - if (icell < ncells3) { - int k = icell / (len3.x*len3.y); - int j = (icell - k*(len3.x*len3.y)) / len3.x; - int i = (icell - k*(len3.x*len3.y)) - j*len3.x; - i += lo3.x; - j += lo3.y; - k += lo3.z; - for (T3 n = 0; n < ncomp3; ++n) { - f3(i,j,k,n); + if (icell < ncells3) { + int k = icell / (len3.x*len3.y); + int j = (icell - k*(len3.x*len3.y)) / len3.x; + int i = (icell - k*(len3.x*len3.y)) - j*len3.x; + i += lo3.x; + j += lo3.y; + k += lo3.z; + for (T3 n = 0; n < ncomp3; ++n) { + f3(i,j,k,n); + } } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); } } From 3657b11d00aed5a5d519b4c0a8f7bd5c746d7f0f Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Tue, 15 Sep 2020 10:01:39 -0700 Subject: [PATCH 3/3] fusable -> fusible --- Src/Base/AMReX_FBI.H | 4 +-- Src/Base/AMReX_FabArray.H | 20 +++++------ Src/Base/AMReX_FabArrayUtility.H | 14 ++++---- Src/Base/AMReX_GpuKernelInfo.H | 6 
++-- Src/Base/AMReX_GpuLaunch.H | 34 +++++++++---------- Src/Base/AMReX_GpuLaunchFunctsG.H | 14 ++++---- Src/Base/AMReX_GpuLaunchMacrosG.H | 18 +++++----- Src/Base/AMReX_MultiFab.cpp | 10 +++--- Src/Base/AMReX_MultiFabUtil.cpp | 34 +++++++++---------- .../MLMG/AMReX_MLABecLaplacian.cpp | 22 ++++++------ Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp | 8 ++--- Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp | 14 ++++---- 12 files changed, 99 insertions(+), 99 deletions(-) diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H index 6748a591613..970f63421e5 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -688,7 +688,7 @@ FabArray::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp) BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(Gpu::KernelInfo{}.setFusable(true), bx, + amrex::ParallelFor(Gpu::KernelInfo{}.setFusible(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; @@ -1237,7 +1237,7 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(Gpu::KernelInfo().setFusable(true), bx, + amrex::ParallelFor(Gpu::KernelInfo().setFusible(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 8ae7b66bd97..db0b2073d2f 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -1582,7 +1582,7 @@ FabArray::setVal (value_type val, { const Box& bx = fai.growntilebox(nghost); auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1625,7 +1625,7 @@ FabArray::setVal (value_type val, if (b.ok()) { auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( b, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( b, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1655,7 +1655,7 @@ FabArray::abs (int comp, int ncomp, const IntVect& nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = amrex::Math::abs(fab(i,j,k,n+comp)); }); @@ -1674,7 +1674,7 @@ FabArray::plus (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; }); @@ -1694,7 +1694,7 @@ FabArray::plus (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; }); @@ -1714,7 +1714,7 @@ FabArray::mult (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1734,7 +1734,7 @@ 
FabArray::mult (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1754,7 +1754,7 @@ FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1774,7 +1774,7 @@ FabArray::invert (value_type numerator, const Box& region, int comp, int nu const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1970,7 +1970,7 @@ FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, Box const& fbx = mfi.growntilebox(); Box const& gbx = fbx & domain; Box const& vbx = mfi.validbox(); - AMREX_HOST_DEVICE_FOR_4D_FUSABLE(fbx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(fbx, ncomp, i, j, k, n, { IntVect iv(AMREX_D_DECL(i,j,k)); if (vbx.contains(iv)) { diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H index c7075d48520..869fd172d89 100644 --- a/Src/Base/AMReX_FabArrayUtility.H +++ b/Src/Base/AMReX_FabArrayUtility.H @@ -1469,7 +1469,7 @@ Add (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, int { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) += srcFab(i,j,k,n+srccomp); }); @@ -1501,7 +1501,7 @@ Copy (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, in { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,dstcomp+n) = srcFab(i,j,k,srccomp+n); }); @@ -1533,7 +1533,7 @@ Subtract (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) -= srcFab(i,j,k,n+srccomp); }); @@ -1565,7 +1565,7 @@ Multiply (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) *= srcFab(i,j,k,n+srccomp); }); @@ -1597,7 +1597,7 @@ Divide (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) /= srcFab(i,j,k,n+srccomp); }); @@ -1627,7 +1627,7 @@ Abs (FabArray& fa, int icomp, int numcomp, const 
IntVect& nghost) if (bx.ok()) { auto const& fab = fa.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { fab(i,j,k,n+icomp) = amrex::Math::abs(fab(i,j,k,n+icomp)); }); @@ -1684,7 +1684,7 @@ OverrideSync (FabArray & fa, FabArray const& msk, const Periodicity& const Box& bx = mfi.tilebox(); auto fab = fa.array(mfi); auto const ifab = msk.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { if (!ifab(i,j,k)) fab(i,j,k,n) = 0; }); diff --git a/Src/Base/AMReX_GpuKernelInfo.H b/Src/Base/AMReX_GpuKernelInfo.H index 2eac4054e63..94e125e5a0c 100644 --- a/Src/Base/AMReX_GpuKernelInfo.H +++ b/Src/Base/AMReX_GpuKernelInfo.H @@ -7,10 +7,10 @@ namespace Gpu { class KernelInfo { public: - KernelInfo& setFusable (bool flag) { fusable = flag; return *this; } - bool isFusable () const { return fusable; } + KernelInfo& setFusible (bool flag) { fusible = flag; return *this; } + bool isFusible () const { return fusible; } private: - bool fusable = false; + bool fusible = false; }; }} diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index 5305139f636..d6c35972427 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -190,18 +190,18 @@ namespace Gpu { AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) #ifdef AMREX_USE_CUDA -#define AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\ - AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_3, \ +#define AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_3, \ AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS, \ - AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_2, \ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_2, \ AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS, \ - AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE, \ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE, \ AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) #else -#define AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA(__VA_ARGS__) +#define AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA(__VA_ARGS__) #endif #if (AMREX_SPACEDIM == 1) @@ -232,19 +232,19 @@ namespace Gpu { #define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__) #ifdef AMREX_USE_CUDA -#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(__VA_ARGS__) -#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(__VA_ARGS__) -#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(__VA_ARGS__) -#define AMREX_HOST_DEVICE_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(__VA_ARGS__) -#define AMREX_HOST_DEVICE_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(__VA_ARGS__) -#define AMREX_HOST_DEVICE_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_1D_FUSIBLE(...) 
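
After this rename the user-facing spelling is setFusible/isFusible, and the macro family becomes *_FUSIBLE with the same non-CUDA fallbacks. A hedged sketch of the run-time gate the launch functions apply, mirroring the checks in AMReX_GpuLaunchFunctsG.H; should_fuse is an illustrative name:

    #include <AMReX.H>
    #include <AMReX_Gpu.H>

    bool should_fuse (amrex::Gpu::KernelInfo const& info, int ncells)
    {
    #ifdef AMREX_USE_CUDA
        return amrex::Gpu::inFuseRegion() && info.isFusible()
            && ncells <= amrex::Gpu::getFuseSizeThreshold();
    #else
        amrex::ignore_unused(info, ncells);
        return false;
    #endif
    }
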
diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H
index 5305139f636..d6c35972427 100644
--- a/Src/Base/AMReX_GpuLaunch.H
+++ b/Src/Base/AMReX_GpuLaunch.H
@@ -190,18 +190,18 @@ namespace Gpu {
                          AMREX_WRONG_NUM_ARGS)(__VA_ARGS__)
 #ifdef AMREX_USE_CUDA
-#define AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\
-        AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_3, \
+#define AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\
+        AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_3, \
         AMREX_WRONG_NUM_ARGS, \
         AMREX_WRONG_NUM_ARGS, \
-        AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_2, \
+        AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_2, \
         AMREX_WRONG_NUM_ARGS, \
         AMREX_WRONG_NUM_ARGS, \
-        AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE, \
+        AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE, \
         AMREX_WRONG_NUM_ARGS, \
         AMREX_WRONG_NUM_ARGS)(__VA_ARGS__)
 #else
-#define AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA(__VA_ARGS__)
+#define AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA(__VA_ARGS__)
 #endif
 #if (AMREX_SPACEDIM == 1)
@@ -232,19 +232,19 @@ namespace Gpu {
 #define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__)
 #ifdef AMREX_USE_CUDA
-#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(__VA_ARGS__)
 #else
-#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSABLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_1D(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_FOR_1D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_FOR_3D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D(__VA_ARGS__)
-#define AMREX_HOST_DEVICE_FOR_4D_FUSABLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSIBLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_1D(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D(__VA_ARGS__)
+#define AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D(__VA_ARGS__)
 #endif
 #ifdef AMREX_USE_GPU
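// Editor's sketch, not part of the patch: a typical call site for the renamed
// macros. On non-CUDA builds the _FUSIBLE names simply forward to the plain
// macros, as the #else branch above shows. `fa` (a FabArray<FArrayBox>) and
// `ncomp` are assumed to exist.
for (amrex::MFIter mfi(fa, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
    const amrex::Box& bx = mfi.tilebox();
    auto const& arr = fa.array(mfi);
    // Same syntax as the non-fusible macro; only the name differs.
    AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n,
    {
        arr(i,j,k,n) += 1.0;
    });
}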
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index a9d3ee2bd2d..295d3320380 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -636,7 +636,7 @@ ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept
 {
     if (amrex::isEmpty(n)) return;
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && n <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && n <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(n, f);
     } else
 #endif
@@ -661,7 +661,7 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept
     if (amrex::isEmpty(box)) return;
     int ncells = box.numPts();
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(box, f);
     } else
 #endif
@@ -694,7 +694,7 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexce
     if (amrex::isEmpty(box)) return;
     int ncells = box.numPts();
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(box, ncomp, f);
     } else
 #endif
@@ -732,7 +732,7 @@ ParallelFor (Gpu::KernelInfo const& info,
     int ncells2 = box2.numPts();
     int ncells = amrex::max(ncells1, ncells2);
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(box1, f1);
         Gpu::Register(box2, f2);
     } else
@@ -784,7 +784,7 @@ ParallelFor (Gpu::KernelInfo const& info,
     int ncells3 = box3.numPts();
     int ncells = amrex::max(ncells1, ncells2, ncells3);
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(box1, f1);
         Gpu::Register(box2, f2);
         Gpu::Register(box3, f3);
@@ -849,7 +849,7 @@ ParallelFor (Gpu::KernelInfo const& info,
     int ncells2 = box2.numPts();
     int ncells = amrex::max(ncells1, ncells2);
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(box1, ncomp1, f1);
         Gpu::Register(box2, ncomp2, f2);
     } else
@@ -909,7 +909,7 @@ ParallelFor (Gpu::KernelInfo const& info,
     int ncells3 = box3.numPts();
     int ncells = amrex::max(ncells1, ncells2, ncells3);
 #ifdef AMREX_USE_CUDA
-    if (Gpu::inFuseRegion() && info.isFusable() && ncells <= Gpu::getFuseSizeThreshold()) {
+    if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) {
         Gpu::Register(box1, ncomp1, f1);
         Gpu::Register(box2, ncomp2, f2);
         Gpu::Register(box3, ncomp3, f3);
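// Editor's note: every ParallelFor overload above repeats one guard. Restated
// outside the library for clarity (names as in the diff): a kernel is queued
// for fusing only when a fuse region is open, the caller opted in, and the
// box is small; otherwise it launches immediately as before.
if (Gpu::inFuseRegion() && info.isFusible()
    && ncells <= Gpu::getFuseSizeThreshold()) {
    Gpu::Register(box, f);   // deferred; later launched as one fused kernel
} else {
    // normal immediate launch path (unchanged by this patch)
}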
diff --git a/Src/Base/AMReX_GpuLaunchMacrosG.H b/Src/Base/AMReX_GpuLaunchMacrosG.H
index bb69202a9d9..45400858abd 100644
--- a/Src/Base/AMReX_GpuLaunchMacrosG.H
+++ b/Src/Base/AMReX_GpuLaunchMacrosG.H
@@ -54,7 +54,7 @@
         } \
     }}}
 #ifdef AMREX_USE_CUDA
-#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE(TN,TI,block) \
+#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE(TN,TI,block) \
     { auto const& amrex_i_tn = TN; \
     if (!amrex::isEmpty(amrex_i_tn)) { \
     if (amrex::Gpu::inLaunchRegion()) \
@@ -163,7 +163,7 @@
         } \
     }}}
 #ifdef AMREX_USE_CUDA
-#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
+#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \
     { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \
     if (!amrex::isEmpty(amrex_i_tn1) or !amrex::isEmpty(amrex_i_tn2)) { \
     if (amrex::Gpu::inLaunchRegion()) \
@@ -307,7 +307,7 @@
         } \
     }}}
 #ifdef AMREX_USE_CUDA
-#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
+#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \
     { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \
     if (!amrex::isEmpty(amrex_i_tn1) or !amrex::isEmpty(amrex_i_tn2) or !amrex::isEmpty(amrex_i_tn3)) { \
     if (amrex::Gpu::inLaunchRegion()) \
@@ -583,12 +583,12 @@
         } \
     }
-#define AMREX_GPU_HOST_DEVICE_FOR_1D_FUSABLE(n,i,block) \
+#define AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(n,i,block) \
     { \
         auto const& amrex_i_n = n; \
         using amrex_i_inttype = typename std::remove_const<decltype(amrex_i_n)>::type; \
         if (amrex::Gpu::inLaunchRegion()) { \
-            amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusable(true),amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
+            amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusible(true),amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \
         } else { \
             auto amrex_i_lambda = [=] (amrex_i_inttype i) noexcept block; \
             AMREX_PRAGMA_SIMD \
@@ -614,11 +614,11 @@
         } \
     }
-#define AMREX_GPU_HOST_DEVICE_FOR_3D_FUSABLE(box,i,j,k,block) \
+#define AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(box,i,j,k,block) \
     { \
         auto const& amrex_i_box = box; \
         if (amrex::Gpu::inLaunchRegion()) { \
-            amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusable(true),amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
+            amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusible(true),amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \
         } else { \
             amrex::LoopConcurrentOnCpu(amrex_i_box,[=] (int i, int j, int k) noexcept block); \
         } \
@@ -642,12 +642,12 @@
         } \
     }
-#define AMREX_GPU_HOST_DEVICE_FOR_4D_FUSABLE(box,ncomp,i,j,k,n,block) \
+#define AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(box,ncomp,i,j,k,n,block) \
     { \
         auto const& amrex_i_box = box; \
         auto const& amrex_i_ncomp = ncomp; \
         if (amrex::Gpu::inLaunchRegion()) { \
-            amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusable(true),amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
+            amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusible(true),amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \
         } else { \
             amrex::LoopConcurrentOnCpu(amrex_i_box,amrex_i_ncomp,[=] (int i, int j, int k, int n) noexcept block); \
         } \
     }
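// Editor's sketch, not part of the patch: the CPU fallbacks above mean a
// _FUSIBLE call site is safe on both paths. `n` (element count) and the
// device-accessible pointer `p` are assumed.
AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(n, i,
{
    p[i] *= 2.0;  // GPU: (possibly fused) kernel; CPU: SIMD loop per the #else
});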
diff --git a/Src/Base/AMReX_MultiFab.cpp b/Src/Base/AMReX_MultiFab.cpp
index a748bcd2eb8..2ce916e627e 100644
--- a/Src/Base/AMReX_MultiFab.cpp
+++ b/Src/Base/AMReX_MultiFab.cpp
@@ -218,7 +218,7 @@ MultiFab::Swap (MultiFab& dst, MultiFab& src,
         if (bx.ok()) {
             auto sfab = src.array(mfi);
             auto dfab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 const amrex::Real tmp = dfab(i,j,k,n+dstcomp);
                 dfab(i,j,k,n+dstcomp) = sfab(i,j,k,n+srccomp);
@@ -318,7 +318,7 @@ MultiFab::Saxpy (MultiFab& dst, Real a, const MultiFab& src,
         if (bx.ok()) {
             auto const sfab = src.array(mfi);
             auto dfab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dfab(i,j,k,dstcomp+n) += a * sfab(i,j,k,srccomp+n);
             });
@@ -352,7 +352,7 @@ MultiFab::Xpay (MultiFab& dst, Real a, const MultiFab& src,
         if (bx.ok()) {
             auto const sfab = src.array(mfi);
             auto dfab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dfab(i,j,k,n+dstcomp) = sfab(i,j,k,n+srccomp) + a * dfab(i,j,k,n+dstcomp);
             });
@@ -394,7 +394,7 @@ MultiFab::LinComb (MultiFab& dst,
             auto const xfab = x.array(mfi);
             auto const yfab = y.array(mfi);
             auto dfab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dfab(i,j,k,dstcomp+n) = a*xfab(i,j,k,xcomp+n) + b*yfab(i,j,k,ycomp+n);
             });
@@ -435,7 +435,7 @@ MultiFab::AddProduct (MultiFab& dst,
             auto const s1fab = src1.array(mfi);
             auto const s2fab = src2.array(mfi);
             auto dfab = dst.array(mfi);
-            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, numcomp, i, j, k, n,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n,
             {
                 dfab(i,j,k,n+dstcomp) += s1fab(i,j,k,n+comp1) * s2fab(i,j,k,n+comp2);
             });
diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp
index 3acbeeabe5f..fb436d6dcaf 100644
--- a/Src/Base/AMReX_MultiFabUtil.cpp
+++ b/Src/Base/AMReX_MultiFabUtil.cpp
@@ -67,7 +67,7 @@ namespace amrex
             Array4<Real> const& ccarr = cc.array(mfi);
             Array4<Real const> const& ndarr = nd.const_array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avg_nd_to_cc(tbx, ccarr, ndarr, dcomp, scomp, ncomp);
             });
@@ -91,7 +91,7 @@ namespace amrex
                          Array4<Real const> const& eyarr = edge[1]->const_array(mfi);,
                          Array4<Real const> const& ezarr = edge[2]->const_array(mfi););
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avg_eg_to_cc(tbx, ccarr, AMREX_D_DECL(exarr,eyarr,ezarr), dcomp);
             });
@@ -133,12 +133,12 @@ namespace amrex
                          Array4<Real const> const& fzarr = fc[2]->const_array(mfi););
 #if (AMREX_SPACEDIM == 1)
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avg_fc_to_cc(tbx, ccarr, fxarr, dcomp, GeometryData());
             });
 #else
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avg_fc_to_cc(tbx, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), dcomp);
             });
@@ -168,12 +168,12 @@ namespace amrex
                          Array4<Real const> const& fzarr = fc[2]->const_array(mfi););
 #if (AMREX_SPACEDIM == 1)
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avg_fc_to_cc(tbx, ccarr, fxarr, 0, gd);
             });
 #else
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avg_fc_to_cc(tbx, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), 0);
             });
@@ -218,12 +218,12 @@ namespace amrex
             Array4<Real const> const& ccarr = cc.const_array(mfi);
 #if (AMREX_SPACEDIM == 1)
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (index_bounds, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (index_bounds, tbx,
             {
                 amrex_avg_cc_to_fc(tbx, xbx, fxarr, ccarr, gd);
             });
 #else
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (index_bounds, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (index_bounds, tbx,
             {
                 amrex_avg_cc_to_fc(tbx, AMREX_D_DECL(xbx,ybx,zbx),
                                    AMREX_D_DECL(fxarr,fyarr,fzarr), ccarr);
@@ -288,7 +288,7 @@ namespace amrex
             Array4<Real const> const& finearr = S_fine.const_array(mfi);
             Array4<Real const> const& finevolarr = fvolume.const_array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avgdown_with_vol(tbx,crsearr,finearr,finevolarr,
                                        0,scomp,ncomp,ratio);
@@ -337,7 +337,7 @@ namespace amrex
             Array4<Real> const& crsearr = crse_S_fine.array(mfi);
             Array4<Real const> const& finearr = S_fine.const_array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avgdown(tbx,crsearr,finearr,0,scomp,ncomp,ratio);
             });
@@ -375,12 +375,12 @@ namespace amrex
             Array4<Real const> const& finearr = S_fine.const_array(mfi);
             if (is_cell_centered) {
-                AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+                AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
                 {
                     amrex_avgdown(tbx,crsearr,finearr,scomp,scomp,ncomp,ratio);
                 });
             } else {
-                AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+                AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
                 {
                     amrex_avgdown_nodes(tbx,crsearr,finearr,scomp,scomp,ncomp,ratio);
                 });
@@ -406,12 +406,12 @@ namespace amrex
             // not part of the actual crse multifab which came in.
             if (is_cell_centered) {
-                AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+                AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
                 {
                     amrex_avgdown(tbx,crsearr,finearr,0,scomp,ncomp,ratio);
                 });
             } else {
-                AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+                AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
                 {
                     amrex_avgdown_nodes(tbx,crsearr,finearr,0,scomp,ncomp,ratio);
                 });
@@ -486,7 +486,7 @@ namespace amrex
             Array4<Real> const& crsearr = crse.array(mfi);
             Array4<Real const> const& finearr = fine.const_array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avgdown_faces(tbx, crsearr, finearr, 0, 0, ncomp, ratio, dir);
             });
@@ -568,7 +568,7 @@ namespace amrex
             Array4<Real> const& crsearr = crse.array(mfi);
             Array4<Real const> const& finearr = fine.const_array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 amrex_avgdown_edges(tbx, crsearr, finearr, 0, 0, ncomp, ratio, dir);
             });
@@ -640,7 +640,7 @@ namespace amrex
             if (interpolate)
             {
-                AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( tile_box, thread_box,
+                AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( tile_box, thread_box,
                 {
                     amrex_fill_slice_interp(thread_box, slice_arr, full_arr,
                                             0, start_comp, ncomp,
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp
index e641313c7a5..652baa1bf92 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp
@@ -410,13 +410,13 @@ MLABecLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& i
                      const auto& bzfab = bzcoef.array(mfi););
         if (m_overset_mask[amrlev][mglev]) {
             const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 mlabeclap_adotx_os(tbx, yfab, xfab, afab, AMREX_D_DECL(bxfab,byfab,bzfab),
                                    osm, dxinv, ascalar, bscalar, ncomp);
             });
         } else {
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
             {
                 mlabeclap_adotx(tbx, yfab, xfab, afab, AMREX_D_DECL(bxfab,byfab,bzfab),
                                 dxinv, ascalar, bscalar, ncomp);
@@ -454,7 +454,7 @@ MLABecLaplacian::normalize (int amrlev, int mglev, MultiFab& mf) const
                      const auto& byfab = bycoef.array(mfi);,
                      const auto& bzfab = bzcoef.array(mfi););
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
         {
             mlabeclap_normalize(tbx, fab, afab, AMREX_D_DECL(bxfab,byfab,bzfab),
                                 dxinv, ascalar, bscalar, ncomp);
@@ -609,7 +609,7 @@ MLABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab&
 #else
         if (m_overset_mask[amrlev][mglev]) {
             const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi);
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( tbx, thread_box,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( tbx, thread_box,
             {
                 abec_gsrb_os(thread_box, solnfab, rhsfab, alpha, afab,
                              AMREX_D_DECL(dhx, dhy, dhz),
@@ -621,7 +621,7 @@ MLABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab&
                              osm, vbx, redblack, nc);
             });
         } else if (regular_coarsening) {
-            AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( tbx, thread_box,
+            AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( tbx, thread_box,
             {
                 abec_gsrb(thread_box, solnfab, rhsfab, alpha, afab,
                           AMREX_D_DECL(dhx, dhy, dhz),
@@ -688,7 +688,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar,
         Real fac = bscalar*dxinv[0];
         Box blo = amrex::bdryLo(box, 0);
         int blen = box.length(0);
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tbox,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tbox,
         {
             mlabeclap_flux_xface(tbox, fxarr, solarr, bx, fac, blen, ncomp);
         });
@@ -696,7 +696,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar,
         fac = bscalar*dxinv[1];
         blo = amrex::bdryLo(box, 1);
         blen = box.length(1);
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tbox,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tbox,
         {
             mlabeclap_flux_yface(tbox, fyarr, solarr, by, fac, blen, ncomp);
         });
@@ -705,7 +705,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar,
         fac = bscalar*dxinv[2];
         blo = amrex::bdryLo(box, 2);
         blen = box.length(2);
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( blo, tbox,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tbox,
         {
             mlabeclap_flux_zface(tbox, fzarr, solarr, bz, fac, blen, ncomp);
         });
@@ -715,14 +715,14 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar,
     {
         Real fac = bscalar*dxinv[0];
         Box bflux = amrex::surroundingNodes(box, 0);
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bflux, tbox,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bflux, tbox,
         {
             mlabeclap_flux_x(tbox, fxarr, solarr, bx, fac, ncomp);
         });
 #if (AMREX_SPACEDIM >= 2)
         fac = bscalar*dxinv[1];
         bflux = amrex::surroundingNodes(box, 1);
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bflux, tbox,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bflux, tbox,
         {
             mlabeclap_flux_y(tbox, fyarr, solarr, by, fac, ncomp);
         });
@@ -730,7 +730,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar,
 #if (AMREX_SPACEDIM == 3)
         fac = bscalar*dxinv[2];
         bflux = amrex::surroundingNodes(box, 2);
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bflux, tbox,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bflux, tbox,
         {
             mlabeclap_flux_z(tbox, fzarr, solarr, bz, fac, ncomp);
         });
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp
index c854898a693..bb88308d595 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp
@@ -314,7 +314,7 @@ MLCellLinOp::interpolation (int amrlev, int fmglev, MultiFab& fine, const MultiF
         const Box& bx = mfi.tilebox();
         Array4<Real const> const& cfab = crse.const_array(mfi);
         Array4<Real> const& ffab = fine.array(mfi);
-        AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSABLE ( bx, ncomp, i, j, k, n,
+        AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n,
         {
             int ic = amrex::coarsen(i,ratio3.x);
             int jc = amrex::coarsen(j,ratio3.y);
@@ -505,7 +505,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State
                 const Real bcllo = bdlv[icomp][olo];
                 const Real bclhi = bdlv[icomp][ohi];
                 if (idim == 0) {
-                    AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (
+                    AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (
                     blo, tboxlo,
                     {
                         mllinop_apply_bc_x(0, tboxlo, blen, iofab, mlo,
                                            bctlo, bcllo, bvlo,
@@ -517,7 +517,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State
                                            imaxorder, dxi, flagbc, icomp);
                     });
                 } else if (idim == 1) {
-                    AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (
+                    AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (
                     blo, tboxlo,
                     {
                         mllinop_apply_bc_y(0, tboxlo, blen, iofab, mlo,
                                            bctlo, bcllo, bvlo,
@@ -529,7 +529,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State
                                            imaxorder, dyi, flagbc, icomp);
                     });
                 } else {
-                    AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA (
+                    AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (
                     blo, tboxlo,
                     {
                         mllinop_apply_bc_z(0, tboxlo, blen, iofab, mlo,
                                            bctlo, bcllo, bvlo,
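// Editor's note: the boundary and flux kernels above run on face boxes that
// are tiny on coarse multigrid levels, which is where fusing pays off.
// Sketch under an assumption: a Gpu::setFuseRegion(bool) toggle pairing with
// the Gpu::inFuseRegion() query used above (assumed API, not in this hunk).
bool old_flag = amrex::Gpu::setFuseRegion(true);
for (amrex::MFIter mfi(mf); mfi.isValid(); ++mfi) {
    // ... fusible launches (e.g. AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA) ...
}
amrex::Gpu::setFuseRegion(old_flag);  // restore the previous state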
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp
index aba34c49dd5..c556f22824a 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp
@@ -77,30 +77,30 @@ MLPoisson::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& in) con
         const auto& yfab = out.array(mfi);
 #if (AMREX_SPACEDIM == 3)
-        AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k,
+        AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k,
         {
             mlpoisson_adotx(i, j, k, yfab, xfab, dhx, dhy, dhz);
         });
 #elif (AMREX_SPACEDIM == 2)
         if (m_has_metric_term) {
-            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k,
             {
                 mlpoisson_adotx_m(i, j, yfab, xfab, dhx, dhy, dx, probxlo);
             });
         } else {
-            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k,
             {
                 mlpoisson_adotx(i, j, yfab, xfab, dhx, dhy);
             });
         }
 #elif (AMREX_SPACEDIM == 1)
         if (m_has_metric_term) {
-            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k,
             {
                 mlpoisson_adotx_m(i, yfab, xfab, dhx, dx, probxlo);
             });
         } else {
-            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSABLE (bx, i, j, k,
+            AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k,
             {
                 mlpoisson_adotx(i, yfab, xfab, dhx);
             });
@@ -134,12 +134,12 @@ MLPoisson::normalize (int amrlev, int mglev, MultiFab& mf) const
         const auto& fab = mf.array(mfi);
 #if (AMREX_SPACEDIM == 2)
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
         {
             mlpoisson_normalize(tbx, fab, dhx, dhy, dx, probxlo);
         });
 #else
-        AMREX_LAUNCH_HOST_DEVICE_FUSABLE_LAMBDA ( bx, tbx,
+        AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx,
         {
             mlpoisson_normalize(tbx, fab, dhx, dx, probxlo);
         });