diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index ffc0c57e372..6ba27f84b1b 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -522,7 +522,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, BL_PROFILE_INITPARAMS(); #endif machine::Initialize(); -#ifdef AMREX_USE_CUDA +#ifdef AMREX_USE_GPU Gpu::Fuser::Initialize(); #endif diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H index 4a33ee31f2a..970f63421e5 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -54,19 +54,26 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) } nwarps.push_back(ntotwarps); - std::size_t nbytes = ntags*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, tags.data(), nbytes); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t total_buf_size = offset_nwarps + sizeof_nwarps; - nbytes = (ntags+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); - constexpr int nthreads = 128; + std::memcpy(h_buffer, tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); + + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); + + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) { @@ -112,7 +119,7 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) } }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -159,8 +166,8 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) #endif Gpu::synchronize(); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } #endif @@ -221,19 +228,27 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } nwarps.push_back(ntotwarps); - std::size_t nbytes = N_locs*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, copy_tags.data(), nbytes); + const int ntags = copy_tags.size(); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t total_buf_size = offset_nwarps + sizeof_nwarps; + + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); + + std::memcpy(h_buffer, copy_tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); - nbytes = (N_locs+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), 
nbytes); + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); - constexpr int nthreads = 128; + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) { @@ -280,7 +295,7 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -328,8 +343,8 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in #endif Gpu::synchronize(); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } template @@ -353,23 +368,31 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } nwarps.push_back(ntotwarps); - std::size_t nbytes = N_locs*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, copy_tags.data(), nbytes); + const int ntags = copy_tags.size(); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t offset_masks = Arena::align(offset_nwarps+sizeof_nwarps); + std::size_t sizeof_masks = masks.size()*sizeof(Array4); + std::size_t total_buf_size = offset_masks + sizeof_masks; - nbytes = (N_locs+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); - nbytes = masks.size()*sizeof(Array4); - auto d_masks = static_cast*>(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_masks, masks.data(), nbytes); + std::memcpy(h_buffer, copy_tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + std::memcpy(h_buffer+offset_masks, masks.data(), sizeof_masks); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); - constexpr int nthreads = 128; + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); + auto d_masks = reinterpret_cast*>(d_buffer+offset_masks); + + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept { int g_tid = item.get_global_id(0); @@ -437,7 +460,7 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in if (m) *m = 0; }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -515,9 +538,8 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in #endif Gpu::synchronize(); - The_Device_Arena()->free(d_masks); - The_Device_Arena()->free(d_nwarps); - 
The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } template ::value,int> = 0> @@ -661,16 +683,18 @@ FabArray::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp) } if (maskfabs.size() > 0) { + Gpu::FuseSafeGuard fsg(maskfabs.size() >= Gpu::getFuseNumKernelsThreshold()); for (Gpu::StreamIter sit(maskfabs.size()); sit.isValid(); ++sit) { BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(bx, + amrex::ParallelFor(Gpu::KernelInfo{}.setFusible(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; }); } + Gpu::LaunchFusedKernels(); } if (is_thread_safe) { @@ -923,7 +947,7 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n // Is the conditional ever expected false? int launches = 0; for (int send = 0; send < N_snds; ++send) { - if (send_data[send] != nullptr) { + if (send_size[send] > 0) { launches += send_cctc[send]->size(); } } @@ -940,8 +964,7 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n std::size_t(sizeof(CopyMemory)*launches) ); const int j = sit(); - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) @@ -971,9 +994,9 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n for (int send = 0; send < N_snds; ++send) { const int j = send; - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + char* dptr = send_data[j]; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1009,9 +1032,9 @@ FabArray::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int LayoutData > recv_copy_tags(boxArray(),DistributionMap()); for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1082,7 +1105,7 @@ FabArray::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int template void FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc) { @@ -1091,16 +1114,26 @@ FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int nc const int N_snds = send_data.size(); if (N_snds == 0) return; + char* pbuffer = send_data[0]; + std::size_t szbuffer = 0; +#if 0 + // For linear solver test on summit, this is slower than writing to + // pinned memory directly on device. + if (not ParallelDescriptor::UseGpuAwareMpi()) { + // Memory in send_data is pinned. 
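// Illustrative sketch, not part of the patch: the fab_to_fab hunks above replace
// separate The_Device_Arena() allocations plus blocking Gpu::htod_memcpy calls with
// one pinned host buffer and one device buffer, packed side by side and moved with a
// single Gpu::htod_memcpy_async on the current stream.  The helper name
// stage_tags_and_nwarps and the TagType parameter are hypothetical; the Arena and
// Gpu calls are the ones the diff itself uses.
#include <AMReX_Arena.H>
#include <AMReX_Gpu.H>
#include <AMReX_Vector.H>
#include <cstring>
#include <utility>

template <typename TagType>
std::pair<char*,char*>   // {pinned host buffer, device buffer}
stage_tags_and_nwarps (amrex::Vector<TagType> const& tags, amrex::Vector<int> const& nwarps)
{
    using namespace amrex;
    std::size_t sizeof_tags   = tags.size()*sizeof(TagType);
    std::size_t offset_nwarps = Arena::align(sizeof_tags);          // keep the int block aligned
    std::size_t total         = offset_nwarps + nwarps.size()*sizeof(int);

    char* h_buffer = (char*) The_Pinned_Arena()->alloc(total);      // page-locked staging copy
    char* d_buffer = (char*) The_Arena()->alloc(total);             // device-visible copy

    std::memcpy(h_buffer,               tags.data(),   sizeof_tags);
    std::memcpy(h_buffer+offset_nwarps, nwarps.data(), nwarps.size()*sizeof(int));
    Gpu::htod_memcpy_async(d_buffer, h_buffer, total);              // one async H2D transfer

    // Kernels then read reinterpret_cast<TagType*>(d_buffer) and
    // reinterpret_cast<int*>(d_buffer+offset_nwarps); after Gpu::synchronize() both
    // buffers are released with The_Pinned_Arena()->free / The_Arena()->free.
    return {h_buffer, d_buffer};
}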
+ szbuffer = (send_data[N_snds-1]-send_data[0]) + send_size[N_snds-1]; + pbuffer = (char*)The_Arena()->alloc(szbuffer); + } +#endif + typedef Array4CopyTag TagType; Vector snd_copy_tags; - // FIX HIP HERE -- Dim3 - Dim3 zero; - zero.x = 0; zero.y = 0; zero.z = 0; for (int j = 0; j < N_snds; ++j) { - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + std::size_t offset = send_data[j]-send_data[0]; + char* dptr = pbuffer + offset; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1108,16 +1141,24 @@ FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int nc amrex::makeArray4((value_type*)(dptr), tag.sbox, ncomp), src.array(tag.srcIndex), tag.sbox, - zero + Dim3{0,0,0} }); dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type)); } - BL_ASSERT(dptr <= send_data[j] + send_size[j]); + BL_ASSERT(dptr <= pbuffer + offset + send_size[j]); } } detail::fab_to_fab(snd_copy_tags, scomp, 0, ncomp, detail::CellStore()); + + // There is Gpu::synchronize in fab_to_fab. + + if (pbuffer != send_data[0]) { + Gpu::copyAsync(Gpu::deviceToHost,pbuffer,pbuffer+szbuffer,send_data[0]); + Gpu::synchronize(); + The_Arena()->free(pbuffer); + } } template @@ -1133,6 +1174,20 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, const int N_rcvs = recv_cctc.size(); if (N_rcvs == 0) return; + char* pbuffer = recv_data[0]; + std::size_t szbuffer = 0; +#if 0 + // For linear solver test on summit, this is slower than writing to + // pinned memory directly on device. + if (not ParallelDescriptor::UseGpuAwareMpi()) { + // Memory in recv_data is pinned. + szbuffer = (recv_data[N_rcvs-1]-recv_data[0]) + recv_size[N_rcvs-1]; + pbuffer = (char*)The_Arena()->alloc(szbuffer); + Gpu::copyAsync(Gpu::hostToDevice,recv_data[0],recv_data[0]+szbuffer,pbuffer); + Gpu::synchronize(); + } +#endif + typedef Array4CopyTag TagType; Vector recv_copy_tags; @@ -1149,9 +1204,10 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + std::size_t offset = recv_data[k]-recv_data[0]; + const char* dptr = pbuffer + offset; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1171,21 +1227,23 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, masks.push_back(maskfabs[li].array()); } } - BL_ASSERT(dptr <= recv_data[k] + recv_size[k]); + BL_ASSERT(dptr <= pbuffer + offset + recv_size[k]); } } if (maskfabs.size() > 0) { + Gpu::FuseSafeGuard fsg(maskfabs.size() >= Gpu::getFuseNumKernelsThreshold()); for (Gpu::StreamIter sit(maskfabs.size()); sit.isValid(); ++sit) { BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(bx, + amrex::ParallelFor(Gpu::KernelInfo().setFusible(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; }); } + Gpu::LaunchFusedKernels(); } if (op == FabArrayBase::COPY) @@ -1206,6 +1264,12 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, detail::fab_to_fab_atomic_add(recv_copy_tags, 0, dcomp, ncomp, masks); } } + + // There is Gpu::synchronize in fab_to_fab. 
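// Minimal sketch, not part of the patch, of the bounce-buffer idea that the "#if 0"
// blocks in pack_send_buffer_gpu / unpack_recv_buffer_gpu above would enable when
// GPU-aware MPI is unavailable: pack into device memory, then stream the result back
// into the pinned MPI buffer with Gpu::copyAsync.  It is left disabled because, per
// the comment in the diff, the linear-solver test on Summit was faster when the
// kernels wrote straight to pinned memory.  Buffer names and sizes below are made up
// for illustration; the Arena and Gpu calls are the ones used in the diff.
#include <AMReX_Arena.H>
#include <AMReX_Gpu.H>

void copy_back_to_pinned_example (std::size_t nbytes)
{
    using namespace amrex;
    char* pinned = (char*) The_Pinned_Arena()->alloc(nbytes);   // what MPI will read
    char* dev    = (char*) The_Arena()->alloc(nbytes);          // scratch packing buffer

    // ... device kernels would pack the messages into "dev" here ...

    Gpu::copyAsync(Gpu::deviceToHost, dev, dev+nbytes, pinned); // D2H on the current stream
    Gpu::synchronize();                                         // pinned is now safe to hand to MPI
    The_Arena()->free(dev);
    The_Pinned_Arena()->free(pinned);
}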
+ + if (pbuffer != recv_data[0]) { + The_Arena()->free(pbuffer); + } } #endif /* AMREX_USE_GPU */ @@ -1213,7 +1277,7 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, template void FabArray::pack_send_buffer_cpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc) { @@ -1227,9 +1291,9 @@ FabArray::pack_send_buffer_cpu (FabArray const& src, int scomp, int nc #endif for (int j = 0; j < N_snds; ++j) { - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + char* dptr = send_data[j]; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1268,9 +1332,9 @@ FabArray::unpack_recv_buffer_cpu (FabArray& dst, int dcomp, int ncomp, #endif for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1296,9 +1360,9 @@ FabArray::unpack_recv_buffer_cpu (FabArray& dst, int dcomp, int ncomp, recv_copy_tags.define(dst.boxArray(),dst.DistributionMap()); for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 71e60674686..db0b2073d2f 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -718,7 +718,7 @@ public: #endif static void pack_send_buffer_gpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc); @@ -731,7 +731,7 @@ public: #endif static void pack_send_buffer_cpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc); @@ -1582,7 +1582,7 @@ FabArray::setVal (value_type val, { const Box& bx = fai.growntilebox(nghost); auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1625,7 +1625,7 @@ FabArray::setVal (value_type val, if (b.ok()) { auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( b, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( b, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1655,7 +1655,7 @@ FabArray::abs (int comp, int ncomp, const IntVect& nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = amrex::Math::abs(fab(i,j,k,n+comp)); }); @@ -1674,7 +1674,7 @@ FabArray::plus (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; }); @@ -1694,7 +1694,7 @@ FabArray::plus (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; 
}); @@ -1714,7 +1714,7 @@ FabArray::mult (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1734,7 +1734,7 @@ FabArray::mult (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1754,7 +1754,7 @@ FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1774,7 +1774,7 @@ FabArray::invert (value_type numerator, const Box& region, int comp, int nu const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1970,7 +1970,7 @@ FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, Box const& fbx = mfi.growntilebox(); Box const& gbx = fbx & domain; Box const& vbx = mfi.validbox(); - AMREX_HOST_DEVICE_FOR_4D(fbx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(fbx, ncomp, i, j, k, n, { IntVect iv(AMREX_D_DECL(i,j,k)); if (vbx.contains(iv)) { diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp index 77429049df3..50e00ad2c9c 100644 --- a/Src/Base/AMReX_FabArrayBase.cpp +++ b/Src/Base/AMReX_FabArrayBase.cpp @@ -118,7 +118,7 @@ FabArrayBase::Initialize () #ifdef AMREX_USE_GPU if (ParallelDescriptor::UseGpuAwareMpi()) { - the_fa_arena = The_Device_Arena(); + the_fa_arena = The_Arena(); } else { the_fa_arena = The_Pinned_Arena(); } diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H index 46ecef31170..a0f05f93d24 100644 --- a/Src/Base/AMReX_FabArrayCommI.H +++ b/Src/Base/AMReX_FabArrayCommI.H @@ -143,9 +143,7 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, { the_send_data = static_cast(amrex::The_FA_Arena()->alloc(total_volume)); for (int i = 0, N = send_size.size(); i < N; ++i) { - if (send_size[i] > 0) { - send_data[i] = the_send_data + offset[i]; - } + send_data[i] = the_send_data + offset[i]; } } else { the_send_data = nullptr; @@ -495,9 +493,7 @@ FabArray::ParallelCopy (const FabArray& src, { the_send_data = static_cast(amrex::The_FA_Arena()->alloc(total_volume)); for (int i = 0, N = send_size.size(); i < N; ++i) { - if (send_size[i] > 0) { - send_data[i] = the_send_data + offset[i]; - } + send_data[i] = the_send_data + offset[i]; } } @@ -749,9 +745,9 @@ FabArray::PostRcvs (const MapOfCopyComTagContainers& m_RcvTags, for (int i = 0; i < nrecv; ++i) { + recv_data[i] = the_recv_data + offset[i]; if (recv_size[i] > 0) { - recv_data[i] = the_recv_data + offset[i]; const int rank = ParallelContext::global_to_local_rank(recv_from[i]); const int comm_data_type = ParallelDescriptor::select_comm_data_type(recv_size[i]); if (comm_data_type == 1) { diff --git 
a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H index c31fd19b0c1..869fd172d89 100644 --- a/Src/Base/AMReX_FabArrayUtility.H +++ b/Src/Base/AMReX_FabArrayUtility.H @@ -153,11 +153,13 @@ ReduceSum_device (FabArray const& fa1, FabArray const& fa2, using value_type = typename FAB1::value_type; value_type sm = 0; + BL_PROFILE("ReduceSum_device"); + { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); using ReduceTuple = typename decltype(reduce_data)::Type; - + Gpu::FuseReductionSafeGuard rsg(true); for (MFIter mfi(fa1); mfi.isValid(); ++mfi) { const Box& bx = amrex::grow(mfi.validbox(),nghost); @@ -1467,7 +1469,7 @@ Add (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, int { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) += srcFab(i,j,k,n+srccomp); }); @@ -1499,7 +1501,7 @@ Copy (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, in { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,dstcomp+n) = srcFab(i,j,k,srccomp+n); }); @@ -1531,7 +1533,7 @@ Subtract (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) -= srcFab(i,j,k,n+srccomp); }); @@ -1563,7 +1565,7 @@ Multiply (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) *= srcFab(i,j,k,n+srccomp); }); @@ -1595,7 +1597,7 @@ Divide (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) /= srcFab(i,j,k,n+srccomp); }); @@ -1625,7 +1627,7 @@ Abs (FabArray& fa, int icomp, int numcomp, const IntVect& nghost) if (bx.ok()) { auto const& fab = fa.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { fab(i,j,k,n+icomp) = amrex::Math::abs(fab(i,j,k,n+icomp)); }); @@ -1682,7 +1684,7 @@ OverrideSync (FabArray & fa, FabArray const& msk, const Periodicity& const Box& bx = mfi.tilebox(); auto fab = fa.array(mfi); auto const ifab = msk.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { if (!ifab(i,j,k)) fab(i,j,k,n) = 0; }); diff --git a/Src/Base/AMReX_Gpu.H b/Src/Base/AMReX_Gpu.H index a0fd747a17a..5c59d89305b 100644 --- a/Src/Base/AMReX_Gpu.H +++ b/Src/Base/AMReX_Gpu.H @@ -9,6 +9,7 @@ namespace amrex { namespace Cuda {} } #endif #include +#include #include #include #include diff --git a/Src/Base/AMReX_GpuFuse.H b/Src/Base/AMReX_GpuFuse.H index dc3555cb39d..99467f9a956 100644 --- a/Src/Base/AMReX_GpuFuse.H +++ b/Src/Base/AMReX_GpuFuse.H @@ -13,6 +13,8 @@ namespace amrex { namespace Gpu { 
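// Sketch, not part of the patch, of the opt-in pattern the FabArray hunks above
// follow: a Gpu::FuseSafeGuard opens a fuse region only when there are enough small
// kernels to be worth batching, each launch is tagged fusible through
// Gpu::KernelInfo, and Gpu::LaunchFusedKernels() flushes whatever was registered.
// zero_out_multifab is a hypothetical helper (the diff itself iterates with
// Gpu::StreamIter); the Gpu calls and the ParallelFor overload are the ones this
// diff adds.
#include <AMReX_MultiFab.H>
#include <AMReX_Gpu.H>

void zero_out_multifab (amrex::MultiFab& mf, amrex::IntVect const& nghost)
{
    using namespace amrex;
    Gpu::FuseSafeGuard fsg(mf.local_size() >= Gpu::getFuseNumKernelsThreshold());
    for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
        Box const& bx = mfi.growntilebox(nghost);
        Array4<Real> const& a = mf.array(mfi);
        ParallelFor(Gpu::KernelInfo{}.setFusible(true), bx,
        [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
        {
            a(i,j,k) = Real(0.0);  // small per-box kernel: registered, not launched, inside a fuse region
        });
    }
    Gpu::LaunchFusedKernels();     // one batched launch for everything registered above
}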
+#ifdef AMREX_USE_GPU + #ifdef AMREX_USE_CUDA typedef void (*Lambda1DLauncher)(char*,int); @@ -229,23 +231,6 @@ private: } }; -Long getFuseSizeThreshold (); -Long setFuseSizeThreshold (Long new_threshold); -int getFuseNumKernelsThreshold (); -int setFuseNumKernelsThreshold (int new_threshold); -bool inFuseRegion (); -bool setFuseRegion (bool flag); - -struct FuseSafeGuard -{ - explicit FuseSafeGuard (bool flag) noexcept - : m_old(setFuseRegion(flag)) {} - ~FuseSafeGuard () { setFuseRegion(m_old); } -private: - bool m_old; -}; - - template void Register (Box const& bx, F&& f) @@ -273,6 +258,56 @@ LaunchFusedKernels () Fuser::getInstance().Launch(); } +#else + +class Fuser +{ +public: + static Fuser& getInstance (); + static void Initialize (); + static void Finalize (); +private: + static std::unique_ptr m_instance; +}; + +inline void LaunchFusedKernels () {} + +#endif + +Long getFuseSizeThreshold (); +Long setFuseSizeThreshold (Long new_threshold); +int getFuseNumKernelsThreshold (); +int setFuseNumKernelsThreshold (int new_threshold); +bool inFuseRegion (); +bool setFuseRegion (bool flag); +bool inFuseReductionRegion (); +bool setFuseReductionRegion (bool flag); + +struct FuseSafeGuard +{ + explicit FuseSafeGuard (bool flag) noexcept + : m_old(setFuseRegion(flag)) {} + ~FuseSafeGuard () { setFuseRegion(m_old); } +private: + bool m_old; +}; + +struct FuseReductionSafeGuard +{ + explicit FuseReductionSafeGuard (bool flag) noexcept + : m_old(setFuseReductionRegion(flag)) {} + ~FuseReductionSafeGuard () { setFuseReductionRegion(m_old); } +private: + bool m_old; +}; + +#else + +struct FuseSafeGuard +{ + explicit FuseSafeGuard (bool) {} +}; + #endif }} diff --git a/Src/Base/AMReX_GpuFuse.cpp b/Src/Base/AMReX_GpuFuse.cpp index 26f942aa963..e3a4fe812b1 100644 --- a/Src/Base/AMReX_GpuFuse.cpp +++ b/Src/Base/AMReX_GpuFuse.cpp @@ -6,10 +6,11 @@ namespace amrex { namespace Gpu { -#ifdef AMREX_USE_CUDA +#ifdef AMREX_USE_GPU namespace { bool s_in_fuse_region = false; + bool s_in_fuse_reduction_region = false; // Fusing kernels with elements greater than this are not recommended based tests on v100 Long s_fuse_size_threshold = 257*257; // If the number of kernels is less than this, fusing is not recommended based on tests on v100 @@ -18,6 +19,8 @@ namespace { std::unique_ptr Fuser::m_instance = nullptr; +#ifdef AMREX_USE_CUDA + Fuser::Fuser () { AMREX_ASSERT(!OpenMP::in_parallel()); @@ -86,12 +89,32 @@ void Fuser::Launch () constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; + bool is_reduction = s_in_fuse_reduction_region; + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; int g_wid = g_tid / Gpu::Device::warp_size; - if (g_wid >= ntotwarps) return; + if (g_wid >= ntotwarps) { + if (is_reduction) { + // for reduction, the assumption is all lambdas have function signature + FuseHelper& helper = d_lambda_helper[0]; + char* lambda_object = d_lambda_object + helper.m_offset; + if (helper.m_bx.isEmpty()) { + helper.m_fp.L1D(lambda_object,-1); + } else { + if (helper.m_N == 0) { + helper.m_fp.L3D(lambda_object, INT_MIN, INT_MIN, INT_MIN); + } else { + for (int n = 0; n < helper.m_N; ++n) { + helper.m_fp.L4D(lambda_object, INT_MIN, INT_MIN, INT_MIN, -1); + } + } + } + } + return; + } int ilambda; { @@ -120,6 +143,8 @@ void Fuser::Launch () if (bx.isEmpty()) { if (icell < helper.m_N) { helper.m_fp.L1D(lambda_object,icell); + } else if 
(is_reduction) { + helper.m_fp.L1D(lambda_object,-1); } } else { int ncells = bx.numPts(); @@ -139,6 +164,14 @@ void Fuser::Launch () helper.m_fp.L4D(lambda_object,i,j,k,n); } } + } else if (is_reduction) { + if (helper.m_N == 0) { + helper.m_fp.L3D(lambda_object, INT_MIN, INT_MIN, INT_MIN); + } else { + for (int n = 0; n < helper.m_N; ++n) { + helper.m_fp.L4D(lambda_object, INT_MIN, INT_MIN, INT_MIN, -1); + } + } } } }); @@ -181,6 +214,8 @@ Fuser::resize_helper_buf () m_helper_buf = p; } +#endif + Fuser& Fuser::getInstance () { @@ -218,8 +253,7 @@ setFuseSizeThreshold (Long new_threshold) int getFuseNumKernelsThreshold () { return s_fuse_numkernels_threshold; } -int -setFuseNumKernelsThreshold (int new_threshold) +int setFuseNumKernelsThreshold (int new_threshold) { int old = s_fuse_numkernels_threshold; s_fuse_numkernels_threshold = new_threshold; @@ -228,14 +262,22 @@ setFuseNumKernelsThreshold (int new_threshold) bool inFuseRegion () { return s_in_fuse_region; } -bool -setFuseRegion (bool flag) +bool setFuseRegion (bool flag) { bool old = s_in_fuse_region; s_in_fuse_region = flag; return old; } +bool inFuseReductionRegion () { return s_in_fuse_reduction_region; } + +bool setFuseReductionRegion (bool flag) +{ + bool old = s_in_fuse_reduction_region; + s_in_fuse_reduction_region = flag; + return old; +} + #endif }} diff --git a/Src/Base/AMReX_GpuKernelInfo.H b/Src/Base/AMReX_GpuKernelInfo.H new file mode 100644 index 00000000000..94e125e5a0c --- /dev/null +++ b/Src/Base/AMReX_GpuKernelInfo.H @@ -0,0 +1,18 @@ +#ifndef AMREX_GPU_KERNEL_INFO_H_ +#define AMREX_GPU_KERNEL_INFO_H_ + +namespace amrex { +namespace Gpu { + +class KernelInfo +{ +public: + KernelInfo& setFusible (bool flag) { fusible = flag; return *this; } + bool isFusible () const { return fusible; } +private: + bool fusible = false; +}; + +}} + +#endif diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index f79901a2403..d6c35972427 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -2,6 +2,7 @@ #define AMREX_GPU_LAUNCH_H_ #include +#include #include #include #include @@ -9,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -187,6 +189,20 @@ namespace Gpu { AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) +#ifdef AMREX_USE_CUDA +#define AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_3, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_2, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) +#else +#define AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA(__VA_ARGS__) +#endif #if (AMREX_SPACEDIM == 1) #define AMREX_LAUNCH_DEVICE_LAMBDA_DIM(a1,a2,a3,b1,b2,b3,c1,c2,c3) AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE (a1,a2,a2) @@ -215,6 +231,22 @@ namespace Gpu { #define AMREX_HOST_DEVICE_PARALLEL_FOR_3D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__) #define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__) +#ifdef AMREX_USE_CUDA +#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE(...) 
AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(__VA_ARGS__) +#else +#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSIBLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_1D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D(__VA_ARGS__) +#endif + #ifdef AMREX_USE_GPU #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 3b26b799841..584ad71d810 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -10,7 +10,7 @@ void launch (T const& n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } template ::value> > -void For (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (T n, L&& f) noexcept { for (T i = 0; i < n; ++i) { f(i); @@ -18,7 +18,13 @@ void For (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } template ::value> > -void ParallelFor (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + For(n, std::forward(f)); +} + +template ::value> > +void ParallelFor (T n, L&& f) noexcept { AMREX_PRAGMA_SIMD for (T i = 0; i < n; ++i) { @@ -26,8 +32,14 @@ void ParallelFor (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } } +template ::value> > +void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + ParallelFor(n, std::forward(f)); +} + template -void For (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -39,7 +51,13 @@ void For (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } template -void ParallelFor (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + For(box, std::forward(f)); +} + +template +void ParallelFor (Box const& box, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -51,8 +69,14 @@ void ParallelFor (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noe }}} } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + ParallelFor(box, std::forward(f)); +} + template ::value> > -void For (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box, T ncomp, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -66,7 +90,13 @@ void For (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes*/=0) no } template ::value> > -void ParallelFor (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + For(box, ncomp, std::forward(f)); +} + +template 
::value> > +void ParallelFor (Box const& box, T ncomp, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -80,90 +110,161 @@ void ParallelFor (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes } } +template ::value> > +void ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(box, ncomp, std::forward(f)); +} + template -void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { For(box1, std::forward(f1)); For(box2, std::forward(f2)); } +template +void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + For (box1, box2, std::forward(f1), std::forward(f2)); +} + template -void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { For(box1, std::forward(f1)); For(box2, std::forward(f2)); For(box3, std::forward(f3)); } +template +void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +{ + For(box1, box2, box3, std::forward(f1), std::forward(f2), std::forward(f3)); +} + template ::value>, typename M2=amrex::EnableIf_t::value> > void For (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box2, T2 ncomp2, L2&& f2) noexcept { For(box1, ncomp1, std::forward(f1)); For(box2, ncomp2, std::forward(f2)); } +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > void For (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box3, T3 ncomp3, L3&& f3) noexcept { For(box1, ncomp1, std::forward(f1)); For(box2, ncomp2, std::forward(f2)); For(box3, ncomp3, std::forward(f3)); } +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + For(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template -void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept +void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(box1, std::forward(f1)); ParallelFor(box2, std::forward(f2)); } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(box1,box2,f1,f2); +} + template -void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept +void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(box1, std::forward(f1)); ParallelFor(box2, std::forward(f2)); 
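// Sketch, not part of the patch, of the forwarding idiom the CPU-side overloads
// above follow: the new leading Gpu::KernelInfo parameter lets call sites be written
// once for both CPU and GPU builds; without a GPU the info is simply dropped and the
// plain nested loops run.  set_box_to is a hypothetical user helper.
#include <AMReX_Array4.H>
#include <AMReX_Box.H>
#include <AMReX_Gpu.H>

void set_box_to (amrex::Gpu::KernelInfo const& info, amrex::Box const& bx,
                 amrex::Array4<amrex::Real> const& a, amrex::Real value)
{
    // In a CPU build this resolves to the ParallelFor(Gpu::KernelInfo const&, Box, L)
    // shim added above, which forwards to the serial triple loop; in a CUDA build the
    // same call may be registered with the fuser or launched as a regular kernel.
    amrex::ParallelFor(info, bx,
    [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
    {
        a(i,j,k) = value;
    });
}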
ParallelFor(box3, std::forward(f3)); } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + template ::value>, typename M2=amrex::EnableIf_t::value> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(box1, ncomp1, std::forward(f1)); ParallelFor(box2, ncomp2, std::forward(f2)); } +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void ParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2)); +} + template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(box1, ncomp1, std::forward(f1)); ParallelFor(box2, ncomp2, std::forward(f2)); ParallelFor(box3, ncomp3, std::forward(f3)); } +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void ParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(box1, ncomp1, std::forward(f1), + box2, ncomp2, std::forward(f2), + box3, ncomp3, std::forward(f3)); +} + template ::value> > void FabReduce (Box const& box, N ncomp, T const& init_val, @@ -315,6 +416,118 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } +template ::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + ParallelFor(n,std::forward(f)); +} + +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + ParallelFor(box,std::forward(f)); +} + +template ::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(box,ncomp,std::forward(f)); +} + +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value> > +void HostDeviceFor 
(Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + For(n,std::forward(f)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + For(box,std::forward(f)); +} + +template ::value> > +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + For(box,ncomp,std::forward(f)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + For(box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + For(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + For(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + } #endif diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 920570be12f..295d3320380 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -94,7 +94,7 @@ void launch (T const& n, L&& f) noexcept } template ::value> > -void ParallelFor (T n, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); @@ -122,7 +122,7 @@ void ParallelFor (T n, L&& f) noexcept } template -void ParallelFor (Box const& box, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -159,7 +159,7 @@ void ParallelFor (Box const& box, L&& f) noexcept } template ::value> > -void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -198,7 +198,7 @@ void ParallelFor (Box const& box, T ncomp, L&& f) noexcept } template -void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size ParallelFor(box1, std::forward(f1)); @@ -255,7 +255,9 @@ void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept } template -void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size ParallelFor(box1, std::forward(f1)); @@ -327,7 +329,8 @@ void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2 template ::value>, typename M2=amrex::EnableIf_t::value> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +void ParallelFor 
(Gpu::KernelInfo const& /*info*/, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size @@ -392,7 +395,8 @@ template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +void ParallelFor (Gpu::KernelInfo const& /*info*/, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { @@ -628,209 +632,265 @@ void launch (T const& n, L&& f) noexcept template ::value> > amrex::EnableIf_t::value> -ParallelFor (T n, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; - const auto ec = Gpu::ExecutionConfig(n); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - i < n; i += stride) { - f(i); - } - }); - AMREX_GPU_ERROR_CHECK(); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && n <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(n, f); + } else +#endif + { + amrex::ignore_unused(info); + const auto ec = Gpu::ExecutionConfig(n); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + i < n; i += stride) { + f(i); + } + }); + AMREX_GPU_ERROR_CHECK(); + } } template amrex::EnableIf_t::value> -ParallelFor (Box const& box, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - i += lo.x; - j += lo.y; - k += lo.z; - f(i,j,k); - } - }); - AMREX_GPU_ERROR_CHECK(); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box, f); + } else +#endif + { + amrex::ignore_unused(info); + const auto lo = amrex::lbound(box); + const auto len = amrex::length(box); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + f(i,j,k); + } + }); + AMREX_GPU_ERROR_CHECK(); + } } template ::value> > amrex::EnableIf_t::value> -ParallelFor (Box const& box, T ncomp, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, 
Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - i += lo.x; - j += lo.y; - k += lo.z; - for (T n = 0; n < ncomp; ++n) { - f(i,j,k,n); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box, ncomp, f); + } else +#endif + { + amrex::ignore_unused(info); + const auto lo = amrex::lbound(box); + const auto len = amrex::length(box); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + for (T n = 0; n < ncomp; ++n) { + f(i,j,k,n); + } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); + } } template amrex::EnableIf_t::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) and amrex::isEmpty(box2)) return; int ncells1 = box1.numPts(); int ncells2 = box2.numPts(); int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - f1(i,j,k); - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - f2(i,j,k); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, f1); + Gpu::Register(box2, f2); + } else +#endif + { + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + f1(i,j,k); + } + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i 
+= lo2.x; + j += lo2.y; + k += lo2.z; + f2(i,j,k); + } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); + } } template amrex::EnableIf_t::value and MaybeDeviceRunnable::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) and amrex::isEmpty(box2) and amrex::isEmpty(box3)) return; int ncells1 = box1.numPts(); int ncells2 = box2.numPts(); int ncells3 = box3.numPts(); int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - f1(i,j,k); - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - f2(i,j,k); - } - if (icell < ncells3) { - int k = icell / (len3.x*len3.y); - int j = (icell - k*(len3.x*len3.y)) / len3.x; - int i = (icell - k*(len3.x*len3.y)) - j*len3.x; - i += lo3.x; - j += lo3.y; - k += lo3.z; - f3(i,j,k); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, f1); + Gpu::Register(box2, f2); + Gpu::Register(box3, f3); + } else +#endif + { + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto lo3 = amrex::lbound(box3); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto len3 = amrex::length(box3); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + f1(i,j,k); + } + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + f2(i,j,k); + } + if (icell < ncells3) { + int k = icell / (len3.x*len3.y); + int j = (icell - k*(len3.x*len3.y)) / len3.x; + int i = (icell - k*(len3.x*len3.y)) - j*len3.x; + i += lo3.x; + j += lo3.y; + k += lo3.z; + f3(i,j,k); + } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); + } } template ::value>, typename M2=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +ParallelFor (Gpu::KernelInfo 
const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) and amrex::isEmpty(box2)) return; int ncells1 = box1.numPts(); int ncells2 = box2.numPts(); int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - for (T1 n = 0; n < ncomp1; ++n) { - f1(i,j,k,n); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, ncomp1, f1); + Gpu::Register(box2, ncomp2, f2); + } else +#endif + { + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + for (T1 n = 0; n < ncomp1; ++n) { + f1(i,j,k,n); + } } - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - for (T2 n = 0; n < ncomp2; ++n) { - f2(i,j,k,n); + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + for (T2 n = 0; n < ncomp2; ++n) { + f2(i,j,k,n); + } } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); + } } template ::value>, typename M3=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeDeviceRunnable::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { @@ -847,53 +908,63 @@ ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, int ncells2 = box2.numPts(); int ncells3 = box3.numPts(); int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell 
< ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - for (T1 n = 0; n < ncomp1; ++n) { - f1(i,j,k,n); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, ncomp1, f1); + Gpu::Register(box2, ncomp2, f2); + Gpu::Register(box3, ncomp3, f3); + } else +#endif + { + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto lo3 = amrex::lbound(box3); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto len3 = amrex::length(box3); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + for (T1 n = 0; n < ncomp1; ++n) { + f1(i,j,k,n); + } } - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - for (T2 n = 0; n < ncomp2; ++n) { - f2(i,j,k,n); + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + for (T2 n = 0; n < ncomp2; ++n) { + f2(i,j,k,n); + } } - } - if (icell < ncells3) { - int k = icell / (len3.x*len3.y); - int j = (icell - k*(len3.x*len3.y)) / len3.x; - int i = (icell - k*(len3.x*len3.y)) - j*len3.x; - i += lo3.x; - j += lo3.y; - k += lo3.z; - for (T3 n = 0; n < ncomp3; ++n) { - f3(i,j,k,n); + if (icell < ncells3) { + int k = icell / (len3.x*len3.y); + int j = (icell - k*(len3.x*len3.y)) / len3.x; + int i = (icell - k*(len3.x*len3.y)) - j*len3.x; + i += lo3.x; + j += lo3.y; + k += lo3.z; + for (T3 n = 0; n < ncomp3; ++n) { + f3(i,j,k,n); + } } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); + } } template @@ -983,34 +1054,147 @@ void single_task (L&& f) noexcept single_task(Gpu::gpuStream(), std::forward(f)); } +template ::value> > +void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept +{ + ParallelFor(info, n,std::forward(f)); +} + +template +void For (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +{ + ParallelFor(info, box,std::forward(f)); +} + +template ::value> > +void For (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(info,box,ncomp,std::forward(f)); +} + +template +void For (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void For (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(info,box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + 
ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(info, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value> > +void ParallelFor (T n, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); +} + +template +void ParallelFor (Box const& box, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, box, std::forward(f)); +} + +template ::value> > +void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); +} + +template +void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void ParallelFor (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template ::value> > void For (T n, L&& f) noexcept { - ParallelFor(n,std::forward(f)); + ParallelFor(Gpu::KernelInfo{}, n,std::forward(f)); } template void For (Box const& box, L&& f) noexcept { - ParallelFor(box,std::forward(f)); + ParallelFor(Gpu::KernelInfo{}, box,std::forward(f)); } template ::value> > void For (Box const& box, T ncomp, L&& f) noexcept { - ParallelFor(box,ncomp,std::forward(f)); + ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } template void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); + ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } template -void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +void For (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept { - ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } template (f1),box2,ncomp2,std::forward(f2)); + ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template (f1), + ParallelFor(Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } template ::value> > amrex::EnableIf_t::value> -HostDeviceParallelFor (T n, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(n,std::forward(f)); + 
ParallelFor(info,n,std::forward(f)); } else { AMREX_PRAGMA_SIMD for (T i = 0; i < n; ++i) f(i); } } +template ::value> > +amrex::EnableIf_t::value> +HostDeviceParallelFor (T n, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); +} + template amrex::EnableIf_t::value> -HostDeviceParallelFor (Box const& box, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box,std::forward(f)); + ParallelFor(info, box,std::forward(f)); } else { LoopConcurrentOnCpu(box,std::forward(f)); } @@ -1060,10 +1252,10 @@ HostDeviceParallelFor (Box const& box, L&& f) noexcept template ::value> > amrex::EnableIf_t::value> -HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box,ncomp,std::forward(f)); + ParallelFor(info, box,ncomp,std::forward(f)); } else { LoopConcurrentOnCpu(box,ncomp,std::forward(f)); } @@ -1071,10 +1263,11 @@ HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept template amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); + ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } else { LoopConcurrentOnCpu(box1,std::forward(f1)); LoopConcurrentOnCpu(box2,std::forward(f2)); @@ -1083,11 +1276,13 @@ HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexc template amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + ParallelFor(info,box1,box2,box3, + std::forward(f1),std::forward(f2),std::forward(f3)); } else { LoopConcurrentOnCpu(box1,std::forward(f1)); LoopConcurrentOnCpu(box2,std::forward(f2)); @@ -1099,11 +1294,12 @@ template ::value>, typename M2=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); + ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } else { LoopConcurrentOnCpu(box1,ncomp1,std::forward(f1)); LoopConcurrentOnCpu(box2,ncomp2,std::forward(f2)); @@ -1115,12 +1311,14 @@ template ::value>, typename M3=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,ncomp1,std::forward(f1), + ParallelFor(info, 
+ box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } else { @@ -1131,54 +1329,115 @@ HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value> > -void HostDeviceFor (T n, L&& f) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { - HostDeviceParallelFor(n,std::forward(f)); + HostDeviceParallelFor(info,n,std::forward(f)); } template -void HostDeviceFor (Box const& box, L&& f) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { - HostDeviceParallelFor(box,std::forward(f)); + HostDeviceParallelFor(info,box,std::forward(f)); } template ::value> > -void HostDeviceFor (Box const& box, T ncomp, L&& f) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { - HostDeviceParallelFor(box,ncomp,std::forward(f)); + HostDeviceParallelFor(info,box,ncomp,std::forward(f)); } template -void HostDeviceFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - HostDeviceParallelFor(box1,box2,std::forward(f1),std::forward(f2)); + HostDeviceParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } template -void HostDeviceFor (Box const& box1, Box const& box2, Box const& box3, +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { - HostDeviceParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + HostDeviceParallelFor(info, box1,box2,box3, + std::forward(f1),std::forward(f2),std::forward(f3)); } template ::value>, typename M2=amrex::EnableIf_t::value> > -void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { - HostDeviceParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); + HostDeviceParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > -void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { - HostDeviceParallelFor(box1,ncomp1,std::forward(f1), + HostDeviceParallelFor(info, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value> > +void HostDeviceParallelFor (T n, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},n,std::forward(f)); +} + +template +void HostDeviceParallelFor (Box const& box, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box,std::forward(f)); +} + +template ::value> > +void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); +} + +template +void HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, box1,box2,box3, + 
std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } diff --git a/Src/Base/AMReX_GpuLaunchMacrosG.H b/Src/Base/AMReX_GpuLaunchMacrosG.H index 83306b4b560..45400858abd 100644 --- a/Src/Base/AMReX_GpuLaunchMacrosG.H +++ b/Src/Base/AMReX_GpuLaunchMacrosG.H @@ -53,6 +53,34 @@ block \ } \ }}} +#ifdef AMREX_USE_CUDA +#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE(TN,TI,block) \ + { auto const& amrex_i_tn = TN; \ + if (!amrex::isEmpty(amrex_i_tn)) { \ + if (amrex::Gpu::inLaunchRegion()) \ + { \ + const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \ + if (amrex::Gpu::inFuseRegion() && amrex_i_ec.numBlocks.x*amrex_i_ec.numThreads.x <= amrex::Gpu::getFuseSizeThreshold()) { \ + amrex::Gpu::Register(amrex_i_tn, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block \ + }); \ + } else { \ + AMREX_LAUNCH_KERNEL(amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \ + [=] AMREX_GPU_DEVICE () noexcept { \ + for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \ + block \ + } \ + }); \ + AMREX_GPU_ERROR_CHECK(); \ + } \ + } \ + else { \ + for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \ + block \ + } \ + }}} +#endif #endif // two fused launches @@ -134,6 +162,51 @@ block2 \ } \ }}} +#ifdef AMREX_USE_CUDA +#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \ + { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \ + if (!amrex::isEmpty(amrex_i_tn1) or !amrex::isEmpty(amrex_i_tn2)) { \ + if (amrex::Gpu::inLaunchRegion()) \ + { \ + const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \ + const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \ + dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \ + amrex_i_ec2.numBlocks.x); \ + amrex_i_nblocks.y = 2; \ + if (amrex::Gpu::inFuseRegion() && amrex_i_nblocks.x*amrex_i_nblocks.y*amrex_i_ec1.numThreads.x <= amrex::Gpu::getFuseSizeThreshold()) { \ + amrex::Gpu::Register(amrex_i_tn1, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI1(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block1 \ + }); \ + amrex::Gpu::Register(amrex_i_tn2, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI2(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block2 \ + }); \ + } else { \ + AMREX_LAUNCH_KERNEL(amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ + [=] AMREX_GPU_DEVICE () noexcept { \ + switch (blockIdx.y) { \ + case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + break; \ + case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + } \ + }); \ + AMREX_GPU_ERROR_CHECK(); \ + } 
\ + } \ + else { \ + for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + }}} +#endif #endif // three fused launches @@ -233,6 +306,64 @@ block3 \ } \ }}} +#ifdef AMREX_USE_CUDA +#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \ + { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \ + if (!amrex::isEmpty(amrex_i_tn1) or !amrex::isEmpty(amrex_i_tn2) or !amrex::isEmpty(amrex_i_tn3)) { \ + if (amrex::Gpu::inLaunchRegion()) \ + { \ + const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \ + const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \ + const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \ + dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \ + amrex_i_ec2.numBlocks.x), \ + amrex_i_ec3.numBlocks.x); \ + amrex_i_nblocks.y = 3; \ + if (amrex::Gpu::inFuseRegion() && amrex_i_nblocks.x*amrex_i_nblocks.y*amrex_i_ec1.numThreads.x <= amrex::Gpu::getFuseSizeThreshold()) { \ + amrex::Gpu::Register(amrex_i_tn1, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI1(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block1 \ + }); \ + amrex::Gpu::Register(amrex_i_tn2, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI2(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block2 \ + }); \ + amrex::Gpu::Register(amrex_i_tn3, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI3(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block3 \ + }); \ + } else { \ + AMREX_LAUNCH_KERNEL(amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ + [=] AMREX_GPU_DEVICE () noexcept { \ + switch (blockIdx.y) { \ + case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + break; \ + case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + break; \ + case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \ + block3 \ + } \ + } \ + }); \ + AMREX_GPU_ERROR_CHECK(); \ + } \ + } \ + else { \ + for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \ + block3 \ + } \ + }}} +#endif #endif #ifdef AMREX_USE_DPCPP @@ -452,6 +583,19 @@ } \ } +#define AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(n,i,block) \ +{ \ + auto const& amrex_i_n = n; \ + using amrex_i_inttype = typename std::remove_const::type; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusible(true),amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \ + } else { \ + auto amrex_i_lambda = [=] (amrex_i_inttype i) noexcept block; \ + AMREX_PRAGMA_SIMD \ + for (amrex_i_inttype i = 0; i < amrex_i_n; ++i) amrex_i_lambda(i); \ + } \ +} + #define AMREX_GPU_DEVICE_FOR_1D(n,i,block) \ { \ using amrex_i_inttype = typename std::remove_const::type; \ @@ -470,6 +614,16 @@ } \ } +#define AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(box,i,j,k,block) \ +{ \ + auto const& amrex_i_box = box; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusible(true),amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \ + } else { \ + 
amrex::LoopConcurrentOnCpu(amrex_i_box,[=] (int i, int j, int k) noexcept block); \ + } \ +} + #define AMREX_GPU_DEVICE_FOR_3D(box,i,j,k,block) \ { \ amrex::ParallelFor(box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \ @@ -488,6 +642,17 @@ } \ } +#define AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(box,ncomp,i,j,k,n,block) \ +{ \ + auto const& amrex_i_box = box; \ + auto const& amrex_i_ncomp = ncomp; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusible(true),amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \ + } else { \ + amrex::LoopConcurrentOnCpu(amrex_i_box,amrex_i_ncomp,[=] (int i, int j, int k, int n) noexcept block); \ + } \ +} + #define AMREX_GPU_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \ { \ amrex::ParallelFor(box,ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \ diff --git a/Src/Base/AMReX_MFIter.H b/Src/Base/AMReX_MFIter.H index 3df4cc2f884..8cae1464acc 100644 --- a/Src/Base/AMReX_MFIter.H +++ b/Src/Base/AMReX_MFIter.H @@ -222,6 +222,10 @@ protected: mutable Vector > real_device_reduce_list; #endif +#ifdef AMREX_USE_GPU + std::unique_ptr gpu_fsg; +#endif + static int nextDynamicIndex; void Initialize (); diff --git a/Src/Base/AMReX_MFIter.cpp b/Src/Base/AMReX_MFIter.cpp index 3c07e07fe17..375fd3e578c 100644 --- a/Src/Base/AMReX_MFIter.cpp +++ b/Src/Base/AMReX_MFIter.cpp @@ -225,6 +225,9 @@ MFIter::~MFIter () AMREX_GPU_ERROR_CHECK(); Gpu::Device::resetStreamIndex(); Gpu::resetNumCallbacks(); + if (!OpenMP::in_parallel() && Gpu::inFuseRegion()) { + Gpu::LaunchFusedKernels(); + } #endif if (m_fa) { @@ -236,7 +239,7 @@ MFIter::~MFIter () } } -void +void MFIter::Initialize () { if (flags & SkipInit) { @@ -327,6 +330,11 @@ MFIter::Initialize () #ifdef AMREX_USE_GPU Gpu::Device::setStreamIndex((streams > 0) ? 
currentIndex%streams : -1); Gpu::resetNumCallbacks(); + if (!OpenMP::in_parallel()) { + if (index_map->size() >= Gpu::getFuseNumKernelsThreshold()) { + gpu_fsg.reset(new Gpu::FuseSafeGuard(true)); + } + } #endif typ = fabArray.boxArray().ixType(); diff --git a/Src/Base/AMReX_MultiFab.cpp b/Src/Base/AMReX_MultiFab.cpp index 08d945cdca2..2ce916e627e 100644 --- a/Src/Base/AMReX_MultiFab.cpp +++ b/Src/Base/AMReX_MultiFab.cpp @@ -43,16 +43,36 @@ MultiFab::Dot (const MultiFab& x, int xcomp, BL_PROFILE("MultiFab::Dot()"); - Real sm = amrex::ReduceSum(x, y, nghost, - [=] AMREX_GPU_HOST_DEVICE (Box const& bx, Array4 const& xfab, Array4 const& yfab) -> Real - { - Real t = 0.0; - AMREX_LOOP_4D(bx, numcomp, i, j, k, n, + Real sm = 0.0; +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + sm = amrex::ReduceSum(x, y, nghost, + [=] AMREX_GPU_DEVICE (Box const& bx, Array4 const& xfab, Array4 const& yfab) -> Real { - t += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + Real t = 0.0; + AMREX_LOOP_4D(bx, numcomp, i, j, k, n, + { + t += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + }); + return t; }); - return t; - }); + } else +#endif + { +#ifdef _OPENMP +#pragma omp parallel reduction(+:sm) +#endif + for (MFIter mfi(x,true); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.growntilebox(nghost); + Array4 const& xfab = x.const_array(mfi); + Array4 const& yfab = y.const_array(mfi); + AMREX_LOOP_4D(bx, numcomp, i, j, k, n, + { + sm += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + }); + } + } if (!local) ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub()); @@ -198,7 +218,7 @@ MultiFab::Swap (MultiFab& dst, MultiFab& src, if (bx.ok()) { auto sfab = src.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { const amrex::Real tmp = dfab(i,j,k,n+dstcomp); dfab(i,j,k,n+dstcomp) = sfab(i,j,k,n+srccomp); @@ -298,7 +318,7 @@ MultiFab::Saxpy (MultiFab& dst, Real a, const MultiFab& src, if (bx.ok()) { auto const sfab = src.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,dstcomp+n) += a * sfab(i,j,k,srccomp+n); }); @@ -332,7 +352,7 @@ MultiFab::Xpay (MultiFab& dst, Real a, const MultiFab& src, if (bx.ok()) { auto const sfab = src.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,n+dstcomp) = sfab(i,j,k,n+srccomp) + a * dfab(i,j,k,n+dstcomp); }); @@ -374,7 +394,7 @@ MultiFab::LinComb (MultiFab& dst, auto const xfab = x.array(mfi); auto const yfab = y.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,dstcomp+n) = a*xfab(i,j,k,xcomp+n) + b*yfab(i,j,k,ycomp+n); }); @@ -415,7 +435,7 @@ MultiFab::AddProduct (MultiFab& dst, auto const s1fab = src1.array(mfi); auto const s2fab = src2.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,n+dstcomp) += s1fab(i,j,k,n+comp1) * s2fab(i,j,k,n+comp2); }); diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp index d312cb676cf..fb436d6dcaf 100644 --- a/Src/Base/AMReX_MultiFabUtil.cpp +++ 
b/Src/Base/AMReX_MultiFabUtil.cpp @@ -67,7 +67,7 @@ namespace amrex Array4 const& ccarr = cc.array(mfi); Array4 const& ndarr = nd.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avg_nd_to_cc(tbx, ccarr, ndarr, dcomp, scomp, ncomp); }); @@ -91,7 +91,7 @@ namespace amrex Array4 const& eyarr = edge[1]->const_array(mfi);, Array4 const& ezarr = edge[2]->const_array(mfi);); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avg_eg_to_cc(tbx, ccarr, AMREX_D_DECL(exarr,eyarr,ezarr), dcomp); }); @@ -133,12 +133,12 @@ namespace amrex Array4 const& fzarr = fc[2]->const_array(mfi);); #if (AMREX_SPACEDIM == 1) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, fxarr, dcomp, GeometryData()); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), dcomp); }); @@ -168,12 +168,12 @@ namespace amrex Array4 const& fzarr = fc[2]->const_array(mfi);); #if (AMREX_SPACEDIM == 1) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, fxarr, 0, gd); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), 0); }); @@ -218,12 +218,12 @@ namespace amrex Array4 const& ccarr = cc.const_array(mfi); #if (AMREX_SPACEDIM == 1) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (index_bounds, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (index_bounds, tbx, { amrex_avg_cc_to_fc(tbx, xbx, fxarr, ccarr, gd); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (index_bounds, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (index_bounds, tbx, { amrex_avg_cc_to_fc(tbx, AMREX_D_DECL(xbx,ybx,zbx), AMREX_D_DECL(fxarr,fyarr,fzarr), ccarr); @@ -288,7 +288,7 @@ namespace amrex Array4 const& finearr = S_fine.const_array(mfi); Array4 const& finevolarr = fvolume.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown_with_vol(tbx,crsearr,finearr,finevolarr, 0,scomp,ncomp,ratio); @@ -337,7 +337,7 @@ namespace amrex Array4 const& crsearr = crse_S_fine.array(mfi); Array4 const& finearr = S_fine.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown(tbx,crsearr,finearr,0,scomp,ncomp,ratio); }); @@ -375,12 +375,12 @@ namespace amrex Array4 const& finearr = S_fine.const_array(mfi); if (is_cell_centered) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown(tbx,crsearr,finearr,scomp,scomp,ncomp,ratio); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown_nodes(tbx,crsearr,finearr,scomp,scomp,ncomp,ratio); }); @@ -406,12 +406,12 @@ namespace amrex // not part of the actual crse multifab which came in. 
if (is_cell_centered) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown(tbx,crsearr,finearr,0,scomp,ncomp,ratio); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown_nodes(tbx,crsearr,finearr,0,scomp,ncomp,ratio); }); @@ -486,7 +486,7 @@ namespace amrex Array4 const& crsearr = crse.array(mfi); Array4 const& finearr = fine.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown_faces(tbx, crsearr, finearr, 0, 0, ncomp, ratio, dir); }); @@ -568,7 +568,7 @@ namespace amrex Array4 const& crsearr = crse.array(mfi); Array4 const& finearr = fine.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown_edges(tbx, crsearr, finearr, 0, 0, ncomp, ratio, dir); }); @@ -640,7 +640,7 @@ namespace amrex if (interpolate) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tile_box, thread_box, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( tile_box, thread_box, { amrex_fill_slice_interp(thread_box, slice_arr, full_arr, 0, start_comp, ncomp, diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index beaeac8a218..aaef79e9253 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -309,6 +309,22 @@ public: Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(*dp,r, gh); }); #else +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && Gpu::inFuseReductionRegion() + && ec.numBlocks.x*ec.numThreads.x <= Gpu::getFuseSizeThreshold()) + { + Gpu::Register(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + ReduceTuple r = *(dp+1); + if (box.contains(IntVect(AMREX_D_DECL(i,j,k)))) { + auto pr = Reduce::detail::call_f(f,i,j,k,ixtype); + Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr); + } + Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(*dp,r); + }); + } else +#endif + { amrex::launch(ec.numBlocks.x, ec.numThreads.x, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { ReduceTuple r = *(dp+1); @@ -325,6 +341,7 @@ public: } Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(*dp,r); }); + } #endif } diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index e434e1c5a3f..7f253882494 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -173,6 +173,7 @@ target_sources( amrex # GPU -------------------------------------------------------------------- AMReX_Gpu.H AMReX_GpuQualifiers.H + AMReX_GpuKernelInfo.H AMReX_GpuPrint.H AMReX_GpuAssert.H AMReX_GpuTypes.H diff --git a/Src/Base/Make.package b/Src/Base/Make.package index c97b264374d..edea7af44e0 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -66,6 +66,7 @@ C$(AMREX_BASE)_headers += AMReX_parstream.H C$(AMREX_BASE)_sources += AMReX_parstream.cpp C$(AMREX_BASE)_headers += AMReX_Gpu.H AMReX_GpuQualifiers.H AMReX_GpuPrint.H AMReX_GpuAssert.H AMReX_GpuTypes.H AMReX_GpuError.H +C$(AMREX_BASE)_headers += AMReX_GpuKernelInfo.H C$(AMREX_BASE)_headers += AMReX_GpuLaunchMacrosG.H AMReX_GpuLaunchFunctsG.H C$(AMREX_BASE)_headers += AMReX_GpuLaunchMacrosC.H AMReX_GpuLaunchFunctsC.H diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp index d1cbedfedea..652baa1bf92 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp @@ -410,13 +410,13 @@ MLABecLaplacian::Fapply (int amrlev, int mglev, 
MultiFab& out, const MultiFab& i const auto& bzfab = bzcoef.array(mfi);); if (m_overset_mask[amrlev][mglev]) { const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { mlabeclap_adotx_os(tbx, yfab, xfab, afab, AMREX_D_DECL(bxfab,byfab,bzfab), osm, dxinv, ascalar, bscalar, ncomp); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { mlabeclap_adotx(tbx, yfab, xfab, afab, AMREX_D_DECL(bxfab,byfab,bzfab), dxinv, ascalar, bscalar, ncomp); @@ -454,7 +454,7 @@ MLABecLaplacian::normalize (int amrlev, int mglev, MultiFab& mf) const const auto& byfab = bycoef.array(mfi);, const auto& bzfab = bzcoef.array(mfi);); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { mlabeclap_normalize(tbx, fab, afab, AMREX_D_DECL(bxfab,byfab,bzfab), dxinv, ascalar, bscalar, ncomp); @@ -609,7 +609,7 @@ MLABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& #else if (m_overset_mask[amrlev][mglev]) { const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, thread_box, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( tbx, thread_box, { abec_gsrb_os(thread_box, solnfab, rhsfab, alpha, afab, AMREX_D_DECL(dhx, dhy, dhz), @@ -621,7 +621,7 @@ MLABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& osm, vbx, redblack, nc); }); } else if (regular_coarsening) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, thread_box, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( tbx, thread_box, { abec_gsrb(thread_box, solnfab, rhsfab, alpha, afab, AMREX_D_DECL(dhx, dhy, dhz), @@ -688,7 +688,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, Real fac = bscalar*dxinv[0]; Box blo = amrex::bdryLo(box, 0); int blen = box.length(0); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( blo, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tbox, { mlabeclap_flux_xface(tbox, fxarr, solarr, bx, fac, blen, ncomp); }); @@ -696,7 +696,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, fac = bscalar*dxinv[1]; blo = amrex::bdryLo(box, 1); blen = box.length(1); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( blo, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tbox, { mlabeclap_flux_yface(tbox, fyarr, solarr, by, fac, blen, ncomp); }); @@ -705,7 +705,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, fac = bscalar*dxinv[2]; blo = amrex::bdryLo(box, 2); blen = box.length(2); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( blo, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tbox, { mlabeclap_flux_zface(tbox, fzarr, solarr, bz, fac, blen, ncomp); }); @@ -715,14 +715,14 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, { Real fac = bscalar*dxinv[0]; Box bflux = amrex::surroundingNodes(box, 0); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bflux, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bflux, tbox, { mlabeclap_flux_x(tbox, fxarr, solarr, bx, fac, ncomp); }); #if (AMREX_SPACEDIM >= 2) fac = bscalar*dxinv[1]; bflux = amrex::surroundingNodes(box, 1); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bflux, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bflux, tbox, { mlabeclap_flux_y(tbox, fyarr, solarr, by, fac, ncomp); }); @@ -730,7 +730,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, #if (AMREX_SPACEDIM == 3) fac = bscalar*dxinv[2]; bflux = amrex::surroundingNodes(box, 
2); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bflux, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bflux, tbox, { mlabeclap_flux_z(tbox, fzarr, solarr, bz, fac, ncomp); }); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp index aadc9ee0106..bb88308d595 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp @@ -314,7 +314,7 @@ MLCellLinOp::interpolation (int amrlev, int fmglev, MultiFab& fine, const MultiF const Box& bx = mfi.tilebox(); Array4 const& cfab = crse.const_array(mfi); Array4 const& ffab = fine.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { int ic = amrex::coarsen(i,ratio3.x); int jc = amrex::coarsen(j,ratio3.y); @@ -505,7 +505,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State const Real bcllo = bdlv[icomp][olo]; const Real bclhi = bdlv[icomp][ohi]; if (idim == 0) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tboxlo, { mllinop_apply_bc_x(0, tboxlo, blen, iofab, mlo, bctlo, bcllo, bvlo, @@ -517,7 +517,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State imaxorder, dxi, flagbc, icomp); }); } else if (idim == 1) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tboxlo, { mllinop_apply_bc_y(0, tboxlo, blen, iofab, mlo, bctlo, bcllo, bvlo, @@ -529,7 +529,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State imaxorder, dyi, flagbc, icomp); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tboxlo, { mllinop_apply_bc_z(0, tboxlo, blen, iofab, mlo, bctlo, bcllo, bvlo, @@ -619,6 +619,7 @@ MLCellLinOp::reflux (int crse_amrlev, AMREX_D_TERM(Elixir elifx = flux[0].elixir();, Elixir elify = flux[1].elixir();, Elixir elifz = flux[2].elixir();); + Gpu::FuseSafeGuard fsg(false); // Turn off fusing in FFlux FFlux(crse_amrlev, mfi, pflux, crse_sol[mfi], Location::FaceCentroid); fluxreg.CrseAdd(mfi, cpflux, crse_dx, dt, RunOn::Gpu); } @@ -640,6 +641,7 @@ MLCellLinOp::reflux (int crse_amrlev, AMREX_D_TERM(Elixir elifx = flux[0].elixir();, Elixir elify = flux[1].elixir();, Elixir elifz = flux[2].elixir();); + Gpu::FuseSafeGuard fsg(false); // Turn off fusing in FFlux FFlux(fine_amrlev, mfi, pflux, fine_sol[mfi], Location::FaceCentroid, face_only); fluxreg.FineAdd(mfi, cpflux, fine_dx, dt, RunOn::Gpu); } @@ -678,6 +680,7 @@ MLCellLinOp::compFlux (int amrlev, const Array& fluxes AMREX_D_TERM(Elixir elifx = flux[0].elixir();, Elixir elify = flux[1].elixir();, Elixir elifz = flux[2].elixir();); + Gpu::FuseSafeGuard fsg(false); // Turn off fusing in FFlux FFlux(amrlev, mfi, pflux, sol[mfi], loc); for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { const Box& nbx = mfi.nodaltilebox(idim); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp index 5ad48f59131..c556f22824a 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp @@ -77,30 +77,30 @@ MLPoisson::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& in) con const auto& yfab = out.array(mfi); #if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k, { mlpoisson_adotx(i, j, k, yfab, xfab, dhx, dhy, dhz); }); #elif (AMREX_SPACEDIM == 2) if 
(m_has_metric_term) { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k, { mlpoisson_adotx_m(i, j, yfab, xfab, dhx, dhy, dx, probxlo); }); } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k, { mlpoisson_adotx(i, j, yfab, xfab, dhx, dhy); }); } #elif (AMREX_SPACEDIM == 1) if (m_has_metric_term) { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k, { mlpoisson_adotx_m(i, yfab, xfab, dhx, dx, probxlo); }); } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k, { mlpoisson_adotx(i, yfab, xfab, dhx); }); @@ -134,12 +134,12 @@ MLPoisson::normalize (int amrlev, int mglev, MultiFab& mf) const const auto& fab = mf.array(mfi); #if (AMREX_SPACEDIM == 2) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { mlpoisson_normalize(tbx, fab, dhx, dhy, dx, probxlo); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { mlpoisson_normalize(tbx, fab, dhx, dx, probxlo); });
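The hunks above add the fusing machinery piecemeal (Gpu::KernelInfo::setFusible, the *_FUSIBLE launch macros, Gpu::FuseSafeGuard, and the fused-kernel flush in MFIter's destructor) without showing how a caller strings them together. Below is a minimal usage sketch, not part of this patch: it assumes a GPU build in which the fusing code paths are compiled in (the hunks guard them with AMREX_USE_CUDA), and it uses only the interfaces visible above (FuseSafeGuard(bool), Gpu::KernelInfo().setFusible(true), Gpu::getFuseNumKernelsThreshold(), Gpu::LaunchFusedKernels()).

// Sketch only: fuse the small per-tile saxpy kernels of one MFIter sweep
// into a single launch, using the interfaces introduced by this patch.
#include <AMReX.H>
#include <AMReX_MultiFab.H>
#include <AMReX_Gpu.H>

void saxpy_fused (amrex::MultiFab& dst, amrex::Real a,
                  amrex::MultiFab const& src, int ncomp)
{
    using namespace amrex;
    // Opt this scope into the fuse region.  MFIter::Initialize above already
    // does this automatically once the box count reaches
    // Gpu::getFuseNumKernelsThreshold(); the explicit guard is illustrative.
    Gpu::FuseSafeGuard fsg(true);
    for (MFIter mfi(dst, true); mfi.isValid(); ++mfi) {
        const Box& bx = mfi.tilebox();
        auto const sfab = src.const_array(mfi);
        auto       dfab = dst.array(mfi);
        // A fusible launch: inside a fuse region, kernels below the size
        // threshold are registered instead of launched immediately;
        // otherwise this behaves like the plain ParallelFor overload.
        ParallelFor(Gpu::KernelInfo().setFusible(true), bx, ncomp,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
        {
            dfab(i,j,k,n) += a * sfab(i,j,k,n);
        });
    }
    // ~MFIter flushes the registered kernels with Gpu::LaunchFusedKernels()
    // at the end of the sweep (when not inside an OpenMP parallel region).
}

This is essentially what MultiFab::Saxpy does above via AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE. Where fusing would be counterproductive, for example in MLCellLinOp::reflux and compFlux where the FFlux results are consumed immediately, the patch instead inserts Gpu::FuseSafeGuard fsg(false) to disable fusing for that scope.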