diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index ffc0c57e372..6ba27f84b1b 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -522,7 +522,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, BL_PROFILE_INITPARAMS(); #endif machine::Initialize(); -#ifdef AMREX_USE_CUDA +#ifdef AMREX_USE_GPU Gpu::Fuser::Initialize(); #endif diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H index 4a33ee31f2a..970f63421e5 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -54,19 +54,26 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) } nwarps.push_back(ntotwarps); - std::size_t nbytes = ntags*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, tags.data(), nbytes); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t total_buf_size = offset_nwarps + sizeof_nwarps; - nbytes = (ntags+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); - constexpr int nthreads = 128; + std::memcpy(h_buffer, tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); + + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); + + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) { @@ -112,7 +119,7 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) } }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -159,8 +166,8 @@ ParallelFor (Vector > const& tags, int ncomp, F && f) #endif Gpu::synchronize(); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } #endif @@ -221,19 +228,27 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } nwarps.push_back(ntotwarps); - std::size_t nbytes = N_locs*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, copy_tags.data(), nbytes); + const int ntags = copy_tags.size(); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t total_buf_size = offset_nwarps + sizeof_nwarps; + + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); + + std::memcpy(h_buffer, copy_tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); - nbytes = (N_locs+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), 
nbytes); + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); - constexpr int nthreads = 128; + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) { @@ -280,7 +295,7 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -328,8 +343,8 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in #endif Gpu::synchronize(); - The_Device_Arena()->free(d_nwarps); - The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } template @@ -353,23 +368,31 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in } nwarps.push_back(ntotwarps); - std::size_t nbytes = N_locs*sizeof(TagType); - auto d_tags = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_tags, copy_tags.data(), nbytes); + const int ntags = copy_tags.size(); + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t offset_masks = Arena::align(offset_nwarps+sizeof_nwarps); + std::size_t sizeof_masks = masks.size()*sizeof(Array4); + std::size_t total_buf_size = offset_masks + sizeof_masks; - nbytes = (N_locs+1)*sizeof(int); - auto d_nwarps = static_cast(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_nwarps, nwarps.data(), nbytes); + char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); - nbytes = masks.size()*sizeof(Array4); - auto d_masks = static_cast*>(The_Device_Arena()->alloc(nbytes)); - Gpu::htod_memcpy(d_masks, masks.data(), nbytes); + std::memcpy(h_buffer, copy_tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + std::memcpy(h_buffer+offset_masks, masks.data(), sizeof_masks); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); - constexpr int nthreads = 128; + auto d_tags = reinterpret_cast(d_buffer); + auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); + auto d_masks = reinterpret_cast*>(d_buffer+offset_masks); + + constexpr int nthreads = 256; constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; #ifdef AMREX_USE_DPCPP - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept { int g_tid = item.get_global_id(0); @@ -437,7 +460,7 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in if (m) *m = 0; }); #else - amrex::launch(nblocks, nthreads, Gpu::nullStream(), + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; @@ -515,9 +538,8 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, in #endif Gpu::synchronize(); - The_Device_Arena()->free(d_masks); - The_Device_Arena()->free(d_nwarps); - 
The_Device_Arena()->free(d_tags); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); } template ::value,int> = 0> @@ -661,16 +683,18 @@ FabArray::FB_local_copy_gpu (const FB& TheFB, int scomp, int ncomp) } if (maskfabs.size() > 0) { + Gpu::FuseSafeGuard fsg(maskfabs.size() >= Gpu::getFuseNumKernelsThreshold()); for (Gpu::StreamIter sit(maskfabs.size()); sit.isValid(); ++sit) { BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(bx, + amrex::ParallelFor(Gpu::KernelInfo{}.setFusible(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; }); } + Gpu::LaunchFusedKernels(); } if (is_thread_safe) { @@ -923,7 +947,7 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n // Is the conditional ever expected false? int launches = 0; for (int send = 0; send < N_snds; ++send) { - if (send_data[send] != nullptr) { + if (send_size[send] > 0) { launches += send_cctc[send]->size(); } } @@ -940,8 +964,7 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n std::size_t(sizeof(CopyMemory)*launches) ); const int j = sit(); - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) @@ -971,9 +994,9 @@ FabArray::FB_pack_send_buffer_cuda_graph (const FB& TheFB, int scomp, int n for (int send = 0; send < N_snds; ++send) { const int j = send; - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + char* dptr = send_data[j]; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1009,9 +1032,9 @@ FabArray::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int LayoutData > recv_copy_tags(boxArray(),DistributionMap()); for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1082,7 +1105,7 @@ FabArray::FB_unpack_recv_buffer_cuda_graph (const FB& TheFB, int dcomp, int template void FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc) { @@ -1091,16 +1114,26 @@ FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int nc const int N_snds = send_data.size(); if (N_snds == 0) return; + char* pbuffer = send_data[0]; + std::size_t szbuffer = 0; +#if 0 + // For linear solver test on summit, this is slower than writing to + // pinned memory directly on device. + if (not ParallelDescriptor::UseGpuAwareMpi()) { + // Memory in send_data is pinned. 
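// Illustrative sketch, not part of the patch: the fab_to_fab hunks above replace
// separate The_Device_Arena() allocations plus blocking Gpu::htod_memcpy calls with
// one pinned host buffer and one device buffer, packed side by side and moved with a
// single Gpu::htod_memcpy_async on the current stream.  The helper name
// stage_tags_and_nwarps and the TagType parameter are hypothetical; the Arena and
// Gpu calls are the ones the diff itself uses.
#include <AMReX_Arena.H>
#include <AMReX_Gpu.H>
#include <AMReX_Vector.H>
#include <cstring>
#include <utility>

template <typename TagType>
std::pair<char*,char*>   // {pinned host buffer, device buffer}
stage_tags_and_nwarps (amrex::Vector<TagType> const& tags, amrex::Vector<int> const& nwarps)
{
    using namespace amrex;
    std::size_t sizeof_tags   = tags.size()*sizeof(TagType);
    std::size_t offset_nwarps = Arena::align(sizeof_tags);          // keep the int block aligned
    std::size_t total         = offset_nwarps + nwarps.size()*sizeof(int);

    char* h_buffer = (char*) The_Pinned_Arena()->alloc(total);      // page-locked staging copy
    char* d_buffer = (char*) The_Arena()->alloc(total);             // device-visible copy

    std::memcpy(h_buffer,               tags.data(),   sizeof_tags);
    std::memcpy(h_buffer+offset_nwarps, nwarps.data(), nwarps.size()*sizeof(int));
    Gpu::htod_memcpy_async(d_buffer, h_buffer, total);              // one async H2D transfer

    // Kernels then read reinterpret_cast<TagType*>(d_buffer) and
    // reinterpret_cast<int*>(d_buffer+offset_nwarps); after Gpu::synchronize() both
    // buffers are released with The_Pinned_Arena()->free / The_Arena()->free.
    return {h_buffer, d_buffer};
}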
+ szbuffer = (send_data[N_snds-1]-send_data[0]) + send_size[N_snds-1]; + pbuffer = (char*)The_Arena()->alloc(szbuffer); + } +#endif + typedef Array4CopyTag TagType; Vector snd_copy_tags; - // FIX HIP HERE -- Dim3 - Dim3 zero; - zero.x = 0; zero.y = 0; zero.z = 0; for (int j = 0; j < N_snds; ++j) { - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + std::size_t offset = send_data[j]-send_data[0]; + char* dptr = pbuffer + offset; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1108,16 +1141,24 @@ FabArray::pack_send_buffer_gpu (FabArray const& src, int scomp, int nc amrex::makeArray4((value_type*)(dptr), tag.sbox, ncomp), src.array(tag.srcIndex), tag.sbox, - zero + Dim3{0,0,0} }); dptr += (tag.sbox.numPts() * ncomp * sizeof(value_type)); } - BL_ASSERT(dptr <= send_data[j] + send_size[j]); + BL_ASSERT(dptr <= pbuffer + offset + send_size[j]); } } detail::fab_to_fab(snd_copy_tags, scomp, 0, ncomp, detail::CellStore()); + + // There is Gpu::synchronize in fab_to_fab. + + if (pbuffer != send_data[0]) { + Gpu::copyAsync(Gpu::deviceToHost,pbuffer,pbuffer+szbuffer,send_data[0]); + Gpu::synchronize(); + The_Arena()->free(pbuffer); + } } template @@ -1133,6 +1174,20 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, const int N_rcvs = recv_cctc.size(); if (N_rcvs == 0) return; + char* pbuffer = recv_data[0]; + std::size_t szbuffer = 0; +#if 0 + // For linear solver test on summit, this is slower than writing to + // pinned memory directly on device. + if (not ParallelDescriptor::UseGpuAwareMpi()) { + // Memory in recv_data is pinned. + szbuffer = (recv_data[N_rcvs-1]-recv_data[0]) + recv_size[N_rcvs-1]; + pbuffer = (char*)The_Arena()->alloc(szbuffer); + Gpu::copyAsync(Gpu::hostToDevice,recv_data[0],recv_data[0]+szbuffer,pbuffer); + Gpu::synchronize(); + } +#endif + typedef Array4CopyTag TagType; Vector recv_copy_tags; @@ -1149,9 +1204,10 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + std::size_t offset = recv_data[k]-recv_data[0]; + const char* dptr = pbuffer + offset; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1171,21 +1227,23 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, masks.push_back(maskfabs[li].array()); } } - BL_ASSERT(dptr <= recv_data[k] + recv_size[k]); + BL_ASSERT(dptr <= pbuffer + offset + recv_size[k]); } } if (maskfabs.size() > 0) { + Gpu::FuseSafeGuard fsg(maskfabs.size() >= Gpu::getFuseNumKernelsThreshold()); for (Gpu::StreamIter sit(maskfabs.size()); sit.isValid(); ++sit) { BaseFab& mskfab = maskfabs[sit()]; const Array4& msk = mskfab.array(); const Box& bx = mskfab.box(); - amrex::ParallelFor(bx, + amrex::ParallelFor(Gpu::KernelInfo().setFusible(true), bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { msk(i,j,k) = 0; }); } + Gpu::LaunchFusedKernels(); } if (op == FabArrayBase::COPY) @@ -1206,6 +1264,12 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, detail::fab_to_fab_atomic_add(recv_copy_tags, 0, dcomp, ncomp, masks); } } + + // There is Gpu::synchronize in fab_to_fab. 
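// Minimal sketch, not part of the patch, of the bounce-buffer idea that the "#if 0"
// blocks in pack_send_buffer_gpu / unpack_recv_buffer_gpu above would enable when
// GPU-aware MPI is unavailable: pack into device memory, then stream the result back
// into the pinned MPI buffer with Gpu::copyAsync.  It is left disabled because, per
// the comment in the diff, the linear-solver test on Summit was faster when the
// kernels wrote straight to pinned memory.  Buffer names and sizes below are made up
// for illustration; the Arena and Gpu calls are the ones used in the diff.
#include <AMReX_Arena.H>
#include <AMReX_Gpu.H>

void copy_back_to_pinned_example (std::size_t nbytes)
{
    using namespace amrex;
    char* pinned = (char*) The_Pinned_Arena()->alloc(nbytes);   // what MPI will read
    char* dev    = (char*) The_Arena()->alloc(nbytes);          // scratch packing buffer

    // ... device kernels would pack the messages into "dev" here ...

    Gpu::copyAsync(Gpu::deviceToHost, dev, dev+nbytes, pinned); // D2H on the current stream
    Gpu::synchronize();                                         // pinned is now safe to hand to MPI
    The_Arena()->free(dev);
    The_Pinned_Arena()->free(pinned);
}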
+ + if (pbuffer != recv_data[0]) { + The_Arena()->free(pbuffer); + } } #endif /* AMREX_USE_GPU */ @@ -1213,7 +1277,7 @@ FabArray::unpack_recv_buffer_gpu (FabArray& dst, int dcomp, int ncomp, template void FabArray::pack_send_buffer_cpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc) { @@ -1227,9 +1291,9 @@ FabArray::pack_send_buffer_cpu (FabArray const& src, int scomp, int nc #endif for (int j = 0; j < N_snds; ++j) { - char* dptr = send_data[j]; - if (dptr != nullptr) + if (send_size[j] > 0) { + char* dptr = send_data[j]; auto const& cctc = *send_cctc[j]; for (auto const& tag : cctc) { @@ -1268,9 +1332,9 @@ FabArray::unpack_recv_buffer_cpu (FabArray& dst, int dcomp, int ncomp, #endif for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { @@ -1296,9 +1360,9 @@ FabArray::unpack_recv_buffer_cpu (FabArray& dst, int dcomp, int ncomp, recv_copy_tags.define(dst.boxArray(),dst.DistributionMap()); for (int k = 0; k < N_rcvs; ++k) { - const char* dptr = recv_data[k]; - if (dptr != nullptr) + if (recv_size[k] > 0) { + const char* dptr = recv_data[k]; auto const& cctc = *recv_cctc[k]; for (auto const& tag : cctc) { diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 71e60674686..db0b2073d2f 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -718,7 +718,7 @@ public: #endif static void pack_send_buffer_gpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc); @@ -731,7 +731,7 @@ public: #endif static void pack_send_buffer_cpu (FabArray const& src, int scomp, int ncomp, - Vector& send_data, + Vector const& send_data, Vector const& send_size, Vector const& send_cctc); @@ -1582,7 +1582,7 @@ FabArray::setVal (value_type val, { const Box& bx = fai.growntilebox(nghost); auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1625,7 +1625,7 @@ FabArray::setVal (value_type val, if (b.ok()) { auto fab = this->array(fai); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( b, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( b, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = val; }); @@ -1655,7 +1655,7 @@ FabArray::abs (int comp, int ncomp, const IntVect& nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { fab(i,j,k,n+comp) = amrex::Math::abs(fab(i,j,k,n+comp)); }); @@ -1674,7 +1674,7 @@ FabArray::plus (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; }); @@ -1694,7 +1694,7 @@ FabArray::plus (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) += val; 
}); @@ -1714,7 +1714,7 @@ FabArray::mult (value_type val, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1734,7 +1734,7 @@ FabArray::mult (value_type val, const Box& region, int comp, int num_comp, const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) *= val; }); @@ -1754,7 +1754,7 @@ FabArray::invert (value_type numerator, int comp, int num_comp, int nghost) { const Box& bx = mfi.growntilebox(nghost); auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1774,7 +1774,7 @@ FabArray::invert (value_type numerator, const Box& region, int comp, int nu const Box& bx = mfi.growntilebox(nghost) & region; if (bx.ok()) { auto fab = this->array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, num_comp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, num_comp, i, j, k, n, { fab(i,j,k,n+comp) = numerator / fab(i,j,k,n+comp); }); @@ -1970,7 +1970,7 @@ FabArray::BuildMask (const Box& phys_domain, const Periodicity& period, Box const& fbx = mfi.growntilebox(); Box const& gbx = fbx & domain; Box const& vbx = mfi.validbox(); - AMREX_HOST_DEVICE_FOR_4D(fbx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(fbx, ncomp, i, j, k, n, { IntVect iv(AMREX_D_DECL(i,j,k)); if (vbx.contains(iv)) { diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp index 77429049df3..50e00ad2c9c 100644 --- a/Src/Base/AMReX_FabArrayBase.cpp +++ b/Src/Base/AMReX_FabArrayBase.cpp @@ -118,7 +118,7 @@ FabArrayBase::Initialize () #ifdef AMREX_USE_GPU if (ParallelDescriptor::UseGpuAwareMpi()) { - the_fa_arena = The_Device_Arena(); + the_fa_arena = The_Arena(); } else { the_fa_arena = The_Pinned_Arena(); } diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H index 46ecef31170..a0f05f93d24 100644 --- a/Src/Base/AMReX_FabArrayCommI.H +++ b/Src/Base/AMReX_FabArrayCommI.H @@ -143,9 +143,7 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, { the_send_data = static_cast(amrex::The_FA_Arena()->alloc(total_volume)); for (int i = 0, N = send_size.size(); i < N; ++i) { - if (send_size[i] > 0) { - send_data[i] = the_send_data + offset[i]; - } + send_data[i] = the_send_data + offset[i]; } } else { the_send_data = nullptr; @@ -495,9 +493,7 @@ FabArray::ParallelCopy (const FabArray& src, { the_send_data = static_cast(amrex::The_FA_Arena()->alloc(total_volume)); for (int i = 0, N = send_size.size(); i < N; ++i) { - if (send_size[i] > 0) { - send_data[i] = the_send_data + offset[i]; - } + send_data[i] = the_send_data + offset[i]; } } @@ -749,9 +745,9 @@ FabArray::PostRcvs (const MapOfCopyComTagContainers& m_RcvTags, for (int i = 0; i < nrecv; ++i) { + recv_data[i] = the_recv_data + offset[i]; if (recv_size[i] > 0) { - recv_data[i] = the_recv_data + offset[i]; const int rank = ParallelContext::global_to_local_rank(recv_from[i]); const int comm_data_type = ParallelDescriptor::select_comm_data_type(recv_size[i]); if (comm_data_type == 1) { diff --git 
a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H index c31fd19b0c1..869fd172d89 100644 --- a/Src/Base/AMReX_FabArrayUtility.H +++ b/Src/Base/AMReX_FabArrayUtility.H @@ -153,11 +153,13 @@ ReduceSum_device (FabArray const& fa1, FabArray const& fa2, using value_type = typename FAB1::value_type; value_type sm = 0; + BL_PROFILE("ReduceSum_device"); + { ReduceOps reduce_op; ReduceData reduce_data(reduce_op); using ReduceTuple = typename decltype(reduce_data)::Type; - + Gpu::FuseReductionSafeGuard rsg(true); for (MFIter mfi(fa1); mfi.isValid(); ++mfi) { const Box& bx = amrex::grow(mfi.validbox(),nghost); @@ -1467,7 +1469,7 @@ Add (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, int { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) += srcFab(i,j,k,n+srccomp); }); @@ -1499,7 +1501,7 @@ Copy (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, in { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,dstcomp+n) = srcFab(i,j,k,srccomp+n); }); @@ -1531,7 +1533,7 @@ Subtract (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) -= srcFab(i,j,k,n+srccomp); }); @@ -1563,7 +1565,7 @@ Multiply (FabArray& dst, FabArray const& src, int srccomp, int dstcomp { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) *= srcFab(i,j,k,n+srccomp); }); @@ -1595,7 +1597,7 @@ Divide (FabArray& dst, FabArray const& src, int srccomp, int dstcomp, { auto const srcFab = src.array(mfi); auto dstFab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dstFab(i,j,k,n+dstcomp) /= srcFab(i,j,k,n+srccomp); }); @@ -1625,7 +1627,7 @@ Abs (FabArray& fa, int icomp, int numcomp, const IntVect& nghost) if (bx.ok()) { auto const& fab = fa.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { fab(i,j,k,n+icomp) = amrex::Math::abs(fab(i,j,k,n+icomp)); }); @@ -1682,7 +1684,7 @@ OverrideSync (FabArray & fa, FabArray const& msk, const Periodicity& const Box& bx = mfi.tilebox(); auto fab = fa.array(mfi); auto const ifab = msk.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { if (!ifab(i,j,k)) fab(i,j,k,n) = 0; }); diff --git a/Src/Base/AMReX_Gpu.H b/Src/Base/AMReX_Gpu.H index a0fd747a17a..5c59d89305b 100644 --- a/Src/Base/AMReX_Gpu.H +++ b/Src/Base/AMReX_Gpu.H @@ -9,6 +9,7 @@ namespace amrex { namespace Cuda {} } #endif #include +#include #include #include #include diff --git a/Src/Base/AMReX_GpuFuse.H b/Src/Base/AMReX_GpuFuse.H index dc3555cb39d..99467f9a956 100644 --- a/Src/Base/AMReX_GpuFuse.H +++ b/Src/Base/AMReX_GpuFuse.H @@ -13,6 +13,8 @@ namespace amrex { namespace Gpu { 
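// Sketch, not part of the patch, of the opt-in pattern the FabArray hunks above
// follow: a Gpu::FuseSafeGuard opens a fuse region only when there are enough small
// kernels to be worth batching, each launch is tagged fusible through
// Gpu::KernelInfo, and Gpu::LaunchFusedKernels() flushes whatever was registered.
// zero_out_multifab is a hypothetical helper (the diff itself iterates with
// Gpu::StreamIter); the Gpu calls and the ParallelFor overload are the ones this
// diff adds.
#include <AMReX_MultiFab.H>
#include <AMReX_Gpu.H>

void zero_out_multifab (amrex::MultiFab& mf, amrex::IntVect const& nghost)
{
    using namespace amrex;
    Gpu::FuseSafeGuard fsg(mf.local_size() >= Gpu::getFuseNumKernelsThreshold());
    for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
        Box const& bx = mfi.growntilebox(nghost);
        Array4<Real> const& a = mf.array(mfi);
        ParallelFor(Gpu::KernelInfo{}.setFusible(true), bx,
        [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
        {
            a(i,j,k) = Real(0.0);  // small per-box kernel: registered, not launched, inside a fuse region
        });
    }
    Gpu::LaunchFusedKernels();     // one batched launch for everything registered above
}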
+#ifdef AMREX_USE_GPU + #ifdef AMREX_USE_CUDA typedef void (*Lambda1DLauncher)(char*,int); @@ -229,23 +231,6 @@ private: } }; -Long getFuseSizeThreshold (); -Long setFuseSizeThreshold (Long new_threshold); -int getFuseNumKernelsThreshold (); -int setFuseNumKernelsThreshold (int new_threshold); -bool inFuseRegion (); -bool setFuseRegion (bool flag); - -struct FuseSafeGuard -{ - explicit FuseSafeGuard (bool flag) noexcept - : m_old(setFuseRegion(flag)) {} - ~FuseSafeGuard () { setFuseRegion(m_old); } -private: - bool m_old; -}; - - template void Register (Box const& bx, F&& f) @@ -273,6 +258,56 @@ LaunchFusedKernels () Fuser::getInstance().Launch(); } +#else + +class Fuser +{ +public: + static Fuser& getInstance (); + static void Initialize (); + static void Finalize (); +private: + static std::unique_ptr m_instance; +}; + +inline void LaunchFusedKernels () {} + +#endif + +Long getFuseSizeThreshold (); +Long setFuseSizeThreshold (Long new_threshold); +int getFuseNumKernelsThreshold (); +int setFuseNumKernelsThreshold (int new_threshold); +bool inFuseRegion (); +bool setFuseRegion (bool flag); +bool inFuseReductionRegion (); +bool setFuseReductionRegion (bool flag); + +struct FuseSafeGuard +{ + explicit FuseSafeGuard (bool flag) noexcept + : m_old(setFuseRegion(flag)) {} + ~FuseSafeGuard () { setFuseRegion(m_old); } +private: + bool m_old; +}; + +struct FuseReductionSafeGuard +{ + explicit FuseReductionSafeGuard (bool flag) noexcept + : m_old(setFuseReductionRegion(flag)) {} + ~FuseReductionSafeGuard () { setFuseReductionRegion(m_old); } +private: + bool m_old; +}; + +#else + +struct FuseSafeGuard +{ + explicit FuseSafeGuard (bool) {} +}; + #endif }} diff --git a/Src/Base/AMReX_GpuFuse.cpp b/Src/Base/AMReX_GpuFuse.cpp index 26f942aa963..e3a4fe812b1 100644 --- a/Src/Base/AMReX_GpuFuse.cpp +++ b/Src/Base/AMReX_GpuFuse.cpp @@ -6,10 +6,11 @@ namespace amrex { namespace Gpu { -#ifdef AMREX_USE_CUDA +#ifdef AMREX_USE_GPU namespace { bool s_in_fuse_region = false; + bool s_in_fuse_reduction_region = false; // Fusing kernels with elements greater than this are not recommended based tests on v100 Long s_fuse_size_threshold = 257*257; // If the number of kernels is less than this, fusing is not recommended based on tests on v100 @@ -18,6 +19,8 @@ namespace { std::unique_ptr Fuser::m_instance = nullptr; +#ifdef AMREX_USE_CUDA + Fuser::Fuser () { AMREX_ASSERT(!OpenMP::in_parallel()); @@ -86,12 +89,32 @@ void Fuser::Launch () constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; + bool is_reduction = s_in_fuse_reduction_region; + amrex::launch(nblocks, nthreads, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int g_tid = blockDim.x*blockIdx.x + threadIdx.x; int g_wid = g_tid / Gpu::Device::warp_size; - if (g_wid >= ntotwarps) return; + if (g_wid >= ntotwarps) { + if (is_reduction) { + // for reduction, the assumption is all lambdas have function signature + FuseHelper& helper = d_lambda_helper[0]; + char* lambda_object = d_lambda_object + helper.m_offset; + if (helper.m_bx.isEmpty()) { + helper.m_fp.L1D(lambda_object,-1); + } else { + if (helper.m_N == 0) { + helper.m_fp.L3D(lambda_object, INT_MIN, INT_MIN, INT_MIN); + } else { + for (int n = 0; n < helper.m_N; ++n) { + helper.m_fp.L4D(lambda_object, INT_MIN, INT_MIN, INT_MIN, -1); + } + } + } + } + return; + } int ilambda; { @@ -120,6 +143,8 @@ void Fuser::Launch () if (bx.isEmpty()) { if (icell < helper.m_N) { helper.m_fp.L1D(lambda_object,icell); + } else if 
(is_reduction) { + helper.m_fp.L1D(lambda_object,-1); } } else { int ncells = bx.numPts(); @@ -139,6 +164,14 @@ void Fuser::Launch () helper.m_fp.L4D(lambda_object,i,j,k,n); } } + } else if (is_reduction) { + if (helper.m_N == 0) { + helper.m_fp.L3D(lambda_object, INT_MIN, INT_MIN, INT_MIN); + } else { + for (int n = 0; n < helper.m_N; ++n) { + helper.m_fp.L4D(lambda_object, INT_MIN, INT_MIN, INT_MIN, -1); + } + } } } }); @@ -181,6 +214,8 @@ Fuser::resize_helper_buf () m_helper_buf = p; } +#endif + Fuser& Fuser::getInstance () { @@ -218,8 +253,7 @@ setFuseSizeThreshold (Long new_threshold) int getFuseNumKernelsThreshold () { return s_fuse_numkernels_threshold; } -int -setFuseNumKernelsThreshold (int new_threshold) +int setFuseNumKernelsThreshold (int new_threshold) { int old = s_fuse_numkernels_threshold; s_fuse_numkernels_threshold = new_threshold; @@ -228,14 +262,22 @@ setFuseNumKernelsThreshold (int new_threshold) bool inFuseRegion () { return s_in_fuse_region; } -bool -setFuseRegion (bool flag) +bool setFuseRegion (bool flag) { bool old = s_in_fuse_region; s_in_fuse_region = flag; return old; } +bool inFuseReductionRegion () { return s_in_fuse_reduction_region; } + +bool setFuseReductionRegion (bool flag) +{ + bool old = s_in_fuse_reduction_region; + s_in_fuse_reduction_region = flag; + return old; +} + #endif }} diff --git a/Src/Base/AMReX_GpuKernelInfo.H b/Src/Base/AMReX_GpuKernelInfo.H new file mode 100644 index 00000000000..94e125e5a0c --- /dev/null +++ b/Src/Base/AMReX_GpuKernelInfo.H @@ -0,0 +1,18 @@ +#ifndef AMREX_GPU_KERNEL_INFO_H_ +#define AMREX_GPU_KERNEL_INFO_H_ + +namespace amrex { +namespace Gpu { + +class KernelInfo +{ +public: + KernelInfo& setFusible (bool flag) { fusible = flag; return *this; } + bool isFusible () const { return fusible; } +private: + bool fusible = false; +}; + +}} + +#endif diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index f79901a2403..d6c35972427 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -2,6 +2,7 @@ #define AMREX_GPU_LAUNCH_H_ #include +#include #include #include #include @@ -9,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -187,6 +189,20 @@ namespace Gpu { AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) +#ifdef AMREX_USE_CUDA +#define AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_3, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_2, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE, \ + AMREX_WRONG_NUM_ARGS, \ + AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) +#else +#define AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA(...) AMREX_LAUNCH_HOST_DEVICE_LAMBDA(__VA_ARGS__) +#endif #if (AMREX_SPACEDIM == 1) #define AMREX_LAUNCH_DEVICE_LAMBDA_DIM(a1,a2,a3,b1,b2,b3,c1,c2,c3) AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE (a1,a2,a2) @@ -215,6 +231,22 @@ namespace Gpu { #define AMREX_HOST_DEVICE_PARALLEL_FOR_3D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__) #define AMREX_HOST_DEVICE_PARALLEL_FOR_4D(...) AMREX_GPU_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__) +#ifdef AMREX_USE_CUDA +#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE(...) 
AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(__VA_ARGS__) +#else +#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FUSIBLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_1D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_3D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE(...) AMREX_HOST_DEVICE_PARALLEL_FOR_4D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_1D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_1D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_3D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_3D(__VA_ARGS__) +#define AMREX_HOST_DEVICE_FOR_4D_FUSIBLE(...) AMREX_GPU_HOST_DEVICE_FOR_4D(__VA_ARGS__) +#endif + #ifdef AMREX_USE_GPU #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 3b26b799841..584ad71d810 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -10,7 +10,7 @@ void launch (T const& n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } template ::value> > -void For (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (T n, L&& f) noexcept { for (T i = 0; i < n; ++i) { f(i); @@ -18,7 +18,13 @@ void For (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } template ::value> > -void ParallelFor (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + For(n, std::forward(f)); +} + +template ::value> > +void ParallelFor (T n, L&& f) noexcept { AMREX_PRAGMA_SIMD for (T i = 0; i < n; ++i) { @@ -26,8 +32,14 @@ void ParallelFor (T n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } } +template ::value> > +void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + ParallelFor(n, std::forward(f)); +} + template -void For (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -39,7 +51,13 @@ void For (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept } template -void ParallelFor (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + For(box, std::forward(f)); +} + +template +void ParallelFor (Box const& box, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -51,8 +69,14 @@ void ParallelFor (Box const& box, L&& f, std::size_t /*shared_mem_bytes*/=0) noe }}} } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + ParallelFor(box, std::forward(f)); +} + template ::value> > -void For (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box, T ncomp, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -66,7 +90,13 @@ void For (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes*/=0) no } template ::value> > -void ParallelFor (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + For(box, ncomp, std::forward(f)); +} + +template 
::value> > +void ParallelFor (Box const& box, T ncomp, L&& f) noexcept { const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); @@ -80,90 +110,161 @@ void ParallelFor (Box const& box, T ncomp, L&& f, std::size_t /*shared_mem_bytes } } +template ::value> > +void ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(box, ncomp, std::forward(f)); +} + template -void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { For(box1, std::forward(f1)); For(box2, std::forward(f2)); } +template +void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + For (box1, box2, std::forward(f1), std::forward(f2)); +} + template -void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept +void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { For(box1, std::forward(f1)); For(box2, std::forward(f2)); For(box3, std::forward(f3)); } +template +void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +{ + For(box1, box2, box3, std::forward(f1), std::forward(f2), std::forward(f3)); +} + template ::value>, typename M2=amrex::EnableIf_t::value> > void For (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box2, T2 ncomp2, L2&& f2) noexcept { For(box1, ncomp1, std::forward(f1)); For(box2, ncomp2, std::forward(f2)); } +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > void For (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box3, T3 ncomp3, L3&& f3) noexcept { For(box1, ncomp1, std::forward(f1)); For(box2, ncomp2, std::forward(f2)); For(box3, ncomp3, std::forward(f3)); } +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + For(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template -void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept +void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(box1, std::forward(f1)); ParallelFor(box2, std::forward(f2)); } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(box1,box2,f1,f2); +} + template -void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept +void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(box1, std::forward(f1)); ParallelFor(box2, std::forward(f2)); 
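// Sketch, not part of the patch, of the forwarding idiom the CPU-side overloads
// above follow: the new leading Gpu::KernelInfo parameter lets call sites be written
// once for both CPU and GPU builds; without a GPU the info is simply dropped and the
// plain nested loops run.  set_box_to is a hypothetical user helper.
#include <AMReX_Array4.H>
#include <AMReX_Box.H>
#include <AMReX_Gpu.H>

void set_box_to (amrex::Gpu::KernelInfo const& info, amrex::Box const& bx,
                 amrex::Array4<amrex::Real> const& a, amrex::Real value)
{
    // In a CPU build this resolves to the ParallelFor(Gpu::KernelInfo const&, Box, L)
    // shim added above, which forwards to the serial triple loop; in a CUDA build the
    // same call may be registered with the fuser or launched as a regular kernel.
    amrex::ParallelFor(info, bx,
    [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
    {
        a(i,j,k) = value;
    });
}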
ParallelFor(box3, std::forward(f3)); } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + template ::value>, typename M2=amrex::EnableIf_t::value> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, - Box const& box2, T2 ncomp2, L2&& f2, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box2, T2 ncomp2, L2&& f2) noexcept { ParallelFor(box1, ncomp1, std::forward(f1)); ParallelFor(box2, ncomp2, std::forward(f2)); } +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void ParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2)); +} + template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, - Box const& box3, T3 ncomp3, L3&& f3, - std::size_t /*shared_mem_bytes*/=0) noexcept + Box const& box3, T3 ncomp3, L3&& f3) noexcept { ParallelFor(box1, ncomp1, std::forward(f1)); ParallelFor(box2, ncomp2, std::forward(f2)); ParallelFor(box3, ncomp3, std::forward(f3)); } +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void ParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(box1, ncomp1, std::forward(f1), + box2, ncomp2, std::forward(f2), + box3, ncomp3, std::forward(f3)); +} + template ::value> > void FabReduce (Box const& box, N ncomp, T const& init_val, @@ -315,6 +416,118 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } +template ::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + ParallelFor(n,std::forward(f)); +} + +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + ParallelFor(box,std::forward(f)); +} + +template ::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(box,ncomp,std::forward(f)); +} + +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value> > +void HostDeviceFor 
(Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + For(n,std::forward(f)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + For(box,std::forward(f)); +} + +template ::value> > +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + For(box,ncomp,std::forward(f)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + For(box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + For(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + For(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + } #endif diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 920570be12f..295d3320380 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -94,7 +94,7 @@ void launch (T const& n, L&& f) noexcept } template ::value> > -void ParallelFor (T n, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); @@ -122,7 +122,7 @@ void ParallelFor (T n, L&& f) noexcept } template -void ParallelFor (Box const& box, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -159,7 +159,7 @@ void ParallelFor (Box const& box, L&& f) noexcept } template ::value> > -void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -198,7 +198,7 @@ void ParallelFor (Box const& box, T ncomp, L&& f) noexcept } template -void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size ParallelFor(box1, std::forward(f1)); @@ -255,7 +255,9 @@ void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept } template -void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +void ParallelFor (Gpu::KernelInfo const& /*info*/, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size ParallelFor(box1, std::forward(f1)); @@ -327,7 +329,8 @@ void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2 template ::value>, typename M2=amrex::EnableIf_t::value> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +void ParallelFor 
(Gpu::KernelInfo const& /*info*/, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { // xxxxx DPCPP todo: launch separate kernel to reduce kernel size @@ -392,7 +395,8 @@ template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > -void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +void ParallelFor (Gpu::KernelInfo const& /*info*/, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { @@ -628,209 +632,265 @@ void launch (T const& n, L&& f) noexcept template ::value> > amrex::EnableIf_t::value> -ParallelFor (T n, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; - const auto ec = Gpu::ExecutionConfig(n); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - i < n; i += stride) { - f(i); - } - }); - AMREX_GPU_ERROR_CHECK(); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && n <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(n, f); + } else +#endif + { + amrex::ignore_unused(info); + const auto ec = Gpu::ExecutionConfig(n); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + i < n; i += stride) { + f(i); + } + }); + AMREX_GPU_ERROR_CHECK(); + } } template amrex::EnableIf_t::value> -ParallelFor (Box const& box, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - i += lo.x; - j += lo.y; - k += lo.z; - f(i,j,k); - } - }); - AMREX_GPU_ERROR_CHECK(); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box, f); + } else +#endif + { + amrex::ignore_unused(info); + const auto lo = amrex::lbound(box); + const auto len = amrex::length(box); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + f(i,j,k); + } + }); + AMREX_GPU_ERROR_CHECK(); + } } template ::value> > amrex::EnableIf_t::value> -ParallelFor (Box const& box, T ncomp, L&& f) noexcept +ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, 
Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - i += lo.x; - j += lo.y; - k += lo.z; - for (T n = 0; n < ncomp; ++n) { - f(i,j,k,n); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box, ncomp, f); + } else +#endif + { + amrex::ignore_unused(info); + const auto lo = amrex::lbound(box); + const auto len = amrex::length(box); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + for (T n = 0; n < ncomp; ++n) { + f(i,j,k,n); + } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); + } } template amrex::EnableIf_t::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) and amrex::isEmpty(box2)) return; int ncells1 = box1.numPts(); int ncells2 = box2.numPts(); int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - f1(i,j,k); - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - f2(i,j,k); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, f1); + Gpu::Register(box2, f2); + } else +#endif + { + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + f1(i,j,k); + } + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i 
+= lo2.x; + j += lo2.y; + k += lo2.z; + f2(i,j,k); + } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); + } } template amrex::EnableIf_t::value and MaybeDeviceRunnable::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) and amrex::isEmpty(box2) and amrex::isEmpty(box3)) return; int ncells1 = box1.numPts(); int ncells2 = box2.numPts(); int ncells3 = box3.numPts(); int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - f1(i,j,k); - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - f2(i,j,k); - } - if (icell < ncells3) { - int k = icell / (len3.x*len3.y); - int j = (icell - k*(len3.x*len3.y)) / len3.x; - int i = (icell - k*(len3.x*len3.y)) - j*len3.x; - i += lo3.x; - j += lo3.y; - k += lo3.z; - f3(i,j,k); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, f1); + Gpu::Register(box2, f2); + Gpu::Register(box3, f3); + } else +#endif + { + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto lo3 = amrex::lbound(box3); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto len3 = amrex::length(box3); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + f1(i,j,k); + } + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + f2(i,j,k); + } + if (icell < ncells3) { + int k = icell / (len3.x*len3.y); + int j = (icell - k*(len3.x*len3.y)) / len3.x; + int i = (icell - k*(len3.x*len3.y)) - j*len3.x; + i += lo3.x; + j += lo3.y; + k += lo3.z; + f3(i,j,k); + } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); + } } template ::value>, typename M2=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +ParallelFor (Gpu::KernelInfo 
const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) and amrex::isEmpty(box2)) return; int ncells1 = box1.numPts(); int ncells2 = box2.numPts(); int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell < ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - for (T1 n = 0; n < ncomp1; ++n) { - f1(i,j,k,n); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, ncomp1, f1); + Gpu::Register(box2, ncomp2, f2); + } else +#endif + { + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + for (T1 n = 0; n < ncomp1; ++n) { + f1(i,j,k,n); + } } - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - for (T2 n = 0; n < ncomp2; ++n) { - f2(i,j,k,n); + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + for (T2 n = 0; n < ncomp2; ++n) { + f2(i,j,k,n); + } } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); + } } template ::value>, typename M3=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeDeviceRunnable::value and MaybeDeviceRunnable::value> -ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { @@ -847,53 +908,63 @@ ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, int ncells2 = box2.numPts(); int ncells3 = box3.numPts(); int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { - if (icell 
< ncells1) { - int k = icell / (len1.x*len1.y); - int j = (icell - k*(len1.x*len1.y)) / len1.x; - int i = (icell - k*(len1.x*len1.y)) - j*len1.x; - i += lo1.x; - j += lo1.y; - k += lo1.z; - for (T1 n = 0; n < ncomp1; ++n) { - f1(i,j,k,n); +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + Gpu::Register(box1, ncomp1, f1); + Gpu::Register(box2, ncomp2, f2); + Gpu::Register(box3, ncomp3, f3); + } else +#endif + { + amrex::ignore_unused(info); + const auto lo1 = amrex::lbound(box1); + const auto lo2 = amrex::lbound(box2); + const auto lo3 = amrex::lbound(box3); + const auto len1 = amrex::length(box1); + const auto len2 = amrex::length(box2); + const auto len3 = amrex::length(box3); + const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + icell < ncells; icell += stride) { + if (icell < ncells1) { + int k = icell / (len1.x*len1.y); + int j = (icell - k*(len1.x*len1.y)) / len1.x; + int i = (icell - k*(len1.x*len1.y)) - j*len1.x; + i += lo1.x; + j += lo1.y; + k += lo1.z; + for (T1 n = 0; n < ncomp1; ++n) { + f1(i,j,k,n); + } } - } - if (icell < ncells2) { - int k = icell / (len2.x*len2.y); - int j = (icell - k*(len2.x*len2.y)) / len2.x; - int i = (icell - k*(len2.x*len2.y)) - j*len2.x; - i += lo2.x; - j += lo2.y; - k += lo2.z; - for (T2 n = 0; n < ncomp2; ++n) { - f2(i,j,k,n); + if (icell < ncells2) { + int k = icell / (len2.x*len2.y); + int j = (icell - k*(len2.x*len2.y)) / len2.x; + int i = (icell - k*(len2.x*len2.y)) - j*len2.x; + i += lo2.x; + j += lo2.y; + k += lo2.z; + for (T2 n = 0; n < ncomp2; ++n) { + f2(i,j,k,n); + } } - } - if (icell < ncells3) { - int k = icell / (len3.x*len3.y); - int j = (icell - k*(len3.x*len3.y)) / len3.x; - int i = (icell - k*(len3.x*len3.y)) - j*len3.x; - i += lo3.x; - j += lo3.y; - k += lo3.z; - for (T3 n = 0; n < ncomp3; ++n) { - f3(i,j,k,n); + if (icell < ncells3) { + int k = icell / (len3.x*len3.y); + int j = (icell - k*(len3.x*len3.y)) / len3.x; + int i = (icell - k*(len3.x*len3.y)) - j*len3.x; + i += lo3.x; + j += lo3.y; + k += lo3.z; + for (T3 n = 0; n < ncomp3; ++n) { + f3(i,j,k,n); + } } } - } - }); - AMREX_GPU_ERROR_CHECK(); + }); + AMREX_GPU_ERROR_CHECK(); + } } template @@ -983,34 +1054,147 @@ void single_task (L&& f) noexcept single_task(Gpu::gpuStream(), std::forward(f)); } +template ::value> > +void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept +{ + ParallelFor(info, n,std::forward(f)); +} + +template +void For (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +{ + ParallelFor(info, box,std::forward(f)); +} + +template ::value> > +void For (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(info,box,ncomp,std::forward(f)); +} + +template +void For (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void For (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(info,box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + 
ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void For (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(info, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value> > +void ParallelFor (T n, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); +} + +template +void ParallelFor (Box const& box, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, box, std::forward(f)); +} + +template ::value> > +void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); +} + +template +void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void ParallelFor (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template ::value> > void For (T n, L&& f) noexcept { - ParallelFor(n,std::forward(f)); + ParallelFor(Gpu::KernelInfo{}, n,std::forward(f)); } template void For (Box const& box, L&& f) noexcept { - ParallelFor(box,std::forward(f)); + ParallelFor(Gpu::KernelInfo{}, box,std::forward(f)); } template ::value> > void For (Box const& box, T ncomp, L&& f) noexcept { - ParallelFor(box,ncomp,std::forward(f)); + ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } template void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); + ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } template -void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +void For (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept { - ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } template (f1),box2,ncomp2,std::forward(f2)); + ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template (f1), + ParallelFor(Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } template ::value> > amrex::EnableIf_t::value> -HostDeviceParallelFor (T n, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(n,std::forward(f)); + 
ParallelFor(info,n,std::forward(f)); } else { AMREX_PRAGMA_SIMD for (T i = 0; i < n; ++i) f(i); } } +template ::value> > +amrex::EnableIf_t::value> +HostDeviceParallelFor (T n, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); +} + template amrex::EnableIf_t::value> -HostDeviceParallelFor (Box const& box, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box,std::forward(f)); + ParallelFor(info, box,std::forward(f)); } else { LoopConcurrentOnCpu(box,std::forward(f)); } @@ -1060,10 +1252,10 @@ HostDeviceParallelFor (Box const& box, L&& f) noexcept template ::value> > amrex::EnableIf_t::value> -HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box,ncomp,std::forward(f)); + ParallelFor(info, box,ncomp,std::forward(f)); } else { LoopConcurrentOnCpu(box,ncomp,std::forward(f)); } @@ -1071,10 +1263,11 @@ HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept template amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); + ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } else { LoopConcurrentOnCpu(box1,std::forward(f1)); LoopConcurrentOnCpu(box2,std::forward(f2)); @@ -1083,11 +1276,13 @@ HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexc template amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + ParallelFor(info,box1,box2,box3, + std::forward(f1),std::forward(f2),std::forward(f3)); } else { LoopConcurrentOnCpu(box1,std::forward(f1)); LoopConcurrentOnCpu(box2,std::forward(f2)); @@ -1099,11 +1294,12 @@ template ::value>, typename M2=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); + ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } else { LoopConcurrentOnCpu(box1,ncomp1,std::forward(f1)); LoopConcurrentOnCpu(box2,ncomp2,std::forward(f2)); @@ -1115,12 +1311,14 @@ template ::value>, typename M3=amrex::EnableIf_t::value> > amrex::EnableIf_t::value and MaybeHostDeviceRunnable::value and MaybeHostDeviceRunnable::value> -HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(box1,ncomp1,std::forward(f1), + ParallelFor(info, 
+ box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } else { @@ -1131,54 +1329,115 @@ HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, } template ::value> > -void HostDeviceFor (T n, L&& f) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { - HostDeviceParallelFor(n,std::forward(f)); + HostDeviceParallelFor(info,n,std::forward(f)); } template -void HostDeviceFor (Box const& box, L&& f) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { - HostDeviceParallelFor(box,std::forward(f)); + HostDeviceParallelFor(info,box,std::forward(f)); } template ::value> > -void HostDeviceFor (Box const& box, T ncomp, L&& f) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { - HostDeviceParallelFor(box,ncomp,std::forward(f)); + HostDeviceParallelFor(info,box,ncomp,std::forward(f)); } template -void HostDeviceFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - HostDeviceParallelFor(box1,box2,std::forward(f1),std::forward(f2)); + HostDeviceParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } template -void HostDeviceFor (Box const& box1, Box const& box2, Box const& box3, +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { - HostDeviceParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + HostDeviceParallelFor(info, box1,box2,box3, + std::forward(f1),std::forward(f2),std::forward(f3)); } template ::value>, typename M2=amrex::EnableIf_t::value> > -void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { - HostDeviceParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); + HostDeviceParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template ::value>, typename M2=amrex::EnableIf_t::value>, typename M3=amrex::EnableIf_t::value> > -void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2, Box const& box3, T3 ncomp3, L3&& f3) noexcept { - HostDeviceParallelFor(box1,ncomp1,std::forward(f1), + HostDeviceParallelFor(info, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value> > +void HostDeviceParallelFor (T n, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},n,std::forward(f)); +} + +template +void HostDeviceParallelFor (Box const& box, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box,std::forward(f)); +} + +template ::value> > +void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); +} + +template +void HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, box1,box2,box3, + 
std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=amrex::EnableIf_t::value>, + typename M3=amrex::EnableIf_t::value> > +void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } diff --git a/Src/Base/AMReX_GpuLaunchMacrosG.H b/Src/Base/AMReX_GpuLaunchMacrosG.H index 83306b4b560..45400858abd 100644 --- a/Src/Base/AMReX_GpuLaunchMacrosG.H +++ b/Src/Base/AMReX_GpuLaunchMacrosG.H @@ -53,6 +53,34 @@ block \ } \ }}} +#ifdef AMREX_USE_CUDA +#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE(TN,TI,block) \ + { auto const& amrex_i_tn = TN; \ + if (!amrex::isEmpty(amrex_i_tn)) { \ + if (amrex::Gpu::inLaunchRegion()) \ + { \ + const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \ + if (amrex::Gpu::inFuseRegion() && amrex_i_ec.numBlocks.x*amrex_i_ec.numThreads.x <= amrex::Gpu::getFuseSizeThreshold()) { \ + amrex::Gpu::Register(amrex_i_tn, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block \ + }); \ + } else { \ + AMREX_LAUNCH_KERNEL(amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \ + [=] AMREX_GPU_DEVICE () noexcept { \ + for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \ + block \ + } \ + }); \ + AMREX_GPU_ERROR_CHECK(); \ + } \ + } \ + else { \ + for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \ + block \ + } \ + }}} +#endif #endif // two fused launches @@ -134,6 +162,51 @@ block2 \ } \ }}} +#ifdef AMREX_USE_CUDA +#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \ + { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \ + if (!amrex::isEmpty(amrex_i_tn1) or !amrex::isEmpty(amrex_i_tn2)) { \ + if (amrex::Gpu::inLaunchRegion()) \ + { \ + const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \ + const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \ + dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \ + amrex_i_ec2.numBlocks.x); \ + amrex_i_nblocks.y = 2; \ + if (amrex::Gpu::inFuseRegion() && amrex_i_nblocks.x*amrex_i_nblocks.y*amrex_i_ec1.numThreads.x <= amrex::Gpu::getFuseSizeThreshold()) { \ + amrex::Gpu::Register(amrex_i_tn1, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI1(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block1 \ + }); \ + amrex::Gpu::Register(amrex_i_tn2, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI2(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block2 \ + }); \ + } else { \ + AMREX_LAUNCH_KERNEL(amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ + [=] AMREX_GPU_DEVICE () noexcept { \ + switch (blockIdx.y) { \ + case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + break; \ + case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + } \ + }); \ + AMREX_GPU_ERROR_CHECK(); \ + } 
\ + } \ + else { \ + for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + }}} +#endif #endif // three fused launches @@ -233,6 +306,64 @@ block3 \ } \ }}} +#ifdef AMREX_USE_CUDA +#define AMREX_GPU_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \ + { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \ + if (!amrex::isEmpty(amrex_i_tn1) or !amrex::isEmpty(amrex_i_tn2) or !amrex::isEmpty(amrex_i_tn3)) { \ + if (amrex::Gpu::inLaunchRegion()) \ + { \ + const auto amrex_i_ec1 = amrex::Gpu::ExecutionConfig(amrex_i_tn1); \ + const auto amrex_i_ec2 = amrex::Gpu::ExecutionConfig(amrex_i_tn2); \ + const auto amrex_i_ec3 = amrex::Gpu::ExecutionConfig(amrex_i_tn3); \ + dim3 amrex_i_nblocks = amrex::max(amrex::max(amrex_i_ec1.numBlocks.x, \ + amrex_i_ec2.numBlocks.x), \ + amrex_i_ec3.numBlocks.x); \ + amrex_i_nblocks.y = 3; \ + if (amrex::Gpu::inFuseRegion() && amrex_i_nblocks.x*amrex_i_nblocks.y*amrex_i_ec1.numThreads.x <= amrex::Gpu::getFuseSizeThreshold()) { \ + amrex::Gpu::Register(amrex_i_tn1, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI1(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block1 \ + }); \ + amrex::Gpu::Register(amrex_i_tn2, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI2(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block2 \ + }); \ + amrex::Gpu::Register(amrex_i_tn3, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { \ + amrex::Box TI3(amrex::IntVect(AMREX_D_DECL(i,j,k)),amrex::IntVect(AMREX_D_DECL(i,j,k))); \ + block3 \ + }); \ + } else { \ + AMREX_LAUNCH_KERNEL(amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ + [=] AMREX_GPU_DEVICE () noexcept { \ + switch (blockIdx.y) { \ + case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + break; \ + case 1: for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + break; \ + case 2: for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \ + block3 \ + } \ + } \ + }); \ + AMREX_GPU_ERROR_CHECK(); \ + } \ + } \ + else { \ + for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ + block1 \ + } \ + for (auto const TI2 : amrex::Gpu::Range(amrex_i_tn2)) { \ + block2 \ + } \ + for (auto const TI3 : amrex::Gpu::Range(amrex_i_tn3)) { \ + block3 \ + } \ + }}} +#endif #endif #ifdef AMREX_USE_DPCPP @@ -452,6 +583,19 @@ } \ } +#define AMREX_GPU_HOST_DEVICE_FOR_1D_FUSIBLE(n,i,block) \ +{ \ + auto const& amrex_i_n = n; \ + using amrex_i_inttype = typename std::remove_const::type; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusible(true),amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \ + } else { \ + auto amrex_i_lambda = [=] (amrex_i_inttype i) noexcept block; \ + AMREX_PRAGMA_SIMD \ + for (amrex_i_inttype i = 0; i < amrex_i_n; ++i) amrex_i_lambda(i); \ + } \ +} + #define AMREX_GPU_DEVICE_FOR_1D(n,i,block) \ { \ using amrex_i_inttype = typename std::remove_const::type; \ @@ -470,6 +614,16 @@ } \ } +#define AMREX_GPU_HOST_DEVICE_FOR_3D_FUSIBLE(box,i,j,k,block) \ +{ \ + auto const& amrex_i_box = box; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusible(true),amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \ + } else { \ + 
amrex::LoopConcurrentOnCpu(amrex_i_box,[=] (int i, int j, int k) noexcept block); \ + } \ +} + #define AMREX_GPU_DEVICE_FOR_3D(box,i,j,k,block) \ { \ amrex::ParallelFor(box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \ @@ -488,6 +642,17 @@ } \ } +#define AMREX_GPU_HOST_DEVICE_FOR_4D_FUSIBLE(box,ncomp,i,j,k,n,block) \ +{ \ + auto const& amrex_i_box = box; \ + auto const& amrex_i_ncomp = ncomp; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex::Gpu::KernelInfo().setFusible(true),amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \ + } else { \ + amrex::LoopConcurrentOnCpu(amrex_i_box,amrex_i_ncomp,[=] (int i, int j, int k, int n) noexcept block); \ + } \ +} + #define AMREX_GPU_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \ { \ amrex::ParallelFor(box,ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \ diff --git a/Src/Base/AMReX_MFIter.H b/Src/Base/AMReX_MFIter.H index 3df4cc2f884..8cae1464acc 100644 --- a/Src/Base/AMReX_MFIter.H +++ b/Src/Base/AMReX_MFIter.H @@ -222,6 +222,10 @@ protected: mutable Vector > real_device_reduce_list; #endif +#ifdef AMREX_USE_GPU + std::unique_ptr gpu_fsg; +#endif + static int nextDynamicIndex; void Initialize (); diff --git a/Src/Base/AMReX_MFIter.cpp b/Src/Base/AMReX_MFIter.cpp index 3c07e07fe17..375fd3e578c 100644 --- a/Src/Base/AMReX_MFIter.cpp +++ b/Src/Base/AMReX_MFIter.cpp @@ -225,6 +225,9 @@ MFIter::~MFIter () AMREX_GPU_ERROR_CHECK(); Gpu::Device::resetStreamIndex(); Gpu::resetNumCallbacks(); + if (!OpenMP::in_parallel() && Gpu::inFuseRegion()) { + Gpu::LaunchFusedKernels(); + } #endif if (m_fa) { @@ -236,7 +239,7 @@ MFIter::~MFIter () } } -void +void MFIter::Initialize () { if (flags & SkipInit) { @@ -327,6 +330,11 @@ MFIter::Initialize () #ifdef AMREX_USE_GPU Gpu::Device::setStreamIndex((streams > 0) ? 
currentIndex%streams : -1); Gpu::resetNumCallbacks(); + if (!OpenMP::in_parallel()) { + if (index_map->size() >= Gpu::getFuseNumKernelsThreshold()) { + gpu_fsg.reset(new Gpu::FuseSafeGuard(true)); + } + } #endif typ = fabArray.boxArray().ixType(); diff --git a/Src/Base/AMReX_MultiFab.cpp b/Src/Base/AMReX_MultiFab.cpp index 08d945cdca2..2ce916e627e 100644 --- a/Src/Base/AMReX_MultiFab.cpp +++ b/Src/Base/AMReX_MultiFab.cpp @@ -43,16 +43,36 @@ MultiFab::Dot (const MultiFab& x, int xcomp, BL_PROFILE("MultiFab::Dot()"); - Real sm = amrex::ReduceSum(x, y, nghost, - [=] AMREX_GPU_HOST_DEVICE (Box const& bx, Array4 const& xfab, Array4 const& yfab) -> Real - { - Real t = 0.0; - AMREX_LOOP_4D(bx, numcomp, i, j, k, n, + Real sm = 0.0; +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + sm = amrex::ReduceSum(x, y, nghost, + [=] AMREX_GPU_DEVICE (Box const& bx, Array4 const& xfab, Array4 const& yfab) -> Real { - t += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + Real t = 0.0; + AMREX_LOOP_4D(bx, numcomp, i, j, k, n, + { + t += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + }); + return t; }); - return t; - }); + } else +#endif + { +#ifdef _OPENMP +#pragma omp parallel reduction(+:sm) +#endif + for (MFIter mfi(x,true); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.growntilebox(nghost); + Array4 const& xfab = x.const_array(mfi); + Array4 const& yfab = y.const_array(mfi); + AMREX_LOOP_4D(bx, numcomp, i, j, k, n, + { + sm += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + }); + } + } if (!local) ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub()); @@ -198,7 +218,7 @@ MultiFab::Swap (MultiFab& dst, MultiFab& src, if (bx.ok()) { auto sfab = src.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { const amrex::Real tmp = dfab(i,j,k,n+dstcomp); dfab(i,j,k,n+dstcomp) = sfab(i,j,k,n+srccomp); @@ -298,7 +318,7 @@ MultiFab::Saxpy (MultiFab& dst, Real a, const MultiFab& src, if (bx.ok()) { auto const sfab = src.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,dstcomp+n) += a * sfab(i,j,k,srccomp+n); }); @@ -332,7 +352,7 @@ MultiFab::Xpay (MultiFab& dst, Real a, const MultiFab& src, if (bx.ok()) { auto const sfab = src.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,n+dstcomp) = sfab(i,j,k,n+srccomp) + a * dfab(i,j,k,n+dstcomp); }); @@ -374,7 +394,7 @@ MultiFab::LinComb (MultiFab& dst, auto const xfab = x.array(mfi); auto const yfab = y.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,dstcomp+n) = a*xfab(i,j,k,xcomp+n) + b*yfab(i,j,k,ycomp+n); }); @@ -415,7 +435,7 @@ MultiFab::AddProduct (MultiFab& dst, auto const s1fab = src1.array(mfi); auto const s2fab = src2.array(mfi); auto dfab = dst.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, numcomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, numcomp, i, j, k, n, { dfab(i,j,k,n+dstcomp) += s1fab(i,j,k,n+comp1) * s2fab(i,j,k,n+comp2); }); diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp index d312cb676cf..fb436d6dcaf 100644 --- a/Src/Base/AMReX_MultiFabUtil.cpp +++ 
b/Src/Base/AMReX_MultiFabUtil.cpp @@ -67,7 +67,7 @@ namespace amrex Array4 const& ccarr = cc.array(mfi); Array4 const& ndarr = nd.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avg_nd_to_cc(tbx, ccarr, ndarr, dcomp, scomp, ncomp); }); @@ -91,7 +91,7 @@ namespace amrex Array4 const& eyarr = edge[1]->const_array(mfi);, Array4 const& ezarr = edge[2]->const_array(mfi);); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avg_eg_to_cc(tbx, ccarr, AMREX_D_DECL(exarr,eyarr,ezarr), dcomp); }); @@ -133,12 +133,12 @@ namespace amrex Array4 const& fzarr = fc[2]->const_array(mfi);); #if (AMREX_SPACEDIM == 1) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, fxarr, dcomp, GeometryData()); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), dcomp); }); @@ -168,12 +168,12 @@ namespace amrex Array4 const& fzarr = fc[2]->const_array(mfi);); #if (AMREX_SPACEDIM == 1) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, fxarr, 0, gd); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avg_fc_to_cc(tbx, ccarr, AMREX_D_DECL(fxarr,fyarr,fzarr), 0); }); @@ -218,12 +218,12 @@ namespace amrex Array4 const& ccarr = cc.const_array(mfi); #if (AMREX_SPACEDIM == 1) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (index_bounds, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (index_bounds, tbx, { amrex_avg_cc_to_fc(tbx, xbx, fxarr, ccarr, gd); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (index_bounds, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA (index_bounds, tbx, { amrex_avg_cc_to_fc(tbx, AMREX_D_DECL(xbx,ybx,zbx), AMREX_D_DECL(fxarr,fyarr,fzarr), ccarr); @@ -288,7 +288,7 @@ namespace amrex Array4 const& finearr = S_fine.const_array(mfi); Array4 const& finevolarr = fvolume.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown_with_vol(tbx,crsearr,finearr,finevolarr, 0,scomp,ncomp,ratio); @@ -337,7 +337,7 @@ namespace amrex Array4 const& crsearr = crse_S_fine.array(mfi); Array4 const& finearr = S_fine.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown(tbx,crsearr,finearr,0,scomp,ncomp,ratio); }); @@ -375,12 +375,12 @@ namespace amrex Array4 const& finearr = S_fine.const_array(mfi); if (is_cell_centered) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown(tbx,crsearr,finearr,scomp,scomp,ncomp,ratio); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown_nodes(tbx,crsearr,finearr,scomp,scomp,ncomp,ratio); }); @@ -406,12 +406,12 @@ namespace amrex // not part of the actual crse multifab which came in. 
if (is_cell_centered) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown(tbx,crsearr,finearr,0,scomp,ncomp,ratio); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown_nodes(tbx,crsearr,finearr,0,scomp,ncomp,ratio); }); @@ -486,7 +486,7 @@ namespace amrex Array4 const& crsearr = crse.array(mfi); Array4 const& finearr = fine.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown_faces(tbx, crsearr, finearr, 0, 0, ncomp, ratio, dir); }); @@ -568,7 +568,7 @@ namespace amrex Array4 const& crsearr = crse.array(mfi); Array4 const& finearr = fine.const_array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { amrex_avgdown_edges(tbx, crsearr, finearr, 0, 0, ncomp, ratio, dir); }); @@ -640,7 +640,7 @@ namespace amrex if (interpolate) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tile_box, thread_box, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( tile_box, thread_box, { amrex_fill_slice_interp(thread_box, slice_arr, full_arr, 0, start_comp, ncomp, diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index beaeac8a218..aaef79e9253 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -309,6 +309,22 @@ public: Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(*dp,r, gh); }); #else +#ifdef AMREX_USE_CUDA + if (Gpu::inFuseRegion() && Gpu::inFuseReductionRegion() + && ec.numBlocks.x*ec.numThreads.x <= Gpu::getFuseSizeThreshold()) + { + Gpu::Register(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + ReduceTuple r = *(dp+1); + if (box.contains(IntVect(AMREX_D_DECL(i,j,k)))) { + auto pr = Reduce::detail::call_f(f,i,j,k,ixtype); + Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr); + } + Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(*dp,r); + }); + } else +#endif + { amrex::launch(ec.numBlocks.x, ec.numThreads.x, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { ReduceTuple r = *(dp+1); @@ -325,6 +341,7 @@ public: } Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(*dp,r); }); + } #endif } diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index e434e1c5a3f..7f253882494 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -173,6 +173,7 @@ target_sources( amrex # GPU -------------------------------------------------------------------- AMReX_Gpu.H AMReX_GpuQualifiers.H + AMReX_GpuKernelInfo.H AMReX_GpuPrint.H AMReX_GpuAssert.H AMReX_GpuTypes.H diff --git a/Src/Base/Make.package b/Src/Base/Make.package index c97b264374d..edea7af44e0 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -66,6 +66,7 @@ C$(AMREX_BASE)_headers += AMReX_parstream.H C$(AMREX_BASE)_sources += AMReX_parstream.cpp C$(AMREX_BASE)_headers += AMReX_Gpu.H AMReX_GpuQualifiers.H AMReX_GpuPrint.H AMReX_GpuAssert.H AMReX_GpuTypes.H AMReX_GpuError.H +C$(AMREX_BASE)_headers += AMReX_GpuKernelInfo.H C$(AMREX_BASE)_headers += AMReX_GpuLaunchMacrosG.H AMReX_GpuLaunchFunctsG.H C$(AMREX_BASE)_headers += AMReX_GpuLaunchMacrosC.H AMReX_GpuLaunchFunctsC.H diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp index d1cbedfedea..652baa1bf92 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp @@ -410,13 +410,13 @@ MLABecLaplacian::Fapply (int amrlev, int mglev, 
MultiFab& out, const MultiFab& i const auto& bzfab = bzcoef.array(mfi);); if (m_overset_mask[amrlev][mglev]) { const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { mlabeclap_adotx_os(tbx, yfab, xfab, afab, AMREX_D_DECL(bxfab,byfab,bzfab), osm, dxinv, ascalar, bscalar, ncomp); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { mlabeclap_adotx(tbx, yfab, xfab, afab, AMREX_D_DECL(bxfab,byfab,bzfab), dxinv, ascalar, bscalar, ncomp); @@ -454,7 +454,7 @@ MLABecLaplacian::normalize (int amrlev, int mglev, MultiFab& mf) const const auto& byfab = bycoef.array(mfi);, const auto& bzfab = bzcoef.array(mfi);); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { mlabeclap_normalize(tbx, fab, afab, AMREX_D_DECL(bxfab,byfab,bzfab), dxinv, ascalar, bscalar, ncomp); @@ -609,7 +609,7 @@ MLABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& #else if (m_overset_mask[amrlev][mglev]) { const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, thread_box, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( tbx, thread_box, { abec_gsrb_os(thread_box, solnfab, rhsfab, alpha, afab, AMREX_D_DECL(dhx, dhy, dhz), @@ -621,7 +621,7 @@ MLABecLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& osm, vbx, redblack, nc); }); } else if (regular_coarsening) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, thread_box, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( tbx, thread_box, { abec_gsrb(thread_box, solnfab, rhsfab, alpha, afab, AMREX_D_DECL(dhx, dhy, dhz), @@ -688,7 +688,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, Real fac = bscalar*dxinv[0]; Box blo = amrex::bdryLo(box, 0); int blen = box.length(0); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( blo, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tbox, { mlabeclap_flux_xface(tbox, fxarr, solarr, bx, fac, blen, ncomp); }); @@ -696,7 +696,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, fac = bscalar*dxinv[1]; blo = amrex::bdryLo(box, 1); blen = box.length(1); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( blo, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tbox, { mlabeclap_flux_yface(tbox, fyarr, solarr, by, fac, blen, ncomp); }); @@ -705,7 +705,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, fac = bscalar*dxinv[2]; blo = amrex::bdryLo(box, 2); blen = box.length(2); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( blo, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tbox, { mlabeclap_flux_zface(tbox, fzarr, solarr, bz, fac, blen, ncomp); }); @@ -715,14 +715,14 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, { Real fac = bscalar*dxinv[0]; Box bflux = amrex::surroundingNodes(box, 0); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bflux, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bflux, tbox, { mlabeclap_flux_x(tbox, fxarr, solarr, bx, fac, ncomp); }); #if (AMREX_SPACEDIM >= 2) fac = bscalar*dxinv[1]; bflux = amrex::surroundingNodes(box, 1); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bflux, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bflux, tbox, { mlabeclap_flux_y(tbox, fyarr, solarr, by, fac, ncomp); }); @@ -730,7 +730,7 @@ MLABecLaplacian::FFlux (Box const& box, Real const* dxinv, Real bscalar, #if (AMREX_SPACEDIM == 3) fac = bscalar*dxinv[2]; bflux = amrex::surroundingNodes(box, 
2); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bflux, tbox, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bflux, tbox, { mlabeclap_flux_z(tbox, fzarr, solarr, bz, fac, ncomp); }); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp index aadc9ee0106..bb88308d595 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp @@ -314,7 +314,7 @@ MLCellLinOp::interpolation (int amrlev, int fmglev, MultiFab& fine, const MultiF const Box& bx = mfi.tilebox(); Array4 const& cfab = crse.const_array(mfi); Array4 const& ffab = fine.array(mfi); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D ( bx, ncomp, i, j, k, n, + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE ( bx, ncomp, i, j, k, n, { int ic = amrex::coarsen(i,ratio3.x); int jc = amrex::coarsen(j,ratio3.y); @@ -505,7 +505,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State const Real bcllo = bdlv[icomp][olo]; const Real bclhi = bdlv[icomp][ohi]; if (idim == 0) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tboxlo, { mllinop_apply_bc_x(0, tboxlo, blen, iofab, mlo, bctlo, bcllo, bvlo, @@ -517,7 +517,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State imaxorder, dxi, flagbc, icomp); }); } else if (idim == 1) { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tboxlo, { mllinop_apply_bc_y(0, tboxlo, blen, iofab, mlo, bctlo, bcllo, bvlo, @@ -529,7 +529,7 @@ MLCellLinOp::applyBC (int amrlev, int mglev, MultiFab& in, BCMode bc_mode, State imaxorder, dyi, flagbc, icomp); }); } else { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( blo, tboxlo, { mllinop_apply_bc_z(0, tboxlo, blen, iofab, mlo, bctlo, bcllo, bvlo, @@ -619,6 +619,7 @@ MLCellLinOp::reflux (int crse_amrlev, AMREX_D_TERM(Elixir elifx = flux[0].elixir();, Elixir elify = flux[1].elixir();, Elixir elifz = flux[2].elixir();); + Gpu::FuseSafeGuard fsg(false); // Turn off fusing in FFlux FFlux(crse_amrlev, mfi, pflux, crse_sol[mfi], Location::FaceCentroid); fluxreg.CrseAdd(mfi, cpflux, crse_dx, dt, RunOn::Gpu); } @@ -640,6 +641,7 @@ MLCellLinOp::reflux (int crse_amrlev, AMREX_D_TERM(Elixir elifx = flux[0].elixir();, Elixir elify = flux[1].elixir();, Elixir elifz = flux[2].elixir();); + Gpu::FuseSafeGuard fsg(false); // Turn off fusing in FFlux FFlux(fine_amrlev, mfi, pflux, fine_sol[mfi], Location::FaceCentroid, face_only); fluxreg.FineAdd(mfi, cpflux, fine_dx, dt, RunOn::Gpu); } @@ -678,6 +680,7 @@ MLCellLinOp::compFlux (int amrlev, const Array& fluxes AMREX_D_TERM(Elixir elifx = flux[0].elixir();, Elixir elify = flux[1].elixir();, Elixir elifz = flux[2].elixir();); + Gpu::FuseSafeGuard fsg(false); // Turn off fusing in FFlux FFlux(amrlev, mfi, pflux, sol[mfi], loc); for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { const Box& nbx = mfi.nodaltilebox(idim); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp index 5ad48f59131..c556f22824a 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp @@ -77,30 +77,30 @@ MLPoisson::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& in) con const auto& yfab = out.array(mfi); #if (AMREX_SPACEDIM == 3) - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k, { mlpoisson_adotx(i, j, k, yfab, xfab, dhx, dhy, dhz); }); #elif (AMREX_SPACEDIM == 2) if 
(m_has_metric_term) { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k, { mlpoisson_adotx_m(i, j, yfab, xfab, dhx, dhy, dx, probxlo); }); } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k, { mlpoisson_adotx(i, j, yfab, xfab, dhx, dhy); }); } #elif (AMREX_SPACEDIM == 1) if (m_has_metric_term) { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k, { mlpoisson_adotx_m(i, yfab, xfab, dhx, dx, probxlo); }); } else { - AMREX_HOST_DEVICE_PARALLEL_FOR_3D (bx, i, j, k, + AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FUSIBLE (bx, i, j, k, { mlpoisson_adotx(i, yfab, xfab, dhx); }); @@ -134,12 +134,12 @@ MLPoisson::normalize (int amrlev, int mglev, MultiFab& mf) const const auto& fab = mf.array(mfi); #if (AMREX_SPACEDIM == 2) - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { mlpoisson_normalize(tbx, fab, dhx, dhy, dx, probxlo); }); #else - AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + AMREX_LAUNCH_HOST_DEVICE_FUSIBLE_LAMBDA ( bx, tbx, { mlpoisson_normalize(tbx, fab, dhx, dx, probxlo); });
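The hunks above add the fusing machinery piecemeal (Gpu::KernelInfo::setFusible, the *_FUSIBLE launch macros, Gpu::FuseSafeGuard, and the fused-kernel flush in MFIter's destructor) without showing how a caller strings them together. Below is a minimal usage sketch, not part of this patch: it assumes a GPU build in which the fusing code paths are compiled in (the hunks guard them with AMREX_USE_CUDA), and it uses only the interfaces visible above (FuseSafeGuard(bool), Gpu::KernelInfo().setFusible(true), Gpu::getFuseNumKernelsThreshold(), Gpu::LaunchFusedKernels()).

// Sketch only: fuse the small per-tile saxpy kernels of one MFIter sweep
// into a single launch, using the interfaces introduced by this patch.
#include <AMReX.H>
#include <AMReX_MultiFab.H>
#include <AMReX_Gpu.H>

void saxpy_fused (amrex::MultiFab& dst, amrex::Real a,
                  amrex::MultiFab const& src, int ncomp)
{
    using namespace amrex;
    // Opt this scope into the fuse region.  MFIter::Initialize above already
    // does this automatically once the box count reaches
    // Gpu::getFuseNumKernelsThreshold(); the explicit guard is illustrative.
    Gpu::FuseSafeGuard fsg(true);
    for (MFIter mfi(dst, true); mfi.isValid(); ++mfi) {
        const Box& bx = mfi.tilebox();
        auto const sfab = src.const_array(mfi);
        auto       dfab = dst.array(mfi);
        // A fusible launch: inside a fuse region, kernels below the size
        // threshold are registered instead of launched immediately;
        // otherwise this behaves like the plain ParallelFor overload.
        ParallelFor(Gpu::KernelInfo().setFusible(true), bx, ncomp,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
        {
            dfab(i,j,k,n) += a * sfab(i,j,k,n);
        });
    }
    // ~MFIter flushes the registered kernels with Gpu::LaunchFusedKernels()
    // at the end of the sweep (when not inside an OpenMP parallel region).
}

This is essentially what MultiFab::Saxpy does above via AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FUSIBLE. Where fusing would be counterproductive, for example in MLCellLinOp::reflux and compFlux where the FFlux results are consumed immediately, the patch instead inserts Gpu::FuseSafeGuard fsg(false) to disable fusing for that scope.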